Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 80 results for author: <span class="mathjax">Wu, P</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/eess" aria-role="search"> Searching in archive <strong>eess</strong>. <a href="/search/?searchtype=author&query=Wu%2C+P">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Wu, P"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Wu%2C+P&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Wu, P"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Wu%2C+P&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Wu%2C+P&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Wu%2C+P&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.12817">arXiv:2502.12817</a> <span> [<a href="https://arxiv.org/pdf/2502.12817">pdf</a>, <a href="https://arxiv.org/format/2502.12817">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> An Attention-Assisted AI Model for Real-Time Underwater Sound Speed Estimation Leveraging Remote Sensing Sea Surface Temperature Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wu%2C+P">Pengfei Wu</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+W">Wei Huang</a>, <a href="/search/eess?searchtype=author&query=Shi%2C+Y">Yujie Shi</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Hao Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.12817v2-abstract-short" style="display: inline;"> The estimation of underwater sound velocity distribution serves as a critical basis for facilitating effective underwater communication and precise positioning, given that variations in sound velocity influence the path of signal transmission. Conventional techniques for the direct measurement of sound velocity, as well as methods that involve the inversion of sound velocity utilizing acoustic fie… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12817v2-abstract-full').style.display = 'inline'; document.getElementById('2502.12817v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.12817v2-abstract-full" style="display: none;"> The estimation of underwater sound velocity distribution serves as a critical basis for facilitating effective underwater communication and precise positioning, given that variations in sound velocity influence the path of signal transmission. 
Conventional techniques for the direct measurement of sound velocity, as well as methods that involve the inversion of sound velocity utilizing acoustic field data, necessitate on--site data collection. This requirement not only places high demands on device deployment, but also presents challenges in achieving real-time estimation of sound velocity distribution. In order to construct a real-time sound velocity field and eliminate the need for underwater onsite data measurement operations, we propose a self-attention embedded multimodal data fusion convolutional neural network (SA-MDF-CNN) for real-time underwater sound speed profile (SSP) estimation. The proposed model seeks to elucidate the inherent relationship between remote sensing sea surface temperature (SST) data, the primary component characteristics of historical SSPs, and their spatial coordinates. This is achieved by employing CNNs and attention mechanisms to extract local and global correlations from the input data, respectively. The ultimate objective is to facilitate a rapid and precise estimation of sound velocity distribution within a specified task area. Experimental results show that the method proposed in this paper has lower root mean square error (RMSE) and stronger robustness than other state-of-the-art methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12817v2-abstract-full').style.display = 'none'; document.getElementById('2502.12817v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
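The fusion this abstract describes (a CNN for local structure in the SST input, self-attention for global correlations, and a head that mixes in historical-SSP principal components and coordinates) can be illustrated in a few lines of PyTorch. This is a minimal sketch with assumed layer sizes and an assumed fusion scheme, and a hypothetical class name, not the authors' SA-MDF-CNN:

```python
# Illustrative sketch only: CNN for local SST features, self-attention for
# global correlations, fused with SSP principal components and coordinates.
import torch
import torch.nn as nn

class SAMDFCNNSketch(nn.Module):  # hypothetical name
    def __init__(self, n_pc=5, d_model=64):
        super().__init__()
        # CNN branch: local spatial features from a 1-channel SST patch
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 16, 3, padding=1), nn.ReLU(),
            nn.Conv2d(16, d_model, 3, padding=1), nn.ReLU(),
            nn.AdaptiveAvgPool2d(4),  # -> (d_model, 4, 4)
        )
        # Self-attention over the 16 pooled spatial tokens (global correlations)
        self.attn = nn.MultiheadAttention(d_model, num_heads=4, batch_first=True)
        # Fuse attended SST features with PC coefficients and (lat, lon)
        self.head = nn.Sequential(
            nn.Linear(d_model + n_pc + 2, 64), nn.ReLU(), nn.Linear(64, n_pc)
        )

    def forward(self, sst, pcs, coords):
        f = self.cnn(sst)                      # (B, d, 4, 4)
        tokens = f.flatten(2).transpose(1, 2)  # (B, 16, d)
        a, _ = self.attn(tokens, tokens, tokens)
        g = a.mean(dim=1)                      # pooled global feature
        return self.head(torch.cat([g, pcs, coords], dim=-1))

model = SAMDFCNNSketch()
out = model(torch.randn(2, 1, 32, 32), torch.randn(2, 5), torch.randn(2, 2))
print(out.shape)  # torch.Size([2, 5]) -> predicted SSP PC coefficients
```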
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.03974">arXiv:2502.03974</a> <span> [<a href="https://arxiv.org/pdf/2502.03974">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Spatiotemporal Trajectory Tracking Method for Vehicles Incorporating Lead-Lag Judgement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yuan Li</a>, <a href="/search/eess?searchtype=author&query=Dong%2C+X">Xiang Dong</a>, <a href="/search/eess?searchtype=author&query=Li%2C+T">Tao Li</a>, <a href="/search/eess?searchtype=author&query=Hao%2C+J">Junfeng Hao</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+X">Xiaoxue Xu</a>, <a href="/search/eess?searchtype=author&query=Ullaha%2C+S">Sana Ullaha</a>, <a href="/search/eess?searchtype=author&query=Cai%2C+Y">Yincai Cai</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+P">Peng Wu</a>, <a href="/search/eess?searchtype=author&query=Peng%2C+T">Ting Peng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.03974v1-abstract-short" style="display: inline;"> In the domain of intelligent transportation systems, especially within the context of autonomous vehicle control, the preemptive holistic collaborative system has been presented as a promising solution to bring a remarkable enhancement in traffic efficiency and a substantial reduction in the accident rate, demonstrating a great potential of development. In order to ensure this system operates as i… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03974v1-abstract-full').style.display = 'inline'; document.getElementById('2502.03974v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.03974v1-abstract-full" style="display: none;"> In the domain of intelligent transportation systems, especially within the context of autonomous vehicle control, the preemptive holistic collaborative system has been presented as a promising solution to bring a remarkable enhancement in traffic efficiency and a substantial reduction in the accident rate, demonstrating a great potential of development. In order to ensure this system operates as intended, accurate tracking of the spatiotemporal trajectory is of crucial significance. Moreover, minimizing the tracking error is a necessary step in this process. To this end, a novel lead-lag judgment mechanism is proposed. This mechanism precisely quantifies the longitudinal positional deviation between the vehicle and the target trajectory over time, then the deviation is corrected with a real - time acceleration compensation strategy, as a result, the accuracy and reliability of trajectory tracking are significantly enhanced. Real - vehicle experiments were conducted in a dedicated test field to validate the feasibility of this innovative approach empirically. Subsequently, the obtained tracking data was subsequent processed using the lead-lag judgment mechanism. In this step, we carefully analyzed the spatiotemporal error patterns between the vehicle and the target trajectory under different alignments and speeds. 
Finally, using real highway speed and alignment data, we conducted comprehensive spatiotemporal trajectory tracking simulations. Through experiments and simulations, tracking errors maintained in an acceptable range and reasonable spatiotemporal distance is given during the preemptive merging process on highway ramps. Overall, this study offers valuable insights for highway ramp emerging safety. Future work can expand on these findings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03974v1-abstract-full').style.display = 'none'; document.getElementById('2502.03974v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.15385">arXiv:2501.15385</a> <span> [<a href="https://arxiv.org/pdf/2501.15385">pdf</a>, <a href="https://arxiv.org/format/2501.15385">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> DDUNet: Dual Dynamic U-Net for Highly-Efficient Cloud Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yijie Li</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+H">Hewei Wang</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+J">Jinfeng Xu</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+P">Puzhen Wu</a>, <a href="/search/eess?searchtype=author&query=Xiao%2C+Y">Yunzhong Xiao</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+S">Shaofan Wang</a>, <a href="/search/eess?searchtype=author&query=Dev%2C+S">Soumyabrata Dev</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.15385v1-abstract-short" style="display: inline;"> Cloud segmentation amounts to separating cloud pixels from non-cloud pixels in an image. Current deep learning methods for cloud segmentation suffer from three issues. (a) Constrain on their receptive field due to the fixed size of the convolution kernel. (b) Lack of robustness towards different scenarios. (c) Requirement of a large number of parameters and limitations for real-time implementation… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15385v1-abstract-full').style.display = 'inline'; document.getElementById('2501.15385v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.15385v1-abstract-full" style="display: none;"> Cloud segmentation amounts to separating cloud pixels from non-cloud pixels in an image. Current deep learning methods for cloud segmentation suffer from three issues. (a) Constrain on their receptive field due to the fixed size of the convolution kernel. (b) Lack of robustness towards different scenarios. (c) Requirement of a large number of parameters and limitations for real-time implementation. 
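At its core, the lead-lag judgment above signs the longitudinal deviation and commands a compensating acceleration. A minimal sketch, with an assumed proportional gain and saturation limit that the abstract does not specify, and a hypothetical function name:

```python
# Illustrative sketch: compare actual vs. planned longitudinal position and
# command a bounded corrective acceleration (gain and limits are assumptions).

def lead_lag_compensation(s_actual, s_target, k_p=0.5, a_max=2.0):
    """Return (bounded acceleration correction, deviation).

    s_actual, s_target: distance travelled along the path [m];
    deviation > 0 means the vehicle leads the plan, < 0 means it lags.
    """
    deviation = s_actual - s_target
    a_cmd = -k_p * deviation          # lead -> decelerate, lag -> accelerate
    return max(-a_max, min(a_max, a_cmd)), deviation

a, dev = lead_lag_compensation(s_actual=102.3, s_target=100.0)
print(f"deviation={dev:+.1f} m, commanded accel={a:+.2f} m/s^2")
# deviation=+2.3 m, commanded accel=-1.15 m/s^2
```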
3. arXiv:2501.15385 [pdf, other] (cs.CV Computer Vision and Pattern Recognition; eess.IV Image and Video Processing)
   DDUNet: Dual Dynamic U-Net for Highly-Efficient Cloud Segmentation
   Authors: Yijie Li, Hewei Wang, Jinfeng Xu, Puzhen Wu, Yunzhong Xiao, Shaofan Wang, Soumyabrata Dev
   Abstract: Cloud segmentation amounts to separating cloud pixels from non-cloud pixels in an image. Current deep learning methods for cloud segmentation suffer from three issues: (a) a receptive field constrained by the fixed size of the convolution kernel, (b) a lack of robustness across different scenarios, and (c) a large number of parameters, which limits real-time implementation. To address these issues, we propose a Dual Dynamic U-Net (DDUNet) for supervised cloud segmentation. DDUNet adheres to a U-Net architecture and integrates two crucial modules: a dynamic multi-scale convolution (DMSC), which improves the merging of features under different receptive fields, and a dynamic weights and bias generator (DWBG) in the classification layers, which enhances generalization. More importantly, owing to its use of depth-wise convolution, DDUNet is a lightweight network that achieves 95.3% accuracy on the SWINySEG dataset with only 0.33M parameters, and it achieves superior accuracy and efficiency across three different configurations of the SWINySEG dataset.
   Submitted 25 January, 2025; originally announced January 2025.
   Comments: 5 pages
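The DMSC idea (merge features from several receptive fields with input-dependent weights) can be sketched with depthwise branches and a tiny gating network. Branch count, gating design, and sizes below are assumptions, not the published module:

```python
# Illustrative sketch of a dynamic multi-scale convolution: depthwise branches
# at several kernel sizes, merged with per-sample learned weights.
import torch
import torch.nn as nn

class DynamicMultiScaleConv(nn.Module):  # hypothetical name
    def __init__(self, ch, kernel_sizes=(3, 5, 7)):
        super().__init__()
        # Depthwise branches keep the parameter count low (groups=ch)
        self.branches = nn.ModuleList(
            nn.Conv2d(ch, ch, k, padding=k // 2, groups=ch) for k in kernel_sizes
        )
        # Tiny gate: per-sample softmax weights over the branches
        self.gate = nn.Sequential(
            nn.AdaptiveAvgPool2d(1), nn.Flatten(),
            nn.Linear(ch, len(kernel_sizes)), nn.Softmax(dim=-1)
        )

    def forward(self, x):
        w = self.gate(x)                                   # (B, n_branches)
        outs = torch.stack([b(x) for b in self.branches])  # (n, B, C, H, W)
        w = w.t().unsqueeze(-1).unsqueeze(-1).unsqueeze(-1)  # (n, B, 1, 1, 1)
        return (w * outs).sum(dim=0)

y = DynamicMultiScaleConv(16)(torch.randn(2, 16, 32, 32))
print(y.shape)  # torch.Size([2, 16, 32, 32])
```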
4. arXiv:2501.07989 [pdf, ps, other] (eess.SP Signal Processing)
   Movable Antenna Enhanced DF and AF Relaying Systems: Performance Analysis and Optimization
   Authors: Nianzu Li, Weidong Mei, Peiran Wu, Boyu Ning, Lipeng Zhu
   Abstract: Movable antenna (MA) has been deemed a promising technology to flexibly reconfigure wireless channels by adjusting the antenna positions in a given local region. In this paper, we investigate the application of the MA technology in both decode-and-forward (DF) and amplify-and-forward (AF) relaying systems, where a relay is equipped with multiple MAs to assist in the data transmission between two single-antenna nodes. For the DF relaying system, our objective is to maximize the achievable rate at the destination by jointly optimizing the positions of the MAs in two stages, for receiving signals from the source and transmitting signals to the destination, respectively. To derive essential insights, we first obtain a closed-form upper bound on the maximum achievable rate of the DF relaying system. Then, a low-complexity algorithm based on projected gradient ascent (PGA) and alternating optimization (AO) is proposed to solve the antenna position optimization problem. For the AF relaying system, our objective is to maximize the achievable rate by jointly optimizing the two-stage MA positions as well as the AF beamforming matrix at the relay, which results in a more challenging optimization problem due to the intricately coupled variables. To tackle this challenge, we first reveal the hidden separability between the two-stage antenna position optimization and the beamforming optimization. Based on this separability, we derive a closed-form upper bound on the maximum achievable rate of the AF relaying system and propose a low-complexity algorithm to obtain a high-quality suboptimal solution to the considered problem. Simulation results validate the efficacy of our theoretical analysis and demonstrate the superiority of the MA-enhanced relaying systems over conventional relaying systems with fixed-position antennas (FPAs) and other benchmark schemes.
   Submitted 14 January, 2025; originally announced January 2025.
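The projected gradient ascent (PGA) step used for antenna positioning can be illustrated on a toy one-antenna, one-dimensional version of the problem: move the antenna inside its region to maximize multipath channel power, projecting each step back into the region. Path gains, angles, region size, and step size are all made up; the paper optimizes multiple MAs in two stages with AO:

```python
# Toy PGA sketch: maximize |h(x)|^2 over a 1D movable-antenna position x.
import numpy as np

rng = np.random.default_rng(0)
lam = 0.1                                  # carrier wavelength [m]
gains = rng.normal(size=4) + 1j * rng.normal(size=4)   # path gains
angles = rng.uniform(-np.pi / 2, np.pi / 2, size=4)    # angles of arrival

def channel_power(x):
    h = np.sum(gains * np.exp(1j * 2 * np.pi / lam * x * np.sin(angles)))
    return np.abs(h) ** 2

x, step, region = 0.0, 1e-5, (0.0, 0.5)    # start, step size, movable region
for _ in range(2000):
    # Central finite difference stands in for the analytic gradient
    grad = (channel_power(x + 1e-6) - channel_power(x - 1e-6)) / 2e-6
    x = np.clip(x + step * grad, *region)  # gradient step + projection
print(f"optimized position {x:.4f} m, power {channel_power(x):.3f}")
```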
5. arXiv:2412.13387 [pdf, other] (eess.AS Audio and Speech Processing; cs.SD Sound)
   Deep Speech Synthesis from Multimodal Articulatory Representations
   Authors: Peter Wu, Bohan Yu, Kevin Scheck, Alan W Black, Aditi S. Krishnapriyan, Irene Y. Chen, Tanja Schultz, Shinji Watanabe, Gopala K. Anumanchipalli
   Abstract: The amount of articulatory data available for training deep learning models is far smaller than the amount of acoustic speech data. To improve articulatory-to-acoustic synthesis performance in these low-resource settings, we propose a multimodal pre-training framework. On single-speaker speech synthesis tasks from real-time magnetic resonance imaging and surface electromyography inputs, the intelligibility of synthesized outputs improves noticeably. For example, compared to prior work, our proposed transfer learning methods improve MRI-to-speech performance by 36% in word error rate. In addition to these intelligibility results, our multimodal pre-trained models consistently outperform unimodal baselines on three objective and subjective synthesis quality metrics.
   Submitted 17 December, 2024; originally announced December 2024.

6. arXiv:2411.07603 [pdf, other] (quant-ph Quantum Physics; eess.SY Systems and Control)
   $\mathscr{H}_2$ Model Reduction for Linear Quantum Systems
   Authors: G. P. Wu, S. Xue, G. F. Zhang, I. R. Petersen
   Abstract: In this paper, an $\mathscr{H}_2$ norm-based model reduction method for linear quantum systems is presented, which obtains a physically realizable reduced-order model that closely approximates the original system. The model reduction problem is posed as an optimization problem whose objective is the $\mathscr{H}_2$ norm of the difference between the transfer function of the original system and that of the reduced one. Unlike classical model reduction problems, physical realizability conditions, which guarantee that the reduced-order system is also a quantum system, must be imposed as nonlinear constraints in the optimization. To solve the optimization problem with such nonlinear constraints, we employ a matrix inequality approach that transforms the nonlinear inequality constraints into readily solvable linear matrix inequalities (LMIs) and nonlinear equality constraints, so that the problem can be solved by a lifting-variables approach. We emphasize that, unlike existing work that only introduces a criterion to evaluate performance after model reduction, our method is guided to obtain a reduced model that is optimal with respect to the $\mathscr{H}_2$ norm. The approach is further extended to passive linear quantum systems. Finally, examples of active and passive linear quantum systems validate the efficacy of the proposed method.
   Submitted 19 November, 2024; v1 submitted 12 November, 2024; originally announced November 2024.
   Comments: 13 pages, 3 figures
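The objective here is an $\mathscr{H}_2$ norm of an error system. As a classical (non-quantum) refresher of the quantity being minimized: for a stable LTI system $(A, B, C)$, the squared $\mathscr{H}_2$ norm is $\mathrm{trace}(C P C^{\top})$, where the controllability Gramian $P$ solves $AP + PA^{\top} + BB^{\top} = 0$. A small numeric check with arbitrary stable matrices (not taken from the paper):

```python
# Classical H2 norm via the controllability Gramian (Lyapunov equation).
import numpy as np
from scipy.linalg import solve_continuous_lyapunov

A = np.array([[-1.0, 0.5], [0.0, -2.0]])   # stable example system
B = np.array([[1.0], [1.0]])
C = np.array([[1.0, 0.0]])

# Solve A P + P A^T = -B B^T for the controllability Gramian P
P = solve_continuous_lyapunov(A, -B @ B.T)
h2 = np.sqrt(np.trace(C @ P @ C.T))
print(f"H2 norm = {h2:.4f}")
```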
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages,3 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06449">arXiv:2411.06449</a> <span> [<a href="https://arxiv.org/pdf/2411.06449">pdf</a>, <a href="https://arxiv.org/format/2411.06449">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Improved Video VAE for Latent Video Diffusion Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wu%2C+P">Pingyu Wu</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+K">Kai Zhu</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+Y">Yu Liu</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+L">Liming Zhao</a>, <a href="/search/eess?searchtype=author&query=Zhai%2C+W">Wei Zhai</a>, <a href="/search/eess?searchtype=author&query=Cao%2C+Y">Yang Cao</a>, <a href="/search/eess?searchtype=author&query=Zha%2C+Z">Zheng-Jun Zha</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06449v1-abstract-short" style="display: inline;"> Variational Autoencoder (VAE) aims to compress pixel data into low-dimensional latent space, playing an important role in OpenAI's Sora and other latent video diffusion generation models. While most of existing video VAEs inflate a pretrained image VAE into the 3D causal structure for temporal-spatial compression, this paper presents two astonishing findings: (1) The initialization from a well-tra… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06449v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06449v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06449v1-abstract-full" style="display: none;"> Variational Autoencoder (VAE) aims to compress pixel data into low-dimensional latent space, playing an important role in OpenAI's Sora and other latent video diffusion generation models. While most of existing video VAEs inflate a pretrained image VAE into the 3D causal structure for temporal-spatial compression, this paper presents two astonishing findings: (1) The initialization from a well-trained image VAE with the same latent dimensions suppresses the improvement of subsequent temporal compression capabilities. (2) The adoption of causal reasoning leads to unequal information interactions and unbalanced performance between frames. To alleviate these problems, we propose a keyframe-based temporal compression (KTC) architecture and a group causal convolution (GCConv) module to further improve video VAE (IV-VAE). 
Specifically, the KTC architecture divides the latent space into two branches, in which one half completely inherits the compression prior of keyframes from a lower-dimension image VAE while the other half involves temporal compression through 3D group causal convolution, reducing temporal-spatial conflicts and accelerating the convergence speed of video VAE. The GCConv in above 3D half uses standard convolution within each frame group to ensure inter-frame equivalence, and employs causal logical padding between groups to maintain flexibility in processing variable frame video. Extensive experiments on five benchmarks demonstrate the SOTA video reconstruction and generation capabilities of the proposed IV-VAE (https://wpy1999.github.io/IV-VAE/). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06449v1-abstract-full').style.display = 'none'; document.getElementById('2411.06449v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.10351">arXiv:2409.10351</a> <span> [<a href="https://arxiv.org/pdf/2409.10351">pdf</a>, <a href="https://arxiv.org/format/2409.10351">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/LWC.2024.3485513">10.1109/LWC.2024.3485513 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Over-the-Air Computation via 2D Movable Antenna Array </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+N">Nianzu Li</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+P">Peiran Wu</a>, <a href="/search/eess?searchtype=author&query=Ning%2C+B">Boyu Ning</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+L">Lipeng Zhu</a>, <a href="/search/eess?searchtype=author&query=Mei%2C+W">Weidong Mei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.10351v1-abstract-short" style="display: inline;"> Movable antenna (MA) has emerged as a promising technology for improving the performance of wireless communication systems, which enables local movement of the antennas to create more favorable channel conditions. 
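The ingredient behind GCConv is causal temporal padding: frame t must never see frames later than t. The minimal sketch below shows plain causal 3D convolution; the group variant (standard convolution within a frame group, causal padding only between groups) is summarized in the comments, and all sizes are illustrative assumptions:

```python
# Illustrative causal 3D convolution: pad only on the past side of time.
import torch
import torch.nn as nn
import torch.nn.functional as F

class CausalConv3d(nn.Module):  # hypothetical name
    def __init__(self, cin, cout, kt=3, ks=3):
        super().__init__()
        self.kt = kt
        self.conv = nn.Conv3d(cin, cout, (kt, ks, ks),
                              padding=(0, ks // 2, ks // 2))

    def forward(self, x):                    # x: (B, C, T, H, W)
        # Replicate the first frame kt-1 times on the past side (causal pad).
        # A GCConv-style variant would instead pad between frame *groups* and
        # use standard convolution within each group so frames inside a group
        # receive equal treatment.
        x = F.pad(x, (0, 0, 0, 0, self.kt - 1, 0), mode="replicate")
        return self.conv(x)

y = CausalConv3d(3, 8)(torch.randn(1, 3, 9, 16, 16))
print(y.shape)  # torch.Size([1, 8, 9, 16, 16])
```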
8. arXiv:2409.10351 [pdf, other] (eess.SP Signal Processing) DOI: 10.1109/LWC.2024.3485513
   Over-the-Air Computation via 2D Movable Antenna Array
   Authors: Nianzu Li, Peiran Wu, Boyu Ning, Lipeng Zhu, Weidong Mei
   Abstract: Movable antenna (MA) has emerged as a promising technology for improving the performance of wireless communication systems by enabling local movement of the antennas to create more favorable channel conditions. In this letter, we advance its application to an over-the-air computation (AirComp) network, where an access point is equipped with a two-dimensional (2D) MA array to aggregate wireless data from a massive number of users. We aim to minimize the computation mean square error (CMSE) by jointly optimizing the antenna position vector (APV), the receive combining vector at the access point, and the transmit coefficients of all users. To tackle this highly non-convex problem, we propose a two-loop iterative algorithm, in which the particle swarm optimization (PSO) approach is leveraged to obtain a suboptimal APV in the outer loop, while the receive combining vector and transmit coefficients are alternately optimized in the inner loop. Numerical results demonstrate that the proposed MA-enhanced AirComp network outperforms the conventional network with fixed-position antennas (FPAs).
   Submitted 16 September, 2024; originally announced September 2024.
   Journal ref: IEEE Wireless Communications Letters, vol. 14, no. 1, pp. 33-37, Jan. 2025
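The two-loop structure (PSO over antenna position vectors outside, combining/transmit updates inside) can be skeletonized as follows. The inner CMSE evaluation is replaced by a toy quadratic stand-in, and all PSO hyperparameters are assumptions:

```python
# Skeleton of a two-loop search: PSO outer loop over position vectors,
# inner solver returning the objective for each candidate.
import numpy as np

rng = np.random.default_rng(1)

def inner_objective(apv):
    # Stand-in for "optimize combining + transmit coefficients, return CMSE"
    return np.sum((apv - 0.3) ** 2)

dim, n_particles = 4, 20
pos = rng.uniform(0, 1, (n_particles, dim))        # positions in a unit region
vel = np.zeros_like(pos)
pbest = pos.copy()
pbest_val = np.array([inner_objective(p) for p in pos])
gbest = pbest[pbest_val.argmin()].copy()

for _ in range(100):
    r1, r2 = rng.uniform(size=(2, n_particles, dim))
    vel = 0.7 * vel + 1.5 * r1 * (pbest - pos) + 1.5 * r2 * (gbest - pos)
    pos = np.clip(pos + vel, 0, 1)                 # keep APV inside the region
    vals = np.array([inner_objective(p) for p in pos])
    improved = vals < pbest_val
    pbest[improved], pbest_val[improved] = pos[improved], vals[improved]
    gbest = pbest[pbest_val.argmin()].copy()

print("best APV:", np.round(gbest, 3), "objective:", round(pbest_val.min(), 6))
```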
9. arXiv:2409.02451 [pdf, other] (eess.AS Audio and Speech Processing; cs.AI Artificial Intelligence; cs.SD Sound)
   Fast, High-Quality and Parameter-Efficient Articulatory Synthesis using Differentiable DSP
   Authors: Yisi Liu, Bohan Yu, Drake Lin, Peter Wu, Cheol Jun Cho, Gopala Krishna Anumanchipalli
   Abstract: Articulatory trajectories like electromagnetic articulography (EMA) provide a low-dimensional representation of the vocal tract filter and have been used as natural, grounded features for speech synthesis. Differentiable digital signal processing (DDSP) is a parameter-efficient framework for audio synthesis. Therefore, integrating low-dimensional EMA features with DDSP can significantly enhance the computational efficiency of speech synthesis. In this paper, we propose a fast, high-quality, and parameter-efficient DDSP articulatory vocoder that can synthesize speech from EMA, F0, and loudness. We incorporate several techniques to solve the harmonics/noise imbalance problem and add a multi-resolution adversarial loss for better synthesis quality. Our model achieves a transcription word error rate (WER) of 6.67% and a mean opinion score (MOS) of 3.74, improvements of 1.63% and 0.16 over the state-of-the-art (SOTA) baseline. Our DDSP vocoder is 4.9x faster than the baseline on CPU during inference, and it can generate speech of comparable quality with only 0.4M parameters, in contrast to the 9M parameters required by the SOTA.
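A DDSP-style vocoder's core is a harmonics-plus-noise synthesizer driven by F0 and loudness. Below is a toy, non-learned frame synthesizer with arbitrary harmonic and noise choices; the actual model predicts time-varying per-harmonic amplitudes from EMA, F0, and loudness:

```python
# Toy harmonics-plus-noise frame: sinusoids at multiples of F0 plus noise.
import numpy as np

def ddsp_frame(f0=220.0, loudness=0.5, n_harm=10, sr=16000, dur=0.05):
    t = np.arange(int(sr * dur)) / sr
    harm = sum(np.sin(2 * np.pi * f0 * k * t) / k      # assumed 1/k rolloff
               for k in range(1, n_harm + 1)
               if f0 * k < sr / 2)                     # keep below Nyquist
    noise = 0.05 * np.random.default_rng(0).normal(size=t.size)
    return loudness * (harm / n_harm + noise)

audio = ddsp_frame()
print(audio.shape, float(np.abs(audio).max()))  # (800,) and a bounded peak
```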
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.02451v1-abstract-full').style.display = 'none'; document.getElementById('2409.02451v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted for Spoken Language Technology Workshop 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.08121">arXiv:2408.08121</a> <span> [<a href="https://arxiv.org/pdf/2408.08121">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/ACCESS.2025.3539370">10.1109/ACCESS.2025.3539370 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Enhancing Expressway Ramp Merge Safety and Efficiency via Spatiotemporal Cooperative Control </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Peng%2C+T">Ting Peng</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+X">Xiaoxue Xu</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yuan Li</a>, <a href="/search/eess?searchtype=author&query=WU%2C+J">Jie WU</a>, <a href="/search/eess?searchtype=author&query=Li%2C+T">Tao Li</a>, <a href="/search/eess?searchtype=author&query=Dong%2C+X">Xiang Dong</a>, <a href="/search/eess?searchtype=author&query=Cai%2C+Y">Yincai Cai</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+P">Peng Wu</a>, <a href="/search/eess?searchtype=author&query=Ullah%2C+S">Sana Ullah</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.08121v3-abstract-short" style="display: inline;"> In the context of autonomous driving on expressways, the issue of ensuring safe and efficient ramp merging remains a significant challenge. Existing systems often struggle to accurately assess the status and intentions of other vehicles, leading to a persistent occurrence of accidents despite efforts to maintain safe distances. This study proposes a novel spatiotemporal cooperative control approac… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.08121v3-abstract-full').style.display = 'inline'; document.getElementById('2408.08121v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.08121v3-abstract-full" style="display: none;"> In the context of autonomous driving on expressways, the issue of ensuring safe and efficient ramp merging remains a significant challenge. Existing systems often struggle to accurately assess the status and intentions of other vehicles, leading to a persistent occurrence of accidents despite efforts to maintain safe distances. 
This study proposes a novel spatiotemporal cooperative control approach integrating vehicle-road coordination to address this critical issue. A comprehensive methodology is developed, beginning with the calculation of safe distances under varying spatiotemporal conditions. This involves considering multiple factors, including vehicle speed differentials, positioning errors, and clock synchronization errors. Subsequently, an advanced vehicle conflict risk evaluation model is constructed. By incorporating collision acceleration and emergency acceleration as key parameters, this model offers a more accurate and detailed evaluation of potential risks during the ramp merging process. Based on the calculated safe distances and conflict risk evaluations, a mainline priority coordinated control method is formulated. This method enables the pre-planning of vehicle trajectories, effectively reducing conflicts among vehicles. Through rigorous simulations using diverse traffic volume and speed scenarios, the efficacy of the proposed strategy is validated. The results demonstrate remarkable improvements, with the average delay time reduced by an impressive 97.96% and fuel consumption decreased by 6.01%. These outcomes indicate that the proposed approach not only enhances the speed of vehicle merging but also significantly reduces latency and fuel consumption, thereby enhancing the overall performance of ramp merging operations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.08121v3-abstract-full').style.display = 'none'; document.getElementById('2408.08121v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> IEEE Access, vol. 13, pp. 
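The safe-distance reasoning sketched in this abstract combines braking kinematics with allowances for positioning and clock-synchronization errors. The formula and numbers below are a hypothetical illustration, not the paper's model:

```python
# Illustrative safe-gap calculation: braking kinematics plus error allowances.

def safe_distance(v_follow, v_lead, a_brake=4.0, t_react=0.3,
                  pos_err=0.5, clock_err=0.05):
    """Gap [m] needed so the follower can brake without collision (assumed form)."""
    gap = v_follow * t_react                                  # reaction travel
    gap += max(0.0, v_follow**2 - v_lead**2) / (2 * a_brake)  # braking delta
    gap += 2 * pos_err                      # positioning error, both vehicles
    gap += v_follow * clock_err             # timestamp-skew allowance
    return gap

print(f"{safe_distance(v_follow=30.0, v_lead=25.0):.1f} m")
# 9.0 + 34.375 + 1.0 + 1.5 = 45.9 m
```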
11. arXiv:2408.06789 [pdf, ps, other] (eess.SP Signal Processing) DOI: 10.1109/LWC.2024.3403138
   Sum Rate Maximization for Movable Antenna Enabled Uplink NOMA
   Authors: Nianzu Li, Peiran Wu, Boyu Ning, Lipeng Zhu
   Abstract: Movable antenna (MA) has recently been proposed as a promising candidate technology for next-generation wireless communication systems due to its significant capability of reconfiguring wireless channels via antenna movement. In this letter, we study an MA-enabled uplink non-orthogonal multiple access (NOMA) system in which each user is equipped with a single MA. Our objective is to maximize the users' sum rate by jointly optimizing the MAs' positions, the decoding order, and the power control. To solve this non-convex problem, we equivalently transform it into two tractable subproblems. First, we use successive convex approximation (SCA) to find a locally optimal solution to the antenna position optimization subproblem. Next, we derive the closed-form optimal solution of the decoding order and power control subproblem. Numerical results show that the proposed MA-enabled NOMA system significantly enhances the sum rate compared to fixed-position antenna (FPA) systems and orthogonal multiple access (OMA) systems.
   Submitted 13 August, 2024; originally announced August 2024.
   Comments: 5 pages, 3 figures. Accepted to IEEE Wireless Communications Letters
   Journal ref: IEEE Wireless Communications Letters, vol. 13, no. 8, pp. 2140-2144, Aug. 2024
arXiv:2408.02934 [pdf, other]
https://arxiv.org/abs/2408.02934
Subjects: cs.IT (Information Theory); eess.SP (Signal Processing)
Title: Learned Trimmed-Ridge Regression for Channel Estimation in Millimeter-Wave Massive MIMO
Authors: Pengxia Wu, Julian Cheng, Yonina C. Eldar, John M. Cioffi
Abstract: Channel estimation poses significant challenges in millimeter-wave massive multiple-input multiple-output systems, especially when the base station has fewer radio-frequency chains than antennas. To address this challenge, one promising solution exploits the beamspace channel sparsity to reconstruct full-dimensional channels from incomplete measurements. This paper presents a model-based deep learning method to reconstruct sparse, as well as approximately sparse, vectors fast and accurately. To implement this method, we propose a trimmed-ridge regression that transforms the sparse-reconstruction problem into a least-squares problem regularized by a nonconvex penalty term, and then derive an iterative solution. We then unfold the iterations into a deep network that can be implemented in online applications to realize real-time computations. To this end, an unfolded trimmed-ridge regression model is constructed using a structural configuration to reduce computational complexity and a model ensemble strategy to improve accuracy. Compared with other state-of-the-art deep learning models, the proposed learning scheme achieves better accuracy and supports higher downlink sum rates.
Submitted 5 August, 2024; originally announced August 2024.
Comments: Accepted by IEEE Transactions on Communications

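A plausible reading of the trimmed-ridge idea, a least-squares problem with a nonconvex penalty solved iteratively, is sketched below before any deep unfolding: ridge-penalize all but the k largest coefficients and refresh that mask each iteration. Dimensions and the regularization weight are arbitrary, and this is not the paper's exact formulation.

```python
# Hedged sketch of a trimmed-ridge style solver (one plausible reading): the
# penalty mask spares the k largest entries, making the penalty nonconvex; each
# iteration re-solves a ridge problem with the mask refreshed from the iterate.
import numpy as np

def trimmed_ridge(A, y, k, lam=1.0, iters=30):
    x = np.linalg.lstsq(A, y, rcond=None)[0]          # min-norm start
    for _ in range(iters):
        mask = np.ones(A.shape[1])
        mask[np.argsort(-np.abs(x))[:k]] = 0.0        # spare the k largest entries
        x = np.linalg.solve(A.T @ A + lam * np.diag(mask), A.T @ y)
    return x

rng = np.random.default_rng(1)
n, m, k = 64, 24, 4                                    # beamspace dim, measurements, sparsity
x_true = np.zeros(n)
x_true[rng.choice(n, k, replace=False)] = 3 * rng.normal(size=k)
A = rng.normal(size=(m, n)) / np.sqrt(m)
y = A @ x_true + 0.01 * rng.normal(size=m)
x_hat = trimmed_ridge(A, y, k, lam=5.0)
print(np.linalg.norm(x_hat - x_true) / np.linalg.norm(x_true))
```

Unfolding, as the abstract describes, would then fix a small number of such iterations as network layers and learn the per-layer parameters from data.
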
arXiv:2407.21345 [pdf, other]
https://arxiv.org/abs/2407.21345
Subjects: eess.AS (Audio and Speech Processing)
Title: Towards EMG-to-Speech with a Necklace Form Factor
Authors: Peter Wu, Ryan Kaveh, Raghav Nautiyal, Christine Zhang, Albert Guo, Anvitha Kachinthaya, Tavish Mishra, Bohan Yu, Alan W Black, Rikky Muller, Gopala Krishna Anumanchipalli
Abstract: Electrodes for decoding speech from electromyography (EMG) are typically placed on the face, requiring adhesives that are inconvenient and skin-irritating if used regularly. We explore a different device form factor, where dry electrodes are placed around the neck instead. 11-word, multi-speaker voiced EMG classifiers trained on data recorded with this device achieve 92.7% accuracy. Ablation studies reveal the importance of having more than two electrodes on the neck, and phonological analyses reveal similar classification confusions between neck-only and neck-and-face form factors. Finally, speech-EMG correlation experiments demonstrate a linear relationship between many EMG spectrogram frequency bins and self-supervised speech representation dimensions.
Submitted 31 July, 2024; originally announced July 2024.

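For a sense of the classification setup described above (11 words, multiple neck electrodes), here is a deliberately crude synthetic sketch: FFT band-power features and a nearest-centroid rule. Everything here, the signal model included, is invented; the paper's classifiers are trained on real recorded EMG.

```python
# Hedged toy (not the paper's model): classify synthetic multi-channel "EMG"
# windows by FFT band-power features and a nearest-centroid rule.
import numpy as np

rng = np.random.default_rng(2)
fs, n_ch, n_words = 1000, 8, 11            # sample rate, neck electrodes, vocab

def band_power(win):
    """Per-channel power in 50 bands (~10 Hz each) of a (n_ch, fs) window."""
    spec = np.abs(np.fft.rfft(win, axis=-1)) ** 2
    edges = np.linspace(0, spec.shape[-1], 51, dtype=int)
    return np.stack([spec[:, a:b].mean(axis=-1)
                     for a, b in zip(edges, edges[1:])], axis=-1).ravel()

def synth(word, n):
    """Invented word-dependent muscle tone: one sinusoid per word plus noise."""
    t = np.arange(fs) / fs
    f = 40 + 15 * word
    return [np.sin(2 * np.pi * f * t) * rng.normal(1, 0.2, (n_ch, 1))
            + 0.3 * rng.normal(size=(n_ch, fs)) for _ in range(n)]

centroids = {w: np.mean([band_power(s) for s in synth(w, 20)], axis=0)
             for w in range(n_words)}
test = band_power(synth(3, 1)[0])
pred = min(centroids, key=lambda w: np.linalg.norm(test - centroids[w]))
print("predicted word:", pred)             # expected: 3
```
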
arXiv:2407.18627 [pdf, ps, other]
https://arxiv.org/abs/2407.18627
Subjects: cs.LG (Machine Learning); eess.SP (Signal Processing)
Title: Multi-Agent Deep Reinforcement Learning for Energy Efficient Multi-Hop STAR-RIS-Assisted Transmissions
Authors: Pei-Hsiang Liao, Li-Hsiang Shen, Po-Chen Wu, Kai-Ten Feng
Abstract: Simultaneously transmitting and reflecting reconfigurable intelligent surface (STAR-RIS) provides a promising way to expand coverage in wireless communications. However, the limitations of a single STAR-RIS inspire us to integrate the concept of multi-hop transmissions, which existing research has focused on for conventional RISs. We therefore propose a novel architecture of multi-hop STAR-RISs to achieve a wider range of full-plane service coverage. In this paper, we solve for the active beamforming of the base station and the passive beamforming of the STAR-RISs, aiming to maximize the energy efficiency under the hardware limitations of STAR-RISs. Furthermore, we investigate the impact of the on-off state of STAR-RIS elements on energy efficiency. To tackle this complex problem, a Multi-Agent Global and locAl deep Reinforcement learning (MAGAR) algorithm is designed. The global agent elevates the collaboration among local agents, which focus on individual learning. In numerical results, we observe the significant improvement of MAGAR compared to the other benchmarks, including Q-learning, multi-agent deep Q network (DQN) with a global reward, and multi-agent DQN with local rewards. Moreover, the proposed architecture of multi-hop STAR-RISs achieves the highest energy efficiency compared to mode-switching-based STAR-RISs, conventional RISs, and deployment without RISs or STAR-RISs.
Submitted 26 July, 2024; originally announced July 2024.
Comments: Accepted by Proc. IEEE VTC-Fall

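The abstract compares MAGAR against Q-learning and DQN baselines. The toy below shows only the tabular Q-learning benchmark idea, on a made-up on/off element state space with an invented energy-efficiency reward; the paper's multi-agent deep RL is far richer.

```python
# Hedged toy baseline (the paper's MAGAR is multi-agent deep RL; this is only
# the tabular Q-learning benchmark idea): learn which of 3 elements to toggle
# so a made-up energy-efficiency (EE) reward is maximized.
import numpy as np

rng = np.random.default_rng(3)
n_el = 3
states = 2 ** n_el                        # on/off pattern encoded as bits
Q = np.zeros((states, n_el))

def reward(s):                            # toy EE: rate gain vs. power cost per on-element
    on = bin(s).count("1")
    return np.log2(1 + 4 * on) / (1 + 0.8 * on)

s = 0
for _ in range(5000):                     # epsilon-greedy Q-learning
    a = rng.integers(n_el) if rng.random() < 0.1 else int(Q[s].argmax())
    s2 = s ^ (1 << a)                     # toggle element a
    Q[s, a] += 0.1 * (reward(s2) + 0.9 * Q[s2].max() - Q[s, a])
    s = s2
best = max(range(states), key=reward)
print(f"best on/off pattern {best:03b}, EE {reward(best):.3f}")
```
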
arXiv:2407.17691 [pdf, other]
https://arxiv.org/abs/2407.17691
Subjects: cs.NI (Networking and Internet Architecture); eess.SY (Systems and Control)
Title: System-Level Simulation Framework for NB-IoT: Key Features and Performance Evaluation
Authors: Shutao Zhang, Wenkun Wen, Peiran Wu, Hongqing Huang, Liya Zhu, Yijia Guo, Tingting Yang, Minghua Xia
Abstract: Narrowband Internet of Things (NB-IoT) is a technology specifically designated by the 3rd Generation Partnership Project (3GPP) to meet the explosive demand for massive machine-type communications (mMTC), and it is evolving to RedCap. Industrial companies have increasingly adopted NB-IoT as the solution for mMTC due to its lightweight design and the comprehensive technical specifications released by 3GPP. This paper presents a system-level simulation framework for NB-IoT networks to evaluate their performance. The system-level simulator is structured into four parts: initialization, pre-generation, main simulation loop, and post-processing. Additionally, three essential features are investigated to enhance coverage, support massive connections, and ensure low power consumption, respectively. Simulation results demonstrate that the cumulative distribution function curves of the signal-to-interference-and-noise ratio fully comply with industrial standards. Furthermore, the throughput performance explains how NB-IoT networks realize massive connections at the cost of data rate. This work highlights the framework's practical utility and paves the way for developing NB-IoT networks.
Submitted 13 August, 2024; v1 submitted 24 July, 2024; originally announced July 2024.

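The four-part simulator structure named in the abstract (initialization, pre-generation, main simulation loop, post-processing) maps naturally onto a script skeleton. The sketch below uses hypothetical deployment and path-loss numbers purely to show the flow ending in an SINR distribution; it is not the paper's framework.

```python
# Minimal sketch of a four-stage system-level simulation flow; the deployment,
# channel, and power numbers are hypothetical, not the paper's.
import numpy as np

rng = np.random.default_rng(4)

# 1) initialization: cells on a line, users uniform
n_cells, n_ue = 7, 500
bs = np.linspace(0, 6000, n_cells)                    # BS positions (m)
ue = rng.uniform(0, 6000, n_ue)

# 2) pre-generation: per-link path loss + log-normal shadowing (toy model)
d = np.abs(ue[:, None] - bs[None, :]) + 10.0
loss_db = 120.9 + 37.6 * np.log10(d / 1000) + rng.normal(0, 8, d.shape)
rx_dbm = 43.0 - loss_db                               # 43 dBm transmit power

# 3) main simulation loop: serve from strongest link, others interfere
rx = 10 ** (rx_dbm / 10)                              # mW
serve = rx.argmax(axis=1)
sig = rx[np.arange(n_ue), serve]
interf = rx.sum(axis=1) - sig
sinr_db = 10 * np.log10(sig / (interf + 1e-13))       # interference-limited; tiny term avoids /0

# 4) post-processing: the SINR CDF is the curve compared against standards
xs = np.sort(sinr_db)
print(f"median SINR {xs[n_ue // 2]:.1f} dB; 5th pct {xs[n_ue // 20]:.1f} dB")
```
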
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.17691v2-abstract-full').style.display = 'none'; document.getElementById('2407.17691v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.15754">arXiv:2406.15754</a> <span> [<a href="https://arxiv.org/pdf/2406.15754">pdf</a>, <a href="https://arxiv.org/format/2406.15754">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Multimodal Segmentation for Vocal Tract Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Jain%2C+R">Rishi Jain</a>, <a href="/search/eess?searchtype=author&query=Yu%2C+B">Bohan Yu</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+P">Peter Wu</a>, <a href="/search/eess?searchtype=author&query=Prabhune%2C+T">Tejas Prabhune</a>, <a href="/search/eess?searchtype=author&query=Anumanchipalli%2C+G">Gopala Anumanchipalli</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.15754v1-abstract-short" style="display: inline;"> Accurate modeling of the vocal tract is necessary to construct articulatory representations for interpretable speech processing and linguistics. However, vocal tract modeling is challenging because many internal articulators are occluded from external motion capture technologies. Real-time magnetic resonance imaging (RT-MRI) allows measuring precise movements of internal articulators during speech… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.15754v1-abstract-full').style.display = 'inline'; document.getElementById('2406.15754v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.15754v1-abstract-full" style="display: none;"> Accurate modeling of the vocal tract is necessary to construct articulatory representations for interpretable speech processing and linguistics. However, vocal tract modeling is challenging because many internal articulators are occluded from external motion capture technologies. Real-time magnetic resonance imaging (RT-MRI) allows measuring precise movements of internal articulators during speech, but annotated datasets of MRI are limited in size due to time-consuming and computationally expensive labeling methods. 
arXiv:2406.12998 [pdf, other]
https://arxiv.org/abs/2406.12998
Subjects: eess.AS (Audio and Speech Processing); cs.AI (Artificial Intelligence); cs.CL (Computation and Language); cs.SD (Sound)
Title: Coding Speech through Vocal Tract Kinematics
Authors: Cheol Jun Cho, Peter Wu, Tejas S. Prabhune, Dhruv Agarwal, Gopala K. Anumanchipalli
Abstract: Vocal tract articulation is a natural, grounded control space of speech production. The spatiotemporal coordination of articulators combined with the vocal source shapes intelligible speech sounds to enable effective spoken communication. Based on this physiological grounding of speech, we propose a new framework of neural encoding-decoding of speech -- Speech Articulatory Coding (SPARC). SPARC comprises an articulatory analysis model that infers articulatory features from speech audio, and an articulatory synthesis model that synthesizes speech audio from articulatory features. The articulatory features are kinematic traces of vocal tract articulators and source features, which are intuitively interpretable and controllable, being the actual physical interface of speech production. An additional speaker identity encoder is jointly trained with the articulatory synthesizer to inform the voice texture of individual speakers. By training on large-scale speech data, we achieve a fully intelligible, high-quality articulatory synthesizer that generalizes to unseen speakers. Furthermore, the speaker embedding is effectively disentangled from articulations, which enables accent-preserving zero-shot voice conversion. To the best of our knowledge, this is the first demonstration of universal, high-performance articulatory inference and synthesis, suggesting the proposed framework as a powerful coding system of speech.
Submitted 14 December, 2024; v1 submitted 18 June, 2024; originally announced June 2024.

arXiv:2405.15153 [pdf, other]
https://arxiv.org/abs/2405.15153
Subjects: eess.SP (Signal Processing)
Title: Optimal Reference Nodes Deployment for Positioning Seafloor Anchor Nodes
Authors: Wei Huang, Pengfei Wu, Tianhe Xu, Hao Zhang, Kaitao Meng
Abstract: Seafloor anchor nodes, which form a geodetic network, are designed to provide surface and underwater users with positioning, navigation and timing (PNT) services. Due to the non-uniform distribution of underwater sound speed, accurate positioning of underwater anchor nodes is a challenging task. Traditional anchor node positioning typically uses cross or circular shapes; however, how to optimize the deployment of reference nodes for positioning underwater anchor nodes, considering the variability of sound speed, has not yet been studied. This paper focuses on optimal reference node deployment strategies for time-of-arrival (TOA) localization in three-dimensional (3D) underwater space. We adopt the criterion of minimizing the trace of the inverse Fisher information matrix (FIM) to determine the optimal reference node deployment under Gaussian measurement noise, which is positively related to the signal propagation path. A comprehensive analysis of optimal reference-target geometries is provided for the general circumstance with no restriction on the number of reference nodes, elevation angle, or reference-target range. A new semi-closed-form solution is found to determine the optimal geometries. To demonstrate the findings of this paper, we conducted both simulations and sea trials on underwater anchor node positioning. Both the simulation and experiment results are consistent with the theoretical analysis.
Submitted 23 May, 2024; originally announced May 2024.

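The optimality criterion in this abstract, minimizing the trace of the inverse FIM, is easy to evaluate numerically. The sketch below builds the standard TOA Fisher information matrix under Gaussian noise for two invented reference geometries; the noise level, sound speed, and node coordinates are illustrative only.

```python
# Hedged illustration of the trace(FIM^-1) criterion: compare the TOA position
# CRLB for two hypothetical reference-node geometries around a 3-D target.
import numpy as np

def crlb_trace(refs, target, sigma=1e-3, c=1500.0):   # sound speed ~1500 m/s
    u = refs - target
    u /= np.linalg.norm(u, axis=1, keepdims=True)      # unit bearing vectors
    fim = (u.T @ u) / (c * sigma) ** 2                 # standard TOA FIM
    return np.trace(np.linalg.inv(fim))

tgt = np.zeros(3)
cross = np.array([[500, 0, 300], [-500, 0, 300],
                  [0, 500, 300], [0, -500, 300]], float)
tetra = 600 * np.array([[1, 1, 1], [1, -1, -1],
                        [-1, 1, -1], [-1, -1, 1]], float) / np.sqrt(3)
print("cross :", crlb_trace(cross, tgt))
print("tetra :", crlb_trace(tetra, tgt))               # more isotropic geometry
```

trace(FIM^-1) bounds the summed position-error variance, so smaller is better; a more isotropic spread of bearing vectors tends to shrink it, which is the intuition behind optimizing the geometry.
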
arXiv:2404.14132 [pdf, other]
https://arxiv.org/abs/2404.14132
Subjects: cs.CV (Computer Vision and Pattern Recognition); eess.IV (Image and Video Processing)
Title: CRNet: A Detail-Preserving Network for Unified Image Restoration and Enhancement Task
Authors: Kangzhen Yang, Tao Hu, Kexin Dai, Genggeng Chen, Yu Cao, Wei Dong, Peng Wu, Yanning Zhang, Qingsen Yan
Abstract: In real-world scenarios, captured images often suffer from blurring, noise, and other forms of degradation, and due to sensor limitations people usually can only obtain low dynamic range images. To achieve high-quality images, researchers have attempted various image restoration and enhancement operations on photographs, including denoising, deblurring, and high dynamic range imaging. However, merely performing a single type of image enhancement still cannot yield satisfactory images. To deal with this challenge, we propose the Composite Refinement Network (CRNet), which addresses it using multiple exposure images. By fully integrating these information-rich inputs, CRNet performs unified image restoration and enhancement. To improve the quality of image details, CRNet explicitly separates and strengthens high- and low-frequency information through pooling layers, using specially designed Multi-Branch Blocks for effective fusion of these frequencies. To increase the receptive field and fully integrate input features, CRNet employs the High-Frequency Enhancement Module, which includes large kernel convolutions and an inverted bottleneck ConvFFN. Our model secured third place in the first track of the Bracketing Image Restoration and Enhancement Challenge, surpassing previous SOTA models in both testing metrics and visual quality.
Submitted 22 April, 2024; originally announced April 2024.
Comments: Accepted by the CVPR 2024 Workshop. Code: https://github.com/CalvinYang0/CRNet

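The pooling-based frequency separation that CRNet's abstract mentions can be shown in a few lines: average pooling keeps the low frequencies, and the residual is the high-frequency detail. The pool size and test image below are arbitrary; this sketches the idea, not CRNet's layers.

```python
# Hedged sketch of pooling-based frequency separation: low = pooled-and-upsampled
# image, high = residual detail; the split is lossless by construction.
import numpy as np

def split_freq(img, k=4):
    """Split an HxW image into low/high frequency parts via k x k mean pooling."""
    h, w = img.shape
    pooled = img[:h // k * k, :w // k * k].reshape(h // k, k, w // k, k).mean((1, 3))
    low = np.kron(pooled, np.ones((k, k)))[:h, :w]     # nearest-neighbor upsample
    return low, img - low                              # low + high == img

img = np.add.outer(np.linspace(0, 1, 64), np.linspace(0, 1, 64))   # smooth ramp
img += 0.1 * np.random.default_rng(5).normal(size=img.shape)       # fine detail
low, high = split_freq(img)
print(np.allclose(low + high, img))                    # True: lossless split
```
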
arXiv:2404.13537 [pdf, other]
https://arxiv.org/abs/2404.13537
Subjects: eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition)
Title: Bracketing Image Restoration and Enhancement with High-Low Frequency Decomposition
Authors: Genggeng Chen, Kexin Dai, Kangzhen Yang, Tao Hu, Xiangyu Chen, Yongqing Yang, Wei Dong, Peng Wu, Yanning Zhang, Qingsen Yan
Abstract: In real-world scenarios, obtaining high-quality photos with clear content is challenging due to a series of image degradations. While significant progress has been made in synthesizing high-quality images, previous methods for image restoration and enhancement often overlooked the characteristics of different degradations. They applied the same structure to address various types of degradation, resulting in less-than-ideal restoration outcomes. Inspired by the notion that high/low frequency information is applicable to different degradations, we introduce HLNet, a bracketing image restoration and enhancement method based on high-low frequency decomposition. Specifically, we employ two modules for feature extraction: shared-weight modules and non-shared-weight modules. In the shared-weight modules, we use SCConv to extract common features from different degradations. In the non-shared-weight modules, we introduce the High-Low Frequency Decomposition Block (HLFDB), which employs different methods to handle high- and low-frequency information, enabling the model to address different degradations more effectively. Compared to other networks, our method takes into account the characteristics of different degradations, thus achieving higher-quality image restoration.
Submitted 24 April, 2024; v1 submitted 21 April, 2024; originally announced April 2024.
Comments: Accepted by the CVPR 2024 Workshop. Code: https://github.com/chengeng0613/HLNet

</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper is accepted by CVPR 2024 Workshop, code: https://github.com/chengeng0613/HLNet</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.15668">arXiv:2312.15668</a> <span> [<a href="https://arxiv.org/pdf/2312.15668">pdf</a>, <a href="https://arxiv.org/ps/2312.15668">ps</a>, <a href="https://arxiv.org/format/2312.15668">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Air-to-Ground Communications Beyond 5G: UAV Swarm Formation Control and Tracking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Fan%2C+X">Xiao Fan</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+P">Peiran Wu</a>, <a href="/search/eess?searchtype=author&query=Xia%2C+M">Minghua Xia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.15668v1-abstract-short" style="display: inline;"> Unmanned aerial vehicle (UAV) communications have been widely accepted as promising technologies to support air-to-ground communications in the forthcoming sixth-generation (6G) wireless networks. This paper proposes a novel air-to-ground communication model consisting of aerial base stations served by UAVs and terrestrial user equipments (UEs) by integrating the technique of coordinated multi-poi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.15668v1-abstract-full').style.display = 'inline'; document.getElementById('2312.15668v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.15668v1-abstract-full" style="display: none;"> Unmanned aerial vehicle (UAV) communications have been widely accepted as promising technologies to support air-to-ground communications in the forthcoming sixth-generation (6G) wireless networks. This paper proposes a novel air-to-ground communication model consisting of aerial base stations served by UAVs and terrestrial user equipments (UEs) by integrating the technique of coordinated multi-point (CoMP) transmission with the theory of stochastic geometry. In particular, a CoMP set consisting of multiple UAVs is developed based on the theory of Poisson-Delaunay tetrahedralization. Effective UAV formation control and UAV swarm tracking schemes for two typical scenarios, including static and mobile UEs, are also developed using the multi-agent system theory to ensure that collaborative UAVs can efficiently reach target spatial positions for mission execution. Thanks to the ease of mathematical tractability, this model provides explicit performance expressions for a typical UE's coverage probability and achievable ergodic rate. Extensive simulation and numerical results corroborate that the proposed scheme outperforms UAV communications without CoMP transmission and obtains similar performance to the conventional CoMP scheme while avoiding search overhead. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.15668v1-abstract-full').style.display = 'none'; document.getElementById('2312.15668v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 9 figures, to appear in IEEE TWC</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.12810">arXiv:2312.12810</a> <span> [<a href="https://arxiv.org/pdf/2312.12810">pdf</a>, <a href="https://arxiv.org/format/2312.12810">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Unconstrained Dysfluency Modeling for Dysfluent Speech Transcription and Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Lian%2C+J">Jiachen Lian</a>, <a href="/search/eess?searchtype=author&query=Feng%2C+C">Carly Feng</a>, <a href="/search/eess?searchtype=author&query=Farooqi%2C+N">Naasir Farooqi</a>, <a href="/search/eess?searchtype=author&query=Li%2C+S">Steve Li</a>, <a href="/search/eess?searchtype=author&query=Kashyap%2C+A">Anshul Kashyap</a>, <a href="/search/eess?searchtype=author&query=Cho%2C+C+J">Cheol Jun Cho</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+P">Peter Wu</a>, <a href="/search/eess?searchtype=author&query=Netzorg%2C+R">Robbie Netzorg</a>, <a href="/search/eess?searchtype=author&query=Li%2C+T">Tingle Li</a>, <a href="/search/eess?searchtype=author&query=Anumanchipalli%2C+G+K">Gopala Krishna Anumanchipalli</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.12810v1-abstract-short" style="display: inline;"> Dysfluent speech modeling requires time-accurate and silence-aware transcription at both the word-level and phonetic-level. However, current research in dysfluency modeling primarily focuses on either transcription or detection, and the performance of each aspect remains limited. In this work, we present an unconstrained dysfluency modeling (UDM) approach that addresses both transcription and dete… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.12810v1-abstract-full').style.display = 'inline'; document.getElementById('2312.12810v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.12810v1-abstract-full" style="display: none;"> Dysfluent speech modeling requires time-accurate and silence-aware transcription at both the word-level and phonetic-level. However, current research in dysfluency modeling primarily focuses on either transcription or detection, and the performance of each aspect remains limited. 
arXiv:2312.09034 [pdf, other]
https://arxiv.org/abs/2312.09034
Subjects: eess.AS (Audio and Speech Processing); cs.SD (Sound); eess.IV (Image and Video Processing)
Title: Fusion of Audio and Visual Embeddings for Sound Event Localization and Detection
Authors: Davide Berghi, Peipei Wu, Jinzheng Zhao, Wenwu Wang, Philip J. B. Jackson
Abstract: Sound event localization and detection (SELD) combines two subtasks: sound event detection (SED) and direction of arrival (DOA) estimation. SELD is usually tackled as an audio-only problem, but visual information has recently been included. Few audio-visual (AV)-SELD works have been published, and most employ vision via face/object bounding boxes or human pose keypoints. In contrast, we explore the integration of audio and visual feature embeddings extracted with pre-trained deep networks. For the visual modality, we tested ResNet50 and Inflated 3D ConvNet (I3D). Our comparison of AV fusion methods includes the AV-Conformer and the Cross-Modal Attentive Fusion (CMAF) model. Our best models outperform the DCASE 2023 Task 3 audio-only and AV baselines by a wide margin on the development set of the STARSS23 dataset, making them competitive amongst state-of-the-art results of the AV challenge, without model ensembling, heavy data augmentation, or prediction post-processing. Such techniques and further pre-training could be applied as next steps to improve performance.
Submitted 14 December, 2023; originally announced December 2023.
Comments: ICASSP 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)

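To make the embedding-fusion idea concrete, here is one cross-modal attention step over pre-extracted audio and visual feature sequences, in the spirit of the CMAF model the abstract names. The dimensions and random projections are placeholders, not the paper's trained weights.

```python
# Hedged sketch: fuse per-frame audio and visual embeddings with a single
# cross-modal attention step (audio queries attend to video keys/values).
import numpy as np

rng = np.random.default_rng(7)
T, Da, Dv, D = 50, 512, 2048, 256          # frames, audio/visual/model dims
audio = rng.normal(size=(T, Da))           # e.g. audio-encoder output
video = rng.normal(size=(T, Dv))           # e.g. ResNet50 / I3D features

Wq, Wk, Wv = (rng.normal(size=(d, D)) / np.sqrt(d) for d in (Da, Dv, Dv))
q, k, v = audio @ Wq, video @ Wk, video @ Wv
scores = q @ k.T / np.sqrt(D)
att = np.exp(scores - scores.max(axis=1, keepdims=True))
att /= att.sum(axis=1, keepdims=True)      # row-wise softmax over video frames
fused = np.concatenate([q + att @ v, q], axis=1)   # residual-style AV feature
print(fused.shape)                         # (T, 2*D), fed to SED + DOA heads
```
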
arXiv:2312.01566 [pdf, other]
https://arxiv.org/abs/2312.01566
Subjects: physics.med-ph (Medical Physics); eess.IV (Image and Video Processing)
Title: Coronary Atherosclerotic Plaque Characterization with Photon-counting CT: a Simulation-based Feasibility Study
Authors: Mengzhou Li, Mingye Wu, Jed Pack, Pengwei Wu, Bruno De Man, Adam Wang, Koen Nieman, Ge Wang
Abstract: Recent development of photon-counting CT (PCCT) brings great opportunities for plaque characterization with much-improved spatial resolution and spectral imaging capability. While existing coronary plaque PCCT imaging results are based on detectors made of CZT or CdTe materials, deep-silicon photon-counting detectors have unique performance characteristics and promise distinct imaging capabilities. In this work, we report a systematic simulation study of a deep-silicon PCCT scanner with a new clinically relevant digital plaque phantom with realistic geometrical parameters and chemical compositions. This work investigates the effects of spatial resolution, noise, motion artifacts, radiation dose, and spectral characterization. Our simulation results suggest that the deep-silicon PCCT design provides adequate spatial resolution for visualizing a necrotic core and quantitation of key plaque features. Advanced denoising techniques and aggressive bowtie filter designs can keep image noise to acceptable levels at this resolution while keeping radiation dose comparable to that of a conventional CT scan. The ultrahigh resolution of PCCT also means an elevated sensitivity to motion artifacts. It is found that accurate motion correction methods must keep residual movement below a tolerance of about 0.4 mm for best plaque imaging quality with PCCT.
Submitted 3 December, 2023; originally announced December 2023.
Comments: 13 figures, 5 tables

</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 figures, 5 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.09537">arXiv:2311.09537</a> <span> [<a href="https://arxiv.org/pdf/2311.09537">pdf</a>, <a href="https://arxiv.org/format/2311.09537">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Future Full-Ocean Deep SSPs Prediction based on Hierarchical Long Short-Term Memory Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Lu%2C+J">Jiajun Lu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Hao Zhang</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+P">Pengfei Wu</a>, <a href="/search/eess?searchtype=author&query=Li%2C+S">Sijia Li</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+W">Wei Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.09537v1-abstract-short" style="display: inline;"> The spatial-temporal distribution of underwater sound velocity affects the propagation mode of underwater acoustic signals. Therefore, rapid estimation and prediction of underwater sound velocity distribution is crucial for providing underwater positioning, navigation and timing (PNT) services. Currently, sound speed profile (SSP) inversion methods have a faster time response rate compared to dire… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.09537v1-abstract-full').style.display = 'inline'; document.getElementById('2311.09537v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.09537v1-abstract-full" style="display: none;"> The spatial-temporal distribution of underwater sound velocity affects the propagation mode of underwater acoustic signals. Therefore, rapid estimation and prediction of underwater sound velocity distribution is crucial for providing underwater positioning, navigation and timing (PNT) services. Currently, sound speed profile (SSP) inversion methods have a faster time response rate compared to direct measurement methods, however, most SSP inversion methods focus on constructing spatial dimensional sound velocity fields and are highly dependent on sonar observation data, thus high requirements have been placed on observation data sources. To explore the distribution pattern of sound velocity in the time dimension and achieve future SSP prediction without sonar observation data, we propose a hierarchical long short-term memory (H-LSTM) neural network for SSP prediction. By our SSP prediction method, the sound speed distribution could be estimated without any on-site data measurement process, so that the time efficiency could be greatly improved. 
Compared with other state-of-the-art methods, H-LSTM achieves better accuracy in predicting the monthly average sound velocity distribution, with errors below 1 m/s across different depth layers. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: text overlap with arXiv:2310.09522</span> </p>
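<p class="is-size-7">The abstract names a hierarchical LSTM but gives no architectural details here; purely as a hedged illustration, a stacked "profile-then-time" LSTM predictor for depth-layered sound speed profiles could look like the sketch below (all module names, shapes, and hyperparameters are assumptions, not the authors' code).</p>
<pre><code>
import torch
import torch.nn as nn

class HLSTM(nn.Module):
    """Toy hierarchical LSTM: a lower LSTM encodes each month's
    depth-wise profile, an upper LSTM models month-to-month dynamics."""
    def __init__(self, n_depth_layers=50, hidden=128):
        super().__init__()
        self.depth_encoder = nn.LSTM(1, hidden, batch_first=True)    # over depth
        self.time_model = nn.LSTM(hidden, hidden, batch_first=True)  # over months
        self.head = nn.Linear(hidden, n_depth_layers)                # next-month SSP

    def forward(self, ssp_seq):                  # (batch, months, depth_layers)
        b, t, d = ssp_seq.shape
        x = ssp_seq.reshape(b * t, d, 1)
        _, (h, _) = self.depth_encoder(x)        # summarize each profile
        y, _ = self.time_model(h[-1].reshape(b, t, -1))
        return self.head(y[:, -1])               # predicted next-month profile

model = HLSTM()
history = torch.randn(4, 12, 50)                 # 12 months of 50-layer profiles
next_month = model(history)                      # (4, 50) sound speeds in m/s
</code></pre>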
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.16287">arXiv:2310.16287</a> <span> [<a href="https://arxiv.org/pdf/2310.16287">pdf</a>, <a href="https://arxiv.org/format/2310.16287">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Towards Streaming Speech-to-Avatar Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Prabhune%2C+T+S">Tejas S. Prabhune</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+P">Peter Wu</a>, <a href="/search/eess?searchtype=author&query=Yu%2C+B">Bohan Yu</a>, <a href="/search/eess?searchtype=author&query=Anumanchipalli%2C+G+K">Gopala K. Anumanchipalli</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Streaming speech-to-avatar synthesis creates real-time animations for a virtual character from audio data. Accurate avatar representations of speech are important for the visualization of sound in linguistics, phonetics, and phonology, for visual feedback to assist second language acquisition, and for virtual embodiment for paralyzed patients. Previous works have highlighted the capability of deep articulatory inversion to perform high-quality avatar animation using electromagnetic articulography (EMA) features. However, these models focus on offline avatar synthesis from recordings rather than from real-time audio, which is necessary for live avatar visualization or embodiment. To address this issue, we propose a method that uses articulatory inversion to stream high-quality facial and inner-mouth avatar animation from real-time audio. Our approach achieves a 130 ms average streaming latency for every 0.1 seconds of audio, with a 0.792 correlation with ground-truth articulations. Finally, we show generated mouth and tongue animations to demonstrate the efficacy of our methodology. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to ICASSP 2024</span> </p>
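<p class="is-size-7">The streaming setup described above is essentially a chunked inference loop; the sketch below shows one plausible shape of such a loop under stated assumptions (the 0.1 s chunk size comes from the abstract; <code>mic</code>, <code>inversion_model</code>, and <code>renderer</code> are hypothetical stand-ins, not the paper's API).</p>
<pre><code>
import time

SR = 16_000                      # assumed sample rate
CHUNK = int(0.1 * SR)            # the paper streams 0.1-second chunks

def stream_avatar(mic, inversion_model, renderer):
    """Toy streaming loop: read an audio chunk, run acoustic-to-
    articulatory inversion, feed articulator trajectories to the
    avatar renderer, and log the per-chunk latency."""
    while True:
        chunk = mic.read(CHUNK)              # (CHUNK,) float32 samples
        t0 = time.perf_counter()
        ema = inversion_model(chunk)         # EMA-like articulator features
        renderer.update(ema)                 # drive mouth/tongue meshes
        ms = (time.perf_counter() - t0) * 1e3
        print(f"chunk latency: {ms:.0f} ms") # paper reports ~130 ms average
</code></pre>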
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.14778">arXiv:2310.14778</a> <span> [<a href="https://arxiv.org/pdf/2310.14778">pdf</a>, <a href="https://arxiv.org/format/2310.14778">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Audio-Visual Speaker Tracking: Progress, Challenges, and Future Directions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhao%2C+J">Jinzheng Zhao</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+Y">Yong Xu</a>, <a href="/search/eess?searchtype=author&query=Qian%2C+X">Xinyuan Qian</a>, <a href="/search/eess?searchtype=author&query=Berghi%2C+D">Davide Berghi</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+P">Peipei Wu</a>, <a href="/search/eess?searchtype=author&query=Cui%2C+M">Meng Cui</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+J">Jianyuan Sun</a>, <a href="/search/eess?searchtype=author&query=Jackson%2C+P+J+B">Philip J. B. Jackson</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+W">Wenwu Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Audio-visual speaker tracking has drawn increasing attention over the past few years due to its academic value and wide range of applications. Audio and visual modalities provide complementary information for localization and tracking, and with both, Bayesian filters can address data association, audio-visual fusion, and track management. In this paper, we present a comprehensive overview of audio-visual speaker tracking. To our knowledge, this is the first extensive survey of the past five years. We introduce the family of Bayesian filters and summarize the methods for obtaining audio-visual measurements. In addition, we summarize existing trackers and their performance on the AV16.3 dataset. In the past few years, deep learning techniques have thrived, which has also boosted the development of audio-visual speaker tracking; we discuss their influence on measurement extraction and state estimation. Finally, we discuss the connections between audio-visual speaker tracking and other areas such as speech separation and distributed speaker tracking. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p>
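<p class="is-size-7">The Bayesian-filter framing in this survey can be made concrete with a toy linear Kalman filter that fuses an audio-derived position fix with a visual detection of the same speaker. This is only an illustration of the general recipe, not any specific tracker covered by the survey; all noise levels are made up.</p>
<pre><code>
import numpy as np

# Constant-velocity state [x, y, vx, vy]; fuse two position sensors
# (audio-derived and visual) by stacking their measurements.
dt = 0.04
F = np.array([[1, 0, dt, 0], [0, 1, 0, dt], [0, 0, 1, 0], [0, 0, 0, 1]])
H = np.vstack([np.eye(2, 4), np.eye(2, 4)])        # audio rows, then visual rows
Q = 0.01 * np.eye(4)                               # process noise
R = np.diag([0.30, 0.30, 0.05, 0.05])              # audio assumed noisier than video

def kf_step(x, P, z):
    """One predict/update cycle with stacked audio+visual measurement z (4,)."""
    x, P = F @ x, F @ P @ F.T + Q                  # predict
    S = H @ P @ H.T + R
    K = P @ H.T @ np.linalg.inv(S)                 # Kalman gain
    x = x + K @ (z - H @ x)                        # update
    P = (np.eye(4) - K @ H) @ P
    return x, P

x, P = np.zeros(4), np.eye(4)
z = np.array([1.02, 0.95, 1.00, 1.01])             # [audio_xy, visual_xy]
x, P = kf_step(x, P, z)
</code></pre>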
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.08251">arXiv:2310.08251</a> <span> [<a href="https://arxiv.org/pdf/2310.08251">pdf</a>, <a href="https://arxiv.org/format/2310.08251">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.3390/jmse12122356">10.3390/jmse12122356</a></span> </div> </div> </div> <p class="title is-5 mathjax"> Underwater Sound Speed Profile Construction: A Review </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Huang%2C+W">Wei Huang</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+J">Jixuan Zhou</a>, <a href="/search/eess?searchtype=author&query=Gao%2C+F">Fan Gao</a>, <a href="/search/eess?searchtype=author&query=Lu%2C+J">Jiajun Lu</a>, <a href="/search/eess?searchtype=author&query=Li%2C+S">Sijia Li</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+P">Pengfei Wu</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+J">Junting Wang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Hao Zhang</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+T">Tianhe Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Real-time and accurate construction of regional sound speed profiles (SSPs) is important for building underwater positioning, navigation, and timing (PNT) systems, as it greatly affects signal propagation modes such as the trajectory. In this paper, we summarize and analyze the current research status in the field of underwater SSP construction; the mainstream methods include direct SSP measurement and SSP inversion. For the direct measurement method, we compare the performance of popular international commercial conductivity-temperature-depth (CTD) profilers. For the inversion methods, the frameworks and basic principles of matched field processing (MFP), compressive sensing (CS), and deep learning (DL) for constructing SSPs are introduced, and their advantages and disadvantages are compared. The traditional direct measurement method has good accuracy but usually takes a long time.
SSP inversion greatly improves convenience and real-time performance, but its accuracy is not as good as that of direct measurement. Moreover, current SSP inversion relies on sonar observation data, making it difficult to apply in areas not covered by underwater observation systems, and these methods cannot predict the distribution of sound velocity at future times. How to comprehensively utilize multi-source data and provide elastic sound velocity distribution estimation services with different accuracy and real-time requirements for underwater users without sonar observation data is the mainstream trend of future research on SSP construction. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Journal of Marine Science and Engineering 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.02497">arXiv:2310.02497</a> <span> [<a href="https://arxiv.org/pdf/2310.02497">pdf</a>, <a href="https://arxiv.org/format/2310.02497">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Towards an Interpretable Representation of Speaker Identity via Perceptual Voice Qualities </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Netzorg%2C+R">Robin Netzorg</a>, <a href="/search/eess?searchtype=author&query=Yu%2C+B">Bohan Yu</a>, <a href="/search/eess?searchtype=author&query=Guzman%2C+A">Andrea Guzman</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+P">Peter Wu</a>, <a href="/search/eess?searchtype=author&query=McNulty%2C+L">Luna McNulty</a>, <a href="/search/eess?searchtype=author&query=Anumanchipalli%2C+G">Gopala Anumanchipalli</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Unlike other data modalities such as text and vision, speech does not lend itself to easy interpretation. While lay people can understand how to describe an image or sentence via perception, non-expert descriptions of speech often end at high-level demographic information, such as gender or age.
In this paper, we propose a possible interpretable representation of speaker identity based on perceptual voice qualities (PQs). By adding gendered PQs to the pathology-focused Consensus Auditory-Perceptual Evaluation of Voice (CAPE-V) protocol, our PQ-based approach provides a perceptual latent space of the character of adult voices that sits at an intermediate level of abstraction between high-level demographics and low-level acoustic, physical, or learned representations. Contrary to prior belief, we demonstrate that these PQs are hearable by ensembles of non-experts, and further demonstrate that the information encoded in a PQ-based representation is predictable by various speech representations. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.07861">arXiv:2309.07861</a> <span> [<a href="https://arxiv.org/pdf/2309.07861">pdf</a>, <a href="https://arxiv.org/format/2309.07861">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> CiwaGAN: Articulatory information exchange </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Begu%C5%A1%2C+G">Gašper Beguš</a>, <a href="/search/eess?searchtype=author&query=Lu%2C+T">Thomas Lu</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+A">Alan Zhou</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+P">Peter Wu</a>, <a href="/search/eess?searchtype=author&query=Anumanchipalli%2C+G+K">Gopala K. Anumanchipalli</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Humans encode information into sounds by controlling articulators and decode information from sounds using the auditory apparatus. This paper introduces CiwaGAN, a model of human spoken language acquisition that combines unsupervised articulatory modeling with an unsupervised model of information exchange through the auditory modality. While prior research includes unsupervised articulatory modeling and information exchange separately, our model is the first to combine the two components. The paper also proposes an improved articulatory model with more interpretable internal representations. The proposed CiwaGAN model is the most realistic approximation of human spoken language acquisition using deep learning. As such, it is useful for cognitively plausible simulations of the human speech act. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.05262">arXiv:2308.05262</a> <span> [<a href="https://arxiv.org/pdf/2308.05262">pdf</a>, <a href="https://arxiv.org/format/2308.05262">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Robust Interference Mitigation techniques for Direct Position Estimation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+H">Haoqing Li</a>, <a href="/search/eess?searchtype=author&query=Tang%2C+S">Shuo Tang</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+P">Peng Wu</a>, <a href="/search/eess?searchtype=author&query=Closas%2C+P">Pau Closas</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Global Navigation Satellite System (GNSS) technology is pervasive in navigation and positioning applications, where precise position and time-referencing estimates are required. Conventional methods for GNSS positioning involve a two-step process, in which intermediate measurements such as the Doppler shift and time delay of received GNSS signals are computed and then used to solve for the receiver's position. Alternatively, Direct Position Estimation (DPE) was proposed to infer the position directly from the sampled signal without intermediate variables, yielding superior sensitivity and operation under challenging environments. However, the positioning resilience of DPE is still threatened by various interferences. Robust Interference Mitigation (RIM) processing has been studied and proven effective against various interferences in conventional two-step positioning (2SP) methods, and is therefore worth exploring for its potential to enhance DPE. This article extends the DPE methodology by incorporating RIM strategies that address the increasing need to protect GNSS receivers against intentional or unintentional interferences, such as jamming signals, which can deny GNSS-based positioning. RIM, which leverages robust statistics, was shown to provide competitive results in two-step approaches and is here employed in a high-sensitivity DPE framework with successful results.
The article also quantifies the loss of efficiency incurred by RIM when no interference is present, and validates the proposed methodology on relevant interference cases; the approach can also be used to mitigate other common interference signals. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p>
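<p class="is-size-7">RIM builds on robust statistics; as a generic illustration of that idea (not the paper's exact scheme, and simplified to real-valued samples), one can down-weight samples with large robustly normalized magnitude, so pulsed jamming loses influence on the correlator.</p>
<pre><code>
import numpy as np

def huber_weight(r, k=1.345):
    """Huber weights: 1 for small residuals, k/|r| for outliers."""
    a = np.abs(r)
    return np.where(a <= k, 1.0, k / a)

def robust_correlator(x, replica):
    """Toy RIM-style correlator: normalize samples by a robust scale
    (median absolute deviation), down-weight outliers, then correlate."""
    scale = 1.4826 * np.median(np.abs(x - np.median(x))) + 1e-12
    w = huber_weight((x - np.median(x)) / scale)
    return np.sum(w * x * replica)

rng = np.random.default_rng(0)
code = rng.choice([-1.0, 1.0], size=4096)          # local code replica
x = 0.1 * code + rng.normal(0, 1, 4096)            # weak signal in noise
x[::50] += 50.0                                    # pulsed jamming
print(robust_correlator(x, code))                  # jammer largely suppressed
</code></pre>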
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.03420">arXiv:2308.03420</a> <span> [<a href="https://arxiv.org/pdf/2308.03420">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> A Safe DRL Method for Fast Solution of Real-Time Optimal Power Flow </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wu%2C+P">Pengfei Wu</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+C">Chen Chen</a>, <a href="/search/eess?searchtype=author&query=Lai%2C+D">Dexiang Lai</a>, <a href="/search/eess?searchtype=author&query=Zhong%2C+J">Jian Zhong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> High-level penetration of intermittent renewable energy sources (RESs) has introduced significant uncertainties into modern power systems. To respond rapidly and economically to fluctuations in the power system operating state, this paper proposes a safe deep reinforcement learning (SDRL) based method for fast solution of real-time optimal power flow (RT-OPF) problems. The proposed method considers the volatility of RESs and temporal constraints, and formulates the RT-OPF as a Constrained Markov Decision Process (CMDP). In the training process, the proposed method hybridizes proximal policy optimization (PPO) and the primal-dual method. Instead of folding a constraint-violation penalty into the reward function, the actor gradients are estimated by a Lagrange advantage function derived from two critic systems based on economic reward and violation cost. The decoupling of reward and cost alleviates reward sparsity while improving critic approximation accuracy. Moreover, the introduction of Lagrange multipliers enables the agent to balance the trade-off between optimality and feasibility. Numerical tests are carried out and compared with penalty-based DRL methods on the IEEE 9-bus, 30-bus, and 118-bus test systems. The results show that the well-trained SDRL agent significantly improves computational efficiency while satisfying the security constraints and optimality requirements. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p>
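<p class="is-size-7">A minimal sketch of the primal-dual idea named in this abstract, with assumed details (a single multiplier, dual ascent on the episode cost); this is schematic and not the authors' implementation.</p>
<pre><code>
lam, lr_dual, cost_limit = 0.0, 0.01, 0.0

def lagrange_advantage(adv_reward, adv_cost, lam):
    """Actor advantage mixing the reward critic and the violation-cost
    critic; lam trades optimality against feasibility."""
    return (adv_reward - lam * adv_cost) / (1.0 + lam)

def dual_update(lam, episode_cost):
    """Dual ascent: grow lam while constraints are violated, shrink
    (but stay nonnegative) once the policy is feasible."""
    return max(0.0, lam + lr_dual * (episode_cost - cost_limit))

# Inside the training loop (schematic):
#   adv = lagrange_advantage(adv_r, adv_c, lam)   # feeds the PPO clip loss
#   lam = dual_update(lam, mean_episode_cost)     # after each batch
</code></pre>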
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.16096">arXiv:2307.16096</a> <span> [<a href="https://arxiv.org/pdf/2307.16096">pdf</a>, <a href="https://arxiv.org/ps/2307.16096">ps</a>, <a href="https://arxiv.org/format/2307.16096">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> D-STAR: Dual Simultaneously Transmitting and Reflecting Reconfigurable Intelligent Surfaces for Joint Uplink/Downlink Transmission </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Shen%2C+L">Li-Hsiang Shen</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+P">Po-Chen Wu</a>, <a href="/search/eess?searchtype=author&query=Ku%2C+C">Chia-Jou Ku</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yu-Ting Li</a>, <a href="/search/eess?searchtype=author&query=Feng%2C+K">Kai-Ten Feng</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+Y">Yuanwei Liu</a>, <a href="/search/eess?searchtype=author&query=Hanzo%2C+L">Lajos Hanzo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> The joint uplink/downlink (JUD) design of simultaneously transmitting and reflecting reconfigurable intelligent surfaces (STAR-RIS) is conceived in support of both uplink (UL) and downlink (DL) users. Furthermore, the dual STAR-RIS (D-STAR) concept is conceived as a promising architecture for 360-degree full-plane service coverage, including UL/DL users located between the base station (BS) and the D-STAR as well as beyond. The corresponding regions are termed the primary (P) and secondary (S) regions. Both the BS and users are located in the P-region, whereas only users are located in the S-region. The primary STAR-RIS (STAR-P) plays an important role in tackling the P-region inter-user interference and the self-interference (SI) imposed on the DL receiver by the BS and by the reflective as well as refractive UL users. By contrast, the secondary STAR-RIS (STAR-S) aims to mitigate the S-region interferences. The non-linear, non-convex rate-maximization problem formulated is solved by alternating optimization among the decomposed convex sub-problems of the BS beamformer and the D-STAR amplitude and phase-shift configurations. We also propose a D-STAR based active beamforming and passive STAR-RIS amplitude/phase (DBAP) optimization scheme that solves the respective sub-problems by Lagrange dual with Dinkelbach's transformation, the alternating direction method of multipliers (ADMM) with successive convex approximation (SCA), and the penalty convex-concave procedure (PCCP). Our simulation results reveal that the proposed D-STAR architecture outperforms conventional single-RIS, single-STAR-RIS, and half-duplex networks. The proposed DBAP of D-STAR outperforms state-of-the-art solutions in the open literature for different numbers of quantization levels, geographic deployments, transmit powers, and diverse numbers of transmit antennas, patch partitions, and D-STAR elements. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by IEEE TCOM</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.02471">arXiv:2307.02471</a> <span> [<a href="https://arxiv.org/pdf/2307.02471">pdf</a>, <a href="https://arxiv.org/format/2307.02471">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Deep Speech Synthesis from MRI-Based Articulatory Representations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wu%2C+P">Peter Wu</a>, <a href="/search/eess?searchtype=author&query=Li%2C+T">Tingle Li</a>, <a href="/search/eess?searchtype=author&query=Lu%2C+Y">Yijing Lu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Y">Yubin Zhang</a>, <a href="/search/eess?searchtype=author&query=Lian%2C+J">Jiachen Lian</a>, <a href="/search/eess?searchtype=author&query=Black%2C+A+W">Alan W Black</a>, <a href="/search/eess?searchtype=author&query=Goldstein%2C+L">Louis Goldstein</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a>, <a href="/search/eess?searchtype=author&query=Anumanchipalli%2C+G+K">Gopala K. Anumanchipalli</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> In this paper, we study articulatory synthesis, a speech synthesis method using human vocal tract information that offers a way to develop efficient, generalizable and interpretable synthesizers. While recent advances have enabled intelligible articulatory synthesis using electromagnetic articulography (EMA), these methods lack critical articulatory information like excitation and nasality, limiting generalization capabilities. To bridge this gap, we propose an alternative MRI-based feature set that covers a much more extensive articulatory space than EMA. We also introduce normalization and denoising procedures to enhance the generalizability of deep learning methods trained on MRI data. Moreover, we propose an MRI-to-speech model that improves both computational efficiency and speech fidelity. Finally, through a series of ablations, we show that the proposed MRI representation is more comprehensive than EMA and identify the most suitable MRI feature subset for articulatory synthesis.
</span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.13558">arXiv:2306.13558</a> <span> [<a href="https://arxiv.org/pdf/2306.13558">pdf</a>, <a href="https://arxiv.org/format/2306.13558">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> One-Bit Spectrum Sensing for Cognitive Radio </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wu%2C+P">Pei-Wen Wu</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+L">Lei Huang</a>, <a href="/search/eess?searchtype=author&query=Ram%C3%ADrez%2C+D">David Ramírez</a>, <a href="/search/eess?searchtype=author&query=Xiao%2C+Y">Yu-Hang Xiao</a>, <a href="/search/eess?searchtype=author&query=So%2C+H+C">Hing Cheung So</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Spectrum sensing in cognitive radio necessitates effective monitoring of wide bandwidths, which requires high-rate sampling. Traditional spectrum sensing methods employing high-precision analog-to-digital converters (ADCs) result in increased power consumption and expensive hardware costs. In this paper, we explore blind spectrum sensing utilizing one-bit ADCs. We derive a closed-form detector based on Rao's test and demonstrate its equivalence with the second-order eigenvalue-moment-ratio test. Furthermore, a near-exact distribution based on the moment-based method, and an approximate distribution in the low signal-to-noise ratio (SNR) regime based on the central limit theorem, are obtained. Theoretical analysis shows that the performance loss of the proposed detector is approximately $2$ dB ($\pi/2$) compared with detectors employing $\infty$-bit ADCs when the SNR is low. This loss can be compensated for by using approximately $2.47$ ($\pi^2/4$) times more samples. In addition, we show that the efficiency of incoherent accumulation in one-bit detection is the square root of that of coherent accumulation.
Simulation results corroborate the correctness of our theoretical calculations. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p>
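<p class="is-size-7">The two constants quoted above can be checked directly: $10\log_{10}(\pi/2) \approx 1.96$ dB, and compensating an SNR loss of $\pi/2$ in the low-SNR regime (where the required sample count scales with the square of the SNR loss) takes $(\pi/2)^2 = \pi^2/4 \approx 2.47$ times more samples, matching the abstract.</p>
<pre><code>
import math

loss_db = 10 * math.log10(math.pi / 2)      # one-bit SNR loss in dB
extra = (math.pi / 2) ** 2                  # sample-count compensation factor
print(f"{loss_db:.2f} dB, x{extra:.2f} samples")   # -> 1.96 dB, x2.47 samples
</code></pre>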
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.10359">arXiv:2306.10359</a> <span> [<a href="https://arxiv.org/pdf/2306.10359">pdf</a>, <a href="https://arxiv.org/format/2306.10359">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Text-Driven Foley Sound Generation With Latent Diffusion Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Yuan%2C+Y">Yi Yuan</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+H">Haohe Liu</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+X">Xubo Liu</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+X">Xiyuan Kang</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+P">Peipei Wu</a>, <a href="/search/eess?searchtype=author&query=Plumbley%2C+M+D">Mark D. Plumbley</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+W">Wenwu Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Foley sound generation aims to synthesise the background sound for multimedia content. Previous models usually employ a large development set with labels as input (e.g., single numbers or a one-hot vector). In this work, we propose a diffusion model based system for Foley sound generation with text conditions. To alleviate the data scarcity issue, our model is initially pre-trained with large-scale datasets and fine-tuned to this task via transfer learning using the contrastive language-audio pretraining (CLAP) technique. We have observed that the feature embedding extracted by the text encoder can significantly affect the performance of the generation model. Hence, we introduce a trainable layer after the encoder to improve the text embedding produced by the encoder. In addition, we further refine the generated waveform by generating multiple candidate audio clips simultaneously and selecting the best one, determined by the similarity score between the embedding of each candidate clip and the embedding of the target text label. Using the proposed method, our system ranks $1^{st}$ among the systems submitted to DCASE Challenge 2023 Task 7. The results of the ablation studies illustrate that the proposed techniques significantly improve sound generation performance. The code for implementing the proposed system is available online. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to the DCASE 2023 Workshop; extends and supersedes the previous technical report arXiv:2305.15905</span> </p>
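<p class="is-size-7">The candidate-selection step described above is essentially embedding-similarity reranking. A schematic sketch follows; <code>embed_audio</code> and <code>text_emb</code> are hypothetical stand-ins for a CLAP-style encoder's outputs, not the released code.</p>
<pre><code>
import numpy as np

def pick_best(candidates, text_emb, embed_audio):
    """Rerank generated clips by cosine similarity between each clip's
    audio embedding and the target text embedding; keep the argmax."""
    def cos(a, b):
        return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))
    scores = [cos(embed_audio(c), text_emb) for c in candidates]
    best = int(np.argmax(scores))
    return candidates[best], scores[best]
</code></pre>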
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.17896">arXiv:2305.17896</a> <span> [<a href="https://arxiv.org/pdf/2305.17896">pdf</a>, <a href="https://arxiv.org/format/2305.17896">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Continuous and Noninvasive Measurement of Arterial Pulse Pressure and Pressure Waveform using an Image-free Ultrasound System </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Xu%2C+L">Lirui Xu</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+P">Pang Wu</a>, <a href="/search/eess?searchtype=author&query=Xia%2C+P">Pan Xia</a>, <a href="/search/eess?searchtype=author&query=Geng%2C+F">Fanglin Geng</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+P">Peng Wang</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+X">Xianxiang Chen</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Z">Zhenfeng Li</a>, <a href="/search/eess?searchtype=author&query=Du%2C+L">Lidong Du</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+S">Shuping Liu</a>, <a href="/search/eess?searchtype=author&query=Li%2C+L">Li Li</a>, <a href="/search/eess?searchtype=author&query=Chang%2C+H">Hongbo Chang</a>, <a href="/search/eess?searchtype=author&query=Fang%2C+Z">Zhen Fang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> The local beat-to-beat pulse pressure (PP) and blood pressure waveform of arteries, especially central arteries, are important indicators of the course of cardiovascular diseases (CVDs). Nevertheless, their noninvasive measurement remains a challenge in the clinic. This work presents a three-element image-free ultrasound system with a low-computational-cost method for real-time measurement of local pulse wave velocity (PWV) and diameter waveforms, enabling real-time, noninvasive, continuous PP and blood pressure waveform measurement without calibration. The developed system has been validated in vitro and in vivo. In in vitro cardiovascular phantom experiments, the results demonstrated high accuracy in the measurement of PP (error < 3 mmHg) and blood pressure waveform (root-mean-square error (RMSE) < 2 mmHg, correlation coefficient (r) > 0.99). In subsequent human carotid experiments, the system was compared with an arterial tonometer, showing excellent PP accuracy (mean absolute error (MAE) = 3.7 ± 3.4 mmHg) and pressure waveform similarity (RMSE = 3.7 ± 1.6 mmHg, r = 0.98 ± 0.01). Furthermore, comparative experiments with a volume clamp device demonstrated the system's ability to accurately trace blood pressure changes (induced by deep breathing) over a period of one minute, with the MAE of DBP, MAP, and SBP within 5 ± 8 mmHg. The present results demonstrate the accuracy and reliability of the developed system for continuous and noninvasive measurement of arterial PP and blood pressure waveforms, with potential applications in the diagnosis and prevention of CVDs. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 12 figures</span> </p>
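<p class="is-size-7">Calibration-free pressure reconstruction from local PWV and a diameter waveform is commonly done with water-hammer/Bramwell-Hill-type relations of the form $p(t) - p_d \approx \rho \, \mathrm{PWV}^2 \ln(A(t)/A_d)$. The sketch below illustrates that family of formulas only; the assumed blood density and the exact relation are not necessarily the paper's model.</p>
<pre><code>
import numpy as np

RHO = 1060.0   # assumed blood density, kg/m^3

def pressure_waveform(d_t, pwv):
    """Toy calibration-free pressure estimate from a diameter waveform
    d_t (m) and local PWV (m/s), via p - p_d = rho * PWV^2 * ln(A/A_d)."""
    area = np.pi * (d_t / 2) ** 2
    p_pa = RHO * pwv**2 * np.log(area / area.min())   # diastole as reference
    return p_pa / 133.322                             # Pa -> mmHg

d = 0.007 + 0.0002 * np.sin(np.linspace(0, 2 * np.pi, 100))  # toy carotid diameter
pp = pressure_waveform(d, pwv=6.0).max()                     # pulse pressure, mmHg
</code></pre>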
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.17499">arXiv:2305.17499</a> <span> [<a href="https://arxiv.org/pdf/2305.17499">pdf</a>, <a href="https://arxiv.org/format/2305.17499">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> CIF-PT: Bridging Speech and Text Representations for Spoken Language Understanding via Continuous Integrate-and-Fire Pre-Training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Dong%2C+L">Linhao Dong</a>, <a href="/search/eess?searchtype=author&query=An%2C+Z">Zhecheng An</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+P">Peihao Wu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+J">Jun Zhang</a>, <a href="/search/eess?searchtype=author&query=Lu%2C+L">Lu Lu</a>, <a href="/search/eess?searchtype=author&query=Ma%2C+Z">Zejun Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Speech or text representations generated by pre-trained models contain modality-specific information that can be combined to benefit spoken language understanding (SLU) tasks. In this work, we propose a novel pre-training paradigm termed Continuous Integrate-and-Fire Pre-Training (CIF-PT). It relies on a simple but effective frame-to-token alignment, continuous integrate-and-fire (CIF), to bridge the representations between speech and text. It jointly performs speech-to-text training and language model distillation through CIF as the pre-training (PT). Evaluated on the SLU benchmark SLURP dataset, CIF-PT outperforms the state-of-the-art model by 1.94% in accuracy and 2.71% in SLU-F1 on the tasks of intent classification and slot filling, respectively. We also observe that the cross-modal representation extracted by CIF-PT performs better than other neural interfaces for SLU tasks, including the dominant speech representation learned from self-supervised pre-training. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ACL 2023 Findings</span> </p>
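<p class="is-size-7">The CIF alignment rule itself is well defined in the literature: per-frame weights are accumulated, and a token-level vector is "fired" whenever the running sum crosses a threshold. A compact sketch of that rule under stated assumptions (threshold 1.0, NumPy instead of a training framework):</p>
<pre><code>
import numpy as np

def cif(frames, alphas, beta=1.0):
    """Continuous integrate-and-fire: accumulate per-frame weights
    alphas; when the sum crosses beta, emit the weighted frame sum as
    one token embedding and carry the overflow into the next token."""
    tokens, acc, vec = [], 0.0, np.zeros(frames.shape[1])
    for h, a in zip(frames, alphas):
        while acc + a >= beta:                 # frame straddles a boundary
            take = beta - acc                  # weight used to finish token
            tokens.append(vec + take * h)
            a, acc, vec = a - take, 0.0, np.zeros_like(vec)
        acc += a
        vec += a * h
    return np.stack(tokens) if tokens else np.zeros((0, frames.shape[1]))

T, D = 50, 8
out = cif(np.random.randn(T, D), np.full(T, 0.2))   # fires ~10 token vectors
</code></pre>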
arXiv:2302.06774 (https://arxiv.org/abs/2302.06774) [pdf, other]
Subjects: eess.AS (Audio and Speech Processing); cs.SD (Sound)
Title: Speaker-Independent Acoustic-to-Articulatory Speech Inversion
Authors: Peter Wu, Li-Wei Chen, Cheol Jun Cho, Shinji Watanabe, Louis Goldstein, Alan W Black, Gopala K. Anumanchipalli
Abstract: To build speech processing methods that can handle speech as naturally as humans, researchers have explored multiple ways of building an invertible mapping from speech to an interpretable space. The articulatory space is a promising inversion target, since this space captures the mechanics of speech production. To this end, we build an acoustic-to-articulatory inversion (AAI) model that leverages self-supervision to generalize to unseen speakers. Our approach obtains 0.784 correlation on an electromagnetic articulography (EMA) dataset, improving the state-of-the-art by 12.5%. Additionally, we show the interpretability of these representations by directly comparing the behavior of estimated representations with speech production behavior. Finally, we propose a resynthesis-based AAI evaluation metric that does not rely on articulatory labels, demonstrating its efficacy with an 18-speaker dataset.
Submitted 24 July, 2023; v1 submitted 13 February, 2023; originally announced February 2023.
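The 0.784 correlation quoted above is, in the usual AAI convention, an average Pearson correlation between predicted and measured articulator trajectories across EMA channels. The exact evaluation script is not given in the abstract, so the snippet below is an assumed but standard form of the metric, on synthetic data.

```python
import numpy as np

def ema_correlation(pred, target):
    """Average Pearson correlation across EMA channels.
    pred, target: (T, C) arrays of predicted / measured articulator traces."""
    pred = pred - pred.mean(axis=0)
    target = target - target.mean(axis=0)
    num = (pred * target).sum(axis=0)
    den = np.sqrt((pred ** 2).sum(axis=0) * (target ** 2).sum(axis=0))
    return float((num / den).mean())

rng = np.random.default_rng(1)
target = rng.normal(size=(500, 12))              # 12 synthetic EMA channels
pred = 0.8 * target + 0.2 * rng.normal(size=target.shape)
print(round(ema_correlation(pred, target), 3))   # high, but below 1.0
```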
arXiv:2211.00968 (https://arxiv.org/abs/2211.00968) [pdf, ps, other]
Subjects: cs.CL (Computation and Language); cs.SD (Sound); eess.AS (Audio and Speech Processing)
Title: Internal Language Model Estimation based Adaptive Language Model Fusion for Domain Adaptation
Authors: Rao Ma, Xiaobo Wu, Jin Qiu, Yanan Qin, Haihua Xu, Peihao Wu, Zejun Ma
Abstract: The deployment environment of an ASR model is ever-changing, and the incoming speech can switch across different domains during a session. This poses a challenge for effective domain adaptation when only target-domain text data is available: the objective is to obtain clearly improved performance on the target domain while performance on the general domain is left largely intact. In this paper, we propose an adaptive LM fusion approach called internal language model estimation based adaptive domain adaptation (ILME-ADA), in which an interpolated log-likelihood score is calculated from the maximum of the scores of the internal LM and the external LM (ELM). We demonstrate the efficacy of ILME-ADA with both RNN-T and LAS modeling frameworks, employing neural-network and n-gram LMs as ELMs, on two domain-specific (target) test sets. Compared with both shallow and ILME-based LM fusion, the proposed method achieves significantly better performance on the target test sets with minimal degradation on the general test set.
Submitted 2 March, 2023; v1 submitted 2 November, 2022; originally announced November 2022.
Comments: Accepted by ICASSP 2023
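The abstract specifies only the shape of the scoring rule: an interpolated log-likelihood built from the maximum of the internal-LM and external-LM scores. One plausible per-token reading, with the interpolation weight as a free parameter, is sketched below; treat the exact formula as an assumption, not the paper's verified equation.

```python
import math

def ilme_ada_score(log_p_e2e, log_p_ilm, log_p_elm, lam=0.3):
    """Hypothetical per-token fusion score in the spirit of ILME-ADA:
    subtract the internal-LM estimate, then add back the better of the
    internal and external LM scores, weighted by `lam`."""
    return log_p_e2e - lam * log_p_ilm + lam * max(log_p_ilm, log_p_elm)

# Toy numbers: when the ELM wins the max (in-domain token), it steers decoding;
# when the ILM wins (general-domain token), the correction cancels out.
print(ilme_ada_score(math.log(0.4), math.log(0.05), math.log(0.2)))
print(ilme_ada_score(math.log(0.4), math.log(0.2), math.log(0.05)))
```

A pleasant side effect of the max is visible in the toy prints: when the internal LM wins, the subtraction and the max cancel and the score falls back to the plain end-to-end score, consistent with the reported minimal degradation on the general domain.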
arXiv:2210.15272 (https://arxiv.org/abs/2210.15272) [pdf, ps, other]
Subjects: eess.AS (Audio and Speech Processing); cs.SD (Sound); eess.SP (Signal Processing)
Title: A Fast and Accurate Pitch Estimation Algorithm Based on the Pseudo Wigner-Ville Distribution
Authors: Yisi Liu, Peter Wu, Alan W Black, Gopala K. Anumanchipalli
Abstract: Estimation of the fundamental frequency (F0) in voiced segments of speech signals, also known as pitch tracking, plays a crucial role in pitch-synchronous speech analysis, speech synthesis, and speech manipulation. In this paper, we capitalize on the high time and frequency resolution of the pseudo Wigner-Ville distribution (PWVD) and propose a new PWVD-based pitch estimation method. We devise an efficient algorithm to compute the PWVD faster and use cepstrum-based pre-filtering to avoid cross-term interference. Evaluating our approach on a database with speech and electroglottograph (EGG) recordings yields a state-of-the-art mean absolute error (MAE) of around 4 Hz. Our approach is also effective at voiced/unvoiced classification and at handling sudden frequency changes.
Submitted 27 October, 2022; originally announced October 2022.
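The basic PWVD recipe behind such a tracker can be sketched compactly: form the windowed instantaneous autocorrelation around each analysis instant, FFT it, and read the dominant instantaneous frequency off the peak bin. The sketch below is that textbook baseline only; the paper's contributions (the faster PWVD computation and the cepstrum-based pre-filtering against cross-terms) are not reproduced, and the frame sizes are arbitrary choices.

```python
import numpy as np
from scipy.signal import hilbert

def pwvd_pitch(x, fs, frame_hop=160, half_win=128, nfft=512):
    """Rough pitch tracker from the pseudo Wigner-Ville distribution:
    at each instant n, form the windowed instantaneous autocorrelation
    r[k] = x[n+k] * conj(x[n-k]) and FFT it; the peak bin gives the
    dominant instantaneous frequency."""
    z = hilbert(x)                      # analytic signal: no negative freqs
    win = np.hamming(2 * half_win + 1)
    k = np.arange(-half_win, half_win + 1)
    f0 = []
    for n in range(half_win, len(z) - half_win, frame_hop):
        r = win * z[n + k] * np.conj(z[n - k])  # instantaneous autocorrelation
        spec = np.abs(np.fft.fft(r, nfft))
        peak = int(np.argmax(spec[: nfft // 2]))
        # WVD frequency axis is fs / (2 * nfft) per bin (note the factor 2)
        f0.append(peak * fs / (2 * nfft))
    return np.array(f0)

fs = 16000
t = np.arange(0, 0.5, 1 / fs)
tone = np.sin(2 * np.pi * 120 * t)   # synthetic 120 Hz "voiced" signal
print(pwvd_pitch(tone, fs)[:5])      # about 120 Hz, within one 15.6 Hz bin
```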
arXiv:2210.15173 (https://arxiv.org/abs/2210.15173) [pdf, other]
DOI: https://doi.org/10.1109/ICASSP49357.2023.10096800
Subjects: cs.SD (Sound); cs.AI (Artificial Intelligence); cs.CL (Computation and Language); eess.AS (Audio and Speech Processing)
Title: Articulation GAN: Unsupervised modeling of articulatory learning
Authors: Gašper Beguš, Alan Zhou, Peter Wu, Gopala K Anumanchipalli
Abstract: Generative deep neural networks are widely used for speech synthesis, but most existing models directly generate waveforms or spectral outputs. Humans, however, produce speech by controlling articulators, which results in speech sounds through the physical properties of sound propagation. We introduce the Articulatory Generator to the Generative Adversarial Network paradigm, a new unsupervised generative model of speech production/synthesis. The Articulatory Generator more closely mimics human speech production by learning to generate articulatory representations (electromagnetic articulography, or EMA) in a fully unsupervised manner. A separate pre-trained physical model (ema2wav) then transforms the generated EMA representations into speech waveforms, which are sent to the Discriminator for evaluation. Articulatory analysis suggests that the network learns to control articulators in a manner similar to humans during speech production. Acoustic analysis of the outputs suggests that the network learns to generate words that are both present and absent in the training distribution. We additionally discuss the implications of articulatory representations for cognitive models of human language and for speech technology in general.
Submitted 12 March, 2023; v1 submitted 27 October, 2022; originally announced October 2022.
Comments: ICASSP 2023
Journal ref: ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing
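A minimal PyTorch wiring of the described pipeline, for orientation only: a generator that emits EMA trajectories, a frozen stand-in for the pre-trained ema2wav physical model, and a waveform discriminator. All module sizes, and the linear ema2wav placeholder in particular, are invented for this sketch; only the topology follows the abstract.

```python
import torch
import torch.nn as nn

class ArticulatoryGenerator(nn.Module):
    """Maps a latent vector to a (T, 12) EMA trajectory."""
    def __init__(self, z_dim=64, T=100, ema_dim=12):
        super().__init__()
        self.T, self.ema_dim = T, ema_dim
        self.net = nn.Sequential(
            nn.Linear(z_dim, 256), nn.ReLU(),
            nn.Linear(256, T * ema_dim),
        )
    def forward(self, z):
        return self.net(z).view(-1, self.T, self.ema_dim)

# Placeholder for the frozen, pre-trained ema2wav physical model.
ema2wav = nn.Sequential(nn.Flatten(), nn.Linear(100 * 12, 1600))
for p in ema2wav.parameters():
    p.requires_grad = False   # only G and D would be trained

discriminator = nn.Sequential(nn.Linear(1600, 128), nn.ReLU(), nn.Linear(128, 1))

G = ArticulatoryGenerator()
z = torch.randn(4, 64)
wav = ema2wav(G(z))               # gradients flow through the frozen decoder to G
print(discriminator(wav).shape)   # torch.Size([4, 1])
```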
arXiv:2210.11723 (https://arxiv.org/abs/2210.11723) [pdf, other]
DOI: https://doi.org/10.1109/ICASSP49357.2023.10094711
Subjects: eess.AS (Audio and Speech Processing); cs.AI (Artificial Intelligence); cs.SD (Sound)
Title: Evidence of Vocal Tract Articulation in Self-Supervised Learning of Speech
Authors: Cheol Jun Cho, Peter Wu, Abdelrahman Mohamed, Gopala K. Anumanchipalli
Abstract: Recent self-supervised learning (SSL) models have proven to learn rich representations of speech, which can readily be utilized by diverse downstream tasks. To understand such utility, various analyses have been done on speech SSL models to reveal what information is encoded in the learned representations and how. Although the scope of previous analyses is extensive from acoustic, phonetic, and semantic perspectives, the physical grounding in speech production has not yet received full attention. To bridge this gap, we conduct a comprehensive analysis that links speech representations to articulatory trajectories measured by electromagnetic articulography (EMA). Our analysis is based on a linear probing approach, where we measure the articulatory score as the average correlation of a linear mapping to EMA. We analyze a set of SSL models selected from the leaderboard of the SUPERB benchmark and perform further layer-wise analyses on the two most successful models, Wav2Vec 2.0 and HuBERT. Surprisingly, representations from recent speech SSL models are highly correlated with EMA traces (best: r = 0.81), and only 5 minutes of data are sufficient to train a linear model with high performance (r = 0.77). Our findings suggest that SSL models learn to align closely with continuous articulations, providing novel insight into speech SSL.
Submitted 20 July, 2023; v1 submitted 21 October, 2022; originally announced October 2022.
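The probing protocol described here is simple enough to sketch end to end: fit a linear (here ridge) regression from SSL features to EMA channels, then average per-channel Pearson correlations on held-out frames. The split, regularizer, and synthetic data below are illustrative assumptions, not the paper's exact protocol.

```python
import numpy as np
from sklearn.linear_model import Ridge

def articulatory_score(features, ema, train_frac=0.8, alpha=1.0):
    """Linear probe: ridge regression from SSL features to EMA channels,
    scored as the average held-out Pearson correlation per channel."""
    n_train = int(train_frac * len(features))
    probe = Ridge(alpha=alpha).fit(features[:n_train], ema[:n_train])
    pred, true = probe.predict(features[n_train:]), ema[n_train:]
    corrs = [np.corrcoef(pred[:, c], true[:, c])[0, 1] for c in range(ema.shape[1])]
    return float(np.mean(corrs))

rng = np.random.default_rng(2)
feats = rng.normal(size=(2000, 768))                 # e.g. one SSL layer's features
W = rng.normal(size=(768, 12))
ema = feats @ W + 0.5 * rng.normal(size=(2000, 12))  # synthetic linear target
print(round(articulatory_score(feats, ema), 2))      # high r on this toy data
```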
arXiv:2209.06337 (https://arxiv.org/abs/2209.06337) [pdf, other]
Subjects: eess.AS (Audio and Speech Processing); cs.SD (Sound); q-bio.QM (Quantitative Methods)
Title: Deep Speech Synthesis from Articulatory Representations
Authors: Peter Wu, Shinji Watanabe, Louis Goldstein, Alan W Black, Gopala K. Anumanchipalli
Abstract: In the articulatory synthesis task, speech is synthesized from input features containing information about the physical behavior of the human vocal tract. This task provides a promising direction for speech synthesis research, as the articulatory space is compact, smooth, and interpretable. Current works have highlighted the potential for deep learning models to perform articulatory synthesis. However, it remains unclear whether these models can achieve the efficiency and fidelity of the human speech production system. To help bridge this gap, we propose a time-domain articulatory synthesis methodology and demonstrate its efficacy with both electromagnetic articulography (EMA) and synthetic articulatory feature inputs. Our model is computationally efficient and achieves a transcription word error rate (WER) of 18.5% for the EMA-to-speech task, yielding an improvement of 11.6% compared to prior work. Through interpolation experiments, we also highlight the generalizability and interpretability of our approach.
Submitted 13 September, 2022; originally announced September 2022.
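The WER used for that evaluation is the standard Levenshtein-based word error rate over ASR transcripts of the synthesized audio. For reference, a self-contained implementation of the metric (the example transcripts are toy strings):

```python
def wer(ref, hyp):
    """Word error rate: (substitutions + insertions + deletions) / len(ref),
    computed by dynamic-programming edit distance over words."""
    r, h = ref.split(), hyp.split()
    d = [[0] * (len(h) + 1) for _ in range(len(r) + 1)]
    for i in range(len(r) + 1):
        d[i][0] = i                       # deleting i reference words
    for j in range(len(h) + 1):
        d[0][j] = j                       # inserting j hypothesis words
    for i in range(1, len(r) + 1):
        for j in range(1, len(h) + 1):
            sub = d[i - 1][j - 1] + (r[i - 1] != h[j - 1])
            d[i][j] = min(sub, d[i - 1][j] + 1, d[i][j - 1] + 1)
    return d[len(r)][len(h)] / len(r)

print(wer("the cat sat on the mat", "the cat sat on a mat"))  # 1/6, about 0.167
```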
arXiv:2208.08433 (https://arxiv.org/abs/2208.08433) [pdf, other]
Subjects: cs.CR (Cryptography and Security); cs.HC (Human-Computer Interaction); cs.LG (Machine Learning); eess.SP (Signal Processing)
Title: Label Flipping Data Poisoning Attack Against Wearable Human Activity Recognition System
Authors: Abdur R. Shahid, Ahmed Imteaj, Peter Y. Wu, Diane A. Igoche, Tauhidul Alam
Abstract: Human Activity Recognition (HAR) is the problem of interpreting sensor data as human movement using an efficient machine learning (ML) approach. HAR systems rely on data from untrusted users, making them susceptible to data poisoning attacks. In a poisoning attack, attackers manipulate the sensor readings to contaminate the training set, misleading the HAR model into producing erroneous outcomes. This paper presents the design of a label flipping data poisoning attack on a HAR system, in which the label of a sensor reading is maliciously changed during the data collection phase. Due to the high noise and uncertainty of the sensing environment, such an attack poses a severe threat to the recognition system; vulnerability to label flipping attacks is especially dangerous when activity recognition models are deployed in safety-critical applications. This paper sheds light on how to carry out the attack in practice through smartphone-based sensor data collection applications. To our knowledge, this is among the earliest research exploring attacks on HAR models via label flipping poisoning. We implement the proposed attack and test it on activity recognition models based on the following machine learning algorithms: multi-layer perceptron, decision tree, random forest, and XGBoost. Finally, we evaluate the effectiveness of a K-nearest neighbors (KNN)-based defense mechanism against the proposed attack.
Submitted 17 August, 2022; originally announced August 2022.
Comments: Submitted to IEEE SSCI 2022 Conference
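Both the attack and the KNN defense are easy to reproduce in miniature: flip the labels of a fraction of the training points, watch test accuracy drop, then relabel each training point by the majority vote of its neighbors. The synthetic dataset, 20% flip rate, and random-forest model below are stand-ins for the paper's sensor data and experimental grid.

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

rng = np.random.default_rng(3)
X, y = make_classification(n_samples=2000, n_features=20, n_classes=4,
                           n_informative=10, random_state=3)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=3)

# Attack: flip the labels of 20% of the training points to a random wrong class.
y_poison = y_tr.copy()
idx = rng.choice(len(y_tr), size=int(0.2 * len(y_tr)), replace=False)
y_poison[idx] = (y_poison[idx] + rng.integers(1, 4, size=len(idx))) % 4

# Defense: relabel each training point by the majority vote of its neighbors.
knn = KNeighborsClassifier(n_neighbors=15).fit(X_tr, y_poison)
y_clean = knn.predict(X_tr)

for name, labels in [("clean", y_tr), ("poisoned", y_poison), ("defended", y_clean)]:
    acc = RandomForestClassifier(random_state=3).fit(X_tr, labels).score(X_te, y_te)
    print(f"{name:9s} test accuracy: {acc:.3f}")
```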
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to IEEE SSCI 2022 Conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2205.04029">arXiv:2205.04029</a> <span> [<a href="https://arxiv.org/pdf/2205.04029">pdf</a>, <a href="https://arxiv.org/format/2205.04029">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Muskits: an End-to-End Music Processing Toolkit for Singing Voice Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Shi%2C+J">Jiatong Shi</a>, <a href="/search/eess?searchtype=author&query=Guo%2C+S">Shuai Guo</a>, <a href="/search/eess?searchtype=author&query=Qian%2C+T">Tao Qian</a>, <a href="/search/eess?searchtype=author&query=Huo%2C+N">Nan Huo</a>, <a href="/search/eess?searchtype=author&query=Hayashi%2C+T">Tomoki Hayashi</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+Y">Yuning Wu</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+F">Frank Xu</a>, <a href="/search/eess?searchtype=author&query=Chang%2C+X">Xuankai Chang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+H">Huazhe Li</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+P">Peter Wu</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a>, <a href="/search/eess?searchtype=author&query=Jin%2C+Q">Qin Jin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2205.04029v2-abstract-short" style="display: inline;"> This paper introduces a new open-source platform named Muskits for end-to-end music processing, which mainly focuses on end-to-end singing voice synthesis (E2E-SVS). Muskits supports state-of-the-art SVS models, including RNN SVS, transformer SVS, and XiaoiceSing. The design of Muskits follows the style of widely-used speech processing toolkits, ESPnet and Kaldi, for data prepossessing, training,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.04029v2-abstract-full').style.display = 'inline'; document.getElementById('2205.04029v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2205.04029v2-abstract-full" style="display: none;"> This paper introduces a new open-source platform named Muskits for end-to-end music processing, which mainly focuses on end-to-end singing voice synthesis (E2E-SVS). Muskits supports state-of-the-art SVS models, including RNN SVS, transformer SVS, and XiaoiceSing. The design of Muskits follows the style of widely-used speech processing toolkits, ESPnet and Kaldi, for data prepossessing, training, and recipe pipelines. To the best of our knowledge, this toolkit is the first platform that allows a fair and highly-reproducible comparison between several published works in SVS. 
arXiv:2112.14633 (https://arxiv.org/abs/2112.14633) [pdf, other]
Subjects: cs.IT (Information Theory); eess.SP (Signal Processing)
Title: Bayesian Compressive Channel Estimation for Hybrid Full-Dimensional MIMO Communications
Authors: Hongqing Huang, Peiran Wu, Minghua Xia
Abstract: Efficient channel estimation is challenging in full-dimensional multiple-input multiple-output communication systems, particularly in those with hybrid digital-analog architectures. Under a compressive sensing framework, this letter first designs a uniform dictionary based on a spherical Fibonacci grid to represent channels in a sparse domain, yielding smaller angular errors in three-dimensional beamspace than traditional dictionaries. Then, a Bayesian inference-aided greedy pursuit algorithm is developed to estimate channels in the frequency domain. Finally, simulation results demonstrate that both the designed dictionary and the proposed Bayesian channel estimation outperform the benchmark schemes and attain a lower normalized mean squared error of channel estimation.
Submitted 29 December, 2021; originally announced December 2021.
Comments: 5 pages, 5 figures, submitted for possible publication
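The spherical Fibonacci grid underlying the designed dictionary is a well-known construction: step around the sphere by the golden angle while descending uniformly in z. The sketch below generates the grid only; building the actual dictionary would additionally evaluate the array's steering vector at each grid direction, which is omitted here.

```python
import numpy as np

def fibonacci_sphere(n):
    """n nearly uniform points on the unit sphere via the golden-angle
    (Fibonacci) spiral."""
    i = np.arange(n)
    phi = (1 + 5 ** 0.5) / 2              # golden ratio
    theta = 2 * np.pi * i / phi           # longitude: golden-angle steps
    z = 1 - (2 * i + 1) / n               # latitude: uniform in z
    r = np.sqrt(1 - z ** 2)
    return np.stack([r * np.cos(theta), r * np.sin(theta), z], axis=1)

pts = fibonacci_sphere(256)   # one candidate direction per dictionary atom
print(pts.shape, np.allclose(np.linalg.norm(pts, axis=1), 1.0))  # (256, 3) True
```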
arXiv:2111.15636 (https://arxiv.org/abs/2111.15636) [pdf]
Subjects: eess.SP (Signal Processing); cs.AI (Artificial Intelligence); stat.AP (Applications)
Title: Generating gapless land surface temperature with a high spatio-temporal resolution by fusing multi-source satellite-observed and model-simulated data
Authors: Jun Ma, Huanfeng Shen, Penghai Wu, Jingan Wu, Meiling Gao, Chunlei Meng
Abstract: Land surface temperature (LST) is a key parameter when monitoring land surface processes. However, cloud contamination and the tradeoff between spatial and temporal resolutions greatly impede access to high-quality thermal infrared (TIR) remote sensing data. Despite the massive efforts made to resolve these dilemmas, it is still difficult to generate LST estimates with concurrent spatial completeness and a high spatio-temporal resolution. Land surface models (LSMs) can be used to simulate gapless LST with a high temporal resolution, but usually at a low spatial resolution. In this paper, we present an integrated temperature fusion framework for satellite-observed and LSM-simulated LST data to map gapless LST at a 60-m spatial resolution and a half-hourly temporal resolution. The global linear model (GloLM) and the diurnal land surface temperature cycle (DTC) model are performed as preprocessing steps for inter-sensor and temporal normalization between the different LST datasets, respectively. Landsat LST, Moderate Resolution Imaging Spectroradiometer (MODIS) LST, and Community Land Model version 5.0 (CLM 5.0)-simulated LST are then fused using a filter-based spatio-temporal integrated fusion model. Evaluations were implemented in an urban-dominated region (the city of Wuhan, China) and a nature-dominated region (the Heihe River Basin, China) in terms of accuracy, spatial variability, and diurnal temporal dynamics. Results indicate that the fused LST is highly consistent with actual Landsat LST data (and in situ LST measurements), with a Pearson correlation coefficient of 0.94 (0.97-0.99), a mean absolute error of 0.71-0.98 K (0.82-3.17 K), and a root-mean-square error of 0.97-1.26 K (1.09-3.97 K).
Submitted 28 November, 2021; originally announced November 2021.
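The GloLM preprocessing step is described as a global linear relationship between sensors, so a least-squares stand-in is sketched below: fit y = a*x + b between coincident coarse- and fine-sensor LST values, then use (a, b) to normalize one sensor to the other. The synthetic MODIS/Landsat numbers and the plain least-squares estimator are assumptions for illustration, not the paper's exact estimator.

```python
import numpy as np

def global_linear_model(coarse_lst, fine_lst):
    """Fit the global linear relationship fine = a * coarse + b over the
    sensors' overlap, in the spirit of the GloLM normalization step."""
    A = np.stack([coarse_lst, np.ones_like(coarse_lst)], axis=1)
    (a, b), *_ = np.linalg.lstsq(A, fine_lst, rcond=None)
    return a, b

rng = np.random.default_rng(4)
modis = 290 + 10 * rng.random(500)                        # synthetic MODIS LST, K
landsat = 1.02 * modis - 4.5 + rng.normal(0, 0.5, 500)    # synthetic Landsat LST, K
a, b = global_linear_model(modis, landsat)
print(round(a, 3), round(b, 2))   # recovers roughly (1.02, -4.5)
```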
[End of results 1-50; results 51-80 continue on the next page.]