Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 146 results for author: <span class="mathjax">Hu, S</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/eess" aria-role="search"> Searching in archive <strong>eess</strong>. <a href="/search/?searchtype=author&query=Hu%2C+S">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Hu, S"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Hu%2C+S&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Hu, S"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Hu%2C+S&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Hu%2C+S&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Hu%2C+S&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Hu%2C+S&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09291">arXiv:2502.09291</a> <span> [<a href="https://arxiv.org/pdf/2502.09291">pdf</a>, <a href="https://arxiv.org/ps/2502.09291">ps</a>, <a href="https://arxiv.org/format/2502.09291">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Joint Attention Mechanism Learning to Facilitate Opto-physiological Monitoring during Physical Activity </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zheng%2C+X">Xiaoyu Zheng</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+S">Sijung Hu</a>, <a href="/search/eess?searchtype=author&query=Dwyer%2C+V">Vincent Dwyer</a>, <a href="/search/eess?searchtype=author&query=Derakhshani%2C+M">Mahsa Derakhshani</a>, <a href="/search/eess?searchtype=author&query=Barrett%2C+L">Laura Barrett</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09291v1-abstract-short" style="display: inline;"> Opto-physiological monitoring is a non-contact technique for measuring cardiac signals, i.e., photoplethysmography (PPG). Quality PPG signals directly lead to reliable physiological readings. However, PPG signal acquisition procedures are often accompanied by spurious motion artefacts (MAs), especially during low-to-high-intensity physical activity. 
Abstract: Opto-physiological monitoring is a non-contact technique for measuring cardiac signals, i.e., photoplethysmography (PPG). Quality PPG signals directly lead to reliable physiological readings. However, PPG signal acquisition procedures are often accompanied by spurious motion artefacts (MAs), especially during low-to-high-intensity physical activity. This study proposes a practical adversarial learning approach for opto-physiological monitoring that uses a generative adversarial network with an attention mechanism (AM-GAN) to model motion noise and remove MAs. The AM-GAN learns an MA-resistant mapping from raw, noisy signals to clear PPG signals in an adversarial manner, guided by an attention mechanism that translates the motion reference of triaxial acceleration directly to the MAs appearing in the raw signal. The AM-GAN was evaluated on three different protocols with 39 subjects engaged in various physical activities. The average absolute error for heart rate (HR) derived from the MA-free PPG signal via the AM-GAN is 1.81 beats/min for the IEEE-SPC dataset and 3.86 beats/min for the PPGDalia dataset. The same procedure applied to an in-house LU dataset resulted in average absolute errors for HR and respiratory rate (RR) of less than 1.37 beats/min and 2.49 breaths/min, respectively. The study demonstrates the robustness and resilience of AM-GAN, particularly during low-to-high-intensity physical activities.
Submitted 13 February, 2025; originally announced February 2025.
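As a rough sketch of the attention idea in this abstract (an accelerometer-derived motion reference gating a motion-artefact estimate that is subtracted from the raw PPG), consider the following minimal PyTorch generator. The module structure, layer sizes, and gating design are illustrative assumptions, not the paper's implementation, and the adversarial discriminator is omitted.

    import torch
    import torch.nn as nn

    class AttentionMAGenerator(nn.Module):
        """Hypothetical generator: triaxial acceleration serves as a motion
        reference that gates a motion-artefact (MA) estimate, which is then
        subtracted from the raw PPG to yield an MA-free signal."""
        def __init__(self, hidden=32):
            super().__init__()
            self.ppg_enc = nn.Conv1d(1, hidden, kernel_size=7, padding=3)
            self.acc_enc = nn.Conv1d(3, hidden, kernel_size=7, padding=3)
            self.attn = nn.Sequential(nn.Conv1d(hidden, 1, 1), nn.Sigmoid())
            self.artefact = nn.Conv1d(hidden, 1, kernel_size=7, padding=3)

        def forward(self, ppg, acc):            # ppg: (B,1,T), acc: (B,3,T)
            h_ppg = torch.relu(self.ppg_enc(ppg))
            h_acc = torch.relu(self.acc_enc(acc))
            gate = self.attn(h_acc)             # attention from motion reference
            ma = self.artefact(h_ppg * h_acc)   # artefact estimate in PPG space
            return ppg - gate * ma              # cleaned PPG

    # A discriminator judging cleaned vs. reference PPG windows would complete
    # the adversarial setup; training pairs this generator with a GAN loss.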

2. arXiv:2502.06817 [pdf, other]
Subjects: eess.IV (Image and Video Processing); cs.GR (Graphics); cs.LG (Machine Learning)
Title: Diffusion-empowered AutoPrompt MedSAM
Authors: Peng Huang, Shu Hu, Bo Peng, Jiashu Zhang, Hongtu Zhu, Xi Wu, Xin Wang
Abstract: MedSAM, a medical foundation model derived from the SAM architecture, has demonstrated notable success across diverse medical domains. However, its clinical application faces two major challenges: the dependency on labor-intensive manual prompt generation, which imposes a significant burden on clinicians, and the absence of semantic labeling in the generated segmentation masks for organs or lesions, limiting its practicality for non-expert users. To address these limitations, we propose AutoMedSAM, an end-to-end framework derived from SAM, designed to enhance usability and segmentation performance. AutoMedSAM retains MedSAM's image encoder and mask decoder structure while introducing a novel diffusion-based class prompt encoder. The diffusion-based encoder employs a dual-decoder structure to collaboratively generate prompt embeddings guided by sparse and dense prompt definitions. These embeddings enhance the model's ability to understand and process clinical imagery autonomously. With this encoder, AutoMedSAM leverages class prompts to embed semantic information into the model's predictions, transforming MedSAM's semi-automated pipeline into a fully automated workflow.
Furthermore, AutoMedSAM employs an uncertainty-aware joint optimization strategy during training to effectively inherit MedSAM's pre-trained knowledge while improving generalization by integrating multiple loss functions. Experimental results across diverse datasets demonstrate that AutoMedSAM achieves superior performance while broadening its applicability to both clinical settings and non-expert users. Code is available at https://github.com/HP-ML/AutoPromptMedSAM.git.
Submitted 4 February, 2025; originally announced February 2025.

3. arXiv:2501.07127 [pdf, ps, other]
Subjects: eess.IV (Image and Video Processing)
Title: QoE-oriented Communication Service Provision for Annotation Rendering in Mobile Augmented Reality
Authors: Lulu Sun, Conghao Zhou, Shisheng Hu, Yupeng Zhu, Nan Cheng, Xu, Xia
Abstract: As mobile augmented reality (MAR) continues to evolve, future 6G networks will play a pivotal role in supporting immersive and personalized user experiences. In this paper, we address the communication service provision problem for annotation rendering in edge-assisted MAR, with the objective of optimizing spectrum resource utilization while ensuring the required quality of experience (QoE) for MAR users.
To overcome the challenges of user-specific uplink data traffic patterns and the complex operational mechanisms of annotation rendering, we propose a digital twin (DT)-based approach. We first design a DT specifically tailored for MAR applications to learn key annotation rendering mechanisms, enabling the network controller to access MAR application-specific information. Then, we develop a DT-based QoE modeling approach to capture the unique relationship between individual user QoE and spectrum resource demands. Finally, we propose a QoE-oriented resource allocation algorithm that decreases resource utilization compared to conventional network slicing-based approaches. Simulation results demonstrate that our DT-based approach outperforms benchmark approaches in the accuracy and granularity of QoE modeling.
Submitted 13 January, 2025; originally announced January 2025.
Comments: 7 pages, 4 figures

4. arXiv:2501.04379 [pdf, ps, other]
Subjects: cs.SD (Sound); eess.AS (Audio and Speech Processing)
Title: Phone-purity Guided Discrete Tokens for Dysarthric Speech Recognition
Authors: Huimeng Wang, Xurong Xie, Mengzhe Geng, Shujie Hu, Haoning Xu, Youjun Chen, Zhaoqing Li, Jiajun Deng, Xunying Liu
Abstract: Extracted discrete tokens provide efficient and domain-adaptable speech features. Their application to disordered speech, which exhibits articulatory imprecision and a large mismatch against normal voice, remains unexplored. To improve their phonetic discrimination, which is weakened during unsupervised K-means or vector quantization of continuous features, this paper proposes novel phone-purity guided (PPG) discrete tokens for dysarthric speech recognition. Phonetic label supervision is used to regularize the maximum likelihood and reconstruction error costs used in standard K-means and VAE-VQ based discrete token extraction. Experiments conducted on the UASpeech corpus suggest that the proposed PPG discrete token features extracted from HuBERT consistently outperform hybrid TDNN and end-to-end (E2E) Conformer systems using non-PPG based K-means or VAE-VQ tokens across varying codebook sizes, with statistically significant word error rate (WER) reductions of up to 0.99% and 1.77% absolute (3.21% and 4.82% relative) respectively on the UASpeech test set of 16 dysarthric speakers. The lowest WER of 23.25% was obtained by combining systems using different token features. Consistent improvements on the phone purity metric were also achieved. T-SNE visualization further demonstrates that sharper decision boundaries were produced between K-means/VAE-VQ clusters after introducing phone-purity guidance.
Submitted 8 January, 2025; originally announced January 2025.
Comments: ICASSP 2025
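The core move in this abstract, regularizing unsupervised token extraction with phonetic labels, can be pictured with a toy K-means variant in which frames pay an extra assignment cost when their phone label disagrees with a cluster's current majority phone. The penalty weight and majority-vote rule below are illustrative assumptions, not the paper's PPG objective.

    import numpy as np

    def phone_guided_kmeans(X, phones, k, lam=0.5, iters=20, seed=0):
        """Toy K-means over speech frames X (N, D) with frame-level phone
        labels phones (N,): the assignment cost adds lam whenever a frame's
        phone disagrees with the cluster's current majority phone."""
        X, phones = np.asarray(X, float), np.asarray(phones)
        rng = np.random.default_rng(seed)
        C = X[rng.choice(len(X), k, replace=False)]      # initial centroids
        majority = np.full(k, -1)                        # majority phone per cluster
        for _ in range(iters):
            dist = ((X[:, None, :] - C[None, :, :]) ** 2).sum(-1)   # (N, k)
            mismatch = (phones[:, None] != majority[None, :]) & (majority >= 0)
            assign = (dist + lam * mismatch).argmin(1)
            for j in range(k):
                members = assign == j
                if members.any():
                    C[j] = X[members].mean(0)
                    vals, counts = np.unique(phones[members], return_counts=True)
                    majority[j] = vals[counts.argmax()]
        return C, assign, majority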
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICASSP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.03643">arXiv:2501.03643</a> <span> [<a href="https://arxiv.org/pdf/2501.03643">pdf</a>, <a href="https://arxiv.org/format/2501.03643">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Effective and Efficient Mixed Precision Quantization of Speech Foundation Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Xu%2C+H">Haoning Xu</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Z">Zhaoqing Li</a>, <a href="/search/eess?searchtype=author&query=Jin%2C+Z">Zengrui Jin</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+H">Huimeng Wang</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Youjun Chen</a>, <a href="/search/eess?searchtype=author&query=Li%2C+G">Guinan Li</a>, <a href="/search/eess?searchtype=author&query=Geng%2C+M">Mengzhe Geng</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+S">Shujie Hu</a>, <a href="/search/eess?searchtype=author&query=Deng%2C+J">Jiajun Deng</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+X">Xunying Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.03643v2-abstract-short" style="display: inline;"> This paper presents a novel mixed-precision quantization approach for speech foundation models that tightly integrates mixed-precision learning and quantized model parameter estimation into one single model compression stage. Experiments conducted on LibriSpeech dataset with fine-tuned wav2vec2.0-base and HuBERT-large models suggest the resulting mixed-precision quantized models increased the loss… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03643v2-abstract-full').style.display = 'inline'; document.getElementById('2501.03643v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.03643v2-abstract-full" style="display: none;"> This paper presents a novel mixed-precision quantization approach for speech foundation models that tightly integrates mixed-precision learning and quantized model parameter estimation into one single model compression stage. Experiments conducted on LibriSpeech dataset with fine-tuned wav2vec2.0-base and HuBERT-large models suggest the resulting mixed-precision quantized models increased the lossless compression ratio by factors up to 1.7x and 1.9x over the respective uniform-precision and two-stage mixed-precision quantized baselines that perform precision learning and model parameters quantization in separate and disjointed stages, while incurring no statistically word error rate (WER) increase over the 32-bit full-precision models. 
The system compression time of wav2vec2.0-base and HuBERT-large models is reduced by up to 1.9 and 1.5 times over the two-stage mixed-precision baselines, while both produce lower WERs. The best-performing 3.5-bit mixed-precision quantized HuBERT-large model produces a lossless compression ratio of 8.6x over the 32-bit full-precision system.
Submitted 11 January, 2025; v1 submitted 7 January, 2025; originally announced January 2025.
Comments: To appear at IEEE ICASSP 2025

6. arXiv:2412.19279 [pdf, other]
Subjects: cs.SD (Sound); cs.LG (Machine Learning); eess.AS (Audio and Speech Processing)
Title: Improving Generalization for AI-Synthesized Voice Detection
Authors: Hainan Ren, Li Lin, Chun-Hao Liu, Xin Wang, Shu Hu
Abstract: AI-synthesized voice technology has the potential to create realistic human voices for beneficial applications, but it can also be misused for malicious purposes.
While existing AI-synthesized voice detection models excel in intra-domain evaluation, they face challenges in generalizing across different domains, potentially becoming obsolete as new voice generators emerge. Current solutions use diverse data and advanced machine learning techniques (e.g., domain-invariant representation, self-supervised learning), but are limited by predefined vocoders and sensitivity to factors like background noise and speaker identity. In this work, we introduce an innovative disentanglement framework aimed at extracting domain-agnostic artifact features related to vocoders. Utilizing these features, we enhance model learning in a flat loss landscape, enabling escape from suboptimal solutions and improving generalization. Extensive experiments on benchmarks show our approach outperforms state-of-the-art methods, achieving up to 5.12% improvement in the equal error rate metric in intra-domain and 7.59% in cross-domain evaluations.
Submitted 30 December, 2024; v1 submitted 26 December, 2024; originally announced December 2024.
Comments: AAAI25

7. arXiv:2412.18832 [pdf, other]
Subjects: eess.AS (Audio and Speech Processing); cs.SD (Sound)
Title: Structured Speaker-Deficiency Adaptation of Foundation Models for Dysarthric and Elderly Speech Recognition
Authors: Shujie Hu, Xurong Xie, Mengzhe Geng, Jiajun Deng, Zengrui Jin, Tianzi Wang, Mingyu Cui, Guinan Li, Zhaoqing Li, Helen Meng, Xunying Liu
Abstract: Data-intensive fine-tuning of speech foundation models (SFMs) to scarce and diverse dysarthric and elderly speech leads to data bias and poor generalization to unseen speakers. This paper proposes novel structured speaker-deficiency adaptation approaches for SSL pre-trained SFMs on such data. Speaker and speech deficiency invariant SFMs were constructed in their supervised adaptive fine-tuning stage to reduce undue bias towards training data speakers, and serve as a more neutral and robust starting point for test-time unsupervised adaptation. Speech variability attributed to speaker identity and speech impairment severity, or aging-induced neurocognitive decline, is modelled using separate adapters that can be combined together to model any seen or unseen speaker. Experiments on the UASpeech dysarthric and DementiaBank Pitt elderly speech corpora suggest that structured speaker-deficiency adaptation of HuBERT and Wav2vec2-conformer models consistently outperforms baseline SFMs using either: a) no adapters; b) global adapters shared among all speakers; or c) single-attribute adapters modelling speaker or deficiency labels alone, with statistically significant WER reductions of up to 3.01% and 1.50% absolute (10.86% and 6.94% relative) on the two tasks respectively. The lowest published WER of 19.45% (49.34% on very low intelligibility, 33.17% on unseen words) is obtained on the UASpeech test set of 16 dysarthric speakers.
Submitted 25 December, 2024; originally announced December 2024.
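The "separate adapters that can be combined" idea reads, in its simplest form, as two residual bottleneck adapters, one indexed by speaker identity and one by impairment severity, summed on top of frozen foundation-model features. The sketch below is a generic rendering under that reading; the dimensions and composition-by-addition are assumptions, not the paper's exact design.

    import torch
    import torch.nn as nn

    class Bottleneck(nn.Module):
        """Residual bottleneck adapter: down-project, ReLU, up-project."""
        def __init__(self, dim, r=16):
            super().__init__()
            self.down = nn.Linear(dim, dim // r)
            self.up = nn.Linear(dim // r, dim)

        def forward(self, h):
            return self.up(torch.relu(self.down(h)))

    class SpeakerDeficiencyAdapter(nn.Module):
        """One adapter per speaker and one per severity level; any (seen or
        unseen) speaker is modelled by adding the two matching residuals."""
        def __init__(self, dim, n_speakers, n_severities):
            super().__init__()
            self.spk = nn.ModuleList(Bottleneck(dim) for _ in range(n_speakers))
            self.sev = nn.ModuleList(Bottleneck(dim) for _ in range(n_severities))

        def forward(self, h, spk_id, sev_id):   # h: (B, T, dim) frozen SFM features
            return h + self.spk[spk_id](h) + self.sev[sev_id](h)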

8. arXiv:2412.18619 [pdf, other]
Subjects: cs.CL (Computation and Language); cs.AI (Artificial Intelligence); cs.CV (Computer Vision and Pattern Recognition); cs.LG (Machine Learning); cs.MM (Multimedia); eess.AS (Audio and Speech Processing)
Title: Next Token Prediction Towards Multimodal Intelligence: A Comprehensive Survey
Authors: Liang Chen, Zekun Wang, Shuhuai Ren, Lei Li, Haozhe Zhao, Yunshui Li, Zefan Cai, Hongcheng Guo, Lei Zhang, Yizhe Xiong, Yichi Zhang, Ruoyu Wu, Qingxiu Dong, Ge Zhang, Jian Yang, Lingwei Meng, Shujie Hu, Yulong Chen, Junyang Lin, Shuai Bai, Andreas Vlachos, Xu Tan, Minjia Zhang, Wen Xiao, Aaron Yee, et al. (2 additional authors not shown)
Abstract: Building on the foundations of language modeling in natural language processing, Next Token Prediction (NTP) has evolved into a versatile training objective for machine learning tasks across various modalities, achieving considerable success.
As Large Language Models (LLMs) have advanced to unify understanding and generation tasks within the textual modality, recent research has shown that tasks from different modalities can also be effectively encapsulated within the NTP framework, transforming multimodal information into tokens and predicting the next one given the context. This survey introduces a comprehensive taxonomy that unifies both understanding and generation within multimodal learning through the lens of NTP. The proposed taxonomy covers five key aspects: multimodal tokenization, MMNTP model architectures, unified task representation, datasets and evaluation, and open challenges. This new taxonomy aims to aid researchers in their exploration of multimodal intelligence. An associated GitHub repository collecting the latest papers and repos is available at https://github.com/LMM101/Awesome-Multimodal-Next-Token-Prediction
Submitted 29 December, 2024; v1 submitted 16 December, 2024; originally announced December 2024.
Comments: 69 pages, 18 figures, repo at https://github.com/LMM101/Awesome-Multimodal-Next-Token-Prediction
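Whatever the modality, the shared training objective surveyed here reduces to causal cross-entropy over a token sequence. A minimal form (the standard next-token shift, with no survey-specific details) might look like:

    import torch.nn.functional as F

    def ntp_loss(logits, tokens):
        """Next-token prediction: position t predicts token t+1.
        logits: (B, T, V) from any causal model; tokens: (B, T) ids that may
        encode text, image patches, or audio frames after tokenization."""
        return F.cross_entropy(
            logits[:, :-1].reshape(-1, logits.size(-1)),  # predictions for 1..T-1
            tokens[:, 1:].reshape(-1),                    # shifted targets
        )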
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">69 papes, 18 figures, repo at https://github.com/LMM101/Awesome-Multimodal-Next-Token-Prediction</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.10559">arXiv:2412.10559</a> <span> [<a href="https://arxiv.org/pdf/2412.10559">pdf</a>, <a href="https://arxiv.org/format/2412.10559">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Numerical Analysis">math.NA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Dynamical Systems">math.DS</span> </div> </div> <p class="title is-5 mathjax"> Error Estimation and Stopping Criteria for Krylov-Based Model Order Reduction in Acoustics </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Hu%2C+S">Siyang Hu</a>, <a href="/search/eess?searchtype=author&query=Wulbusch%2C+N">Nick Wulbusch</a>, <a href="/search/eess?searchtype=author&query=Chernov%2C+A">Alexey Chernov</a>, <a href="/search/eess?searchtype=author&query=Bechtold%2C+T">Tamara Bechtold</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.10559v1-abstract-short" style="display: inline;"> Depending on the frequency range of interest, finite element-based modeling of acoustic problems leads to dynamical systems with very high dimensional state spaces. As these models can mostly be described with second order linear dynamical system with sparse matrices, mathematical model order reduction provides an interesting possibility to speed up the simulation process. In this work, we tackle… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.10559v1-abstract-full').style.display = 'inline'; document.getElementById('2412.10559v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.10559v1-abstract-full" style="display: none;"> Depending on the frequency range of interest, finite element-based modeling of acoustic problems leads to dynamical systems with very high dimensional state spaces. As these models can mostly be described with second order linear dynamical system with sparse matrices, mathematical model order reduction provides an interesting possibility to speed up the simulation process. In this work, we tackle the question of finding an optimal order for the reduced system, given a desired accuracy. To do so, we revisit a heuristic error estimator based on the difference of two reduced models from two consecutive Krylov iterations. We perform a mathematical analysis of the estimator and show that the difference of two consecutive reduced models does provide a sufficiently accurate estimation for the true model reduction error. This claim is supported by numerical experiments on two acoustic models. We briefly discuss its feasibility as a stopping criterion for Krylov-based model order reduction. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.10559v1-abstract-full').style.display = 'none'; document.getElementById('2412.10559v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 5 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 37M05; 65P99; 33J05 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14656">arXiv:2411.14656</a> <span> [<a href="https://arxiv.org/pdf/2411.14656">pdf</a>, <a href="https://arxiv.org/format/2411.14656">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Applications">stat.AP</span> </div> </div> <p class="title is-5 mathjax"> mmWave Radar for Sit-to-Stand Analysis: A Comparative Study with Wearables and Kinect </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Hu%2C+S">Shuting Hu</a>, <a href="/search/eess?searchtype=author&query=Ackun%2C+P">Peggy Ackun</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+X">Xiang Zhang</a>, <a href="/search/eess?searchtype=author&query=Cao%2C+S">Siyang Cao</a>, <a href="/search/eess?searchtype=author&query=Barton%2C+J">Jennifer Barton</a>, <a href="/search/eess?searchtype=author&query=Hector%2C+M+G">Melvin G. Hector</a>, <a href="/search/eess?searchtype=author&query=Fain%2C+M+J">Mindy J. Fain</a>, <a href="/search/eess?searchtype=author&query=Toosizadeh%2C+N">Nima Toosizadeh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14656v2-abstract-short" style="display: inline;"> This study explores a novel approach for analyzing Sit-to-Stand (STS) movements using millimeter-wave (mmWave) radar technology. The goal is to develop a non-contact sensing, privacy-preserving, and all-day operational method for healthcare applications, including fall risk assessment. We used a 60GHz mmWave radar system to collect radar point cloud data, capturing STS motions from 45 participants… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14656v2-abstract-full').style.display = 'inline'; document.getElementById('2411.14656v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14656v2-abstract-full" style="display: none;"> This study explores a novel approach for analyzing Sit-to-Stand (STS) movements using millimeter-wave (mmWave) radar technology. The goal is to develop a non-contact sensing, privacy-preserving, and all-day operational method for healthcare applications, including fall risk assessment. 
We used a 60 GHz mmWave radar system to collect radar point cloud data, capturing STS motions from 45 participants. Employing a deep learning pose estimation model, we learned the human skeleton from Kinect's built-in body tracking and applied inverse kinematics (IK) to calculate joint angles, segment STS motions, and extract features commonly used in fall risk assessment. Radar-extracted features were then compared with those obtained from Kinect and wearable sensors. The results demonstrated the effectiveness of mmWave radar in capturing general motion patterns and large joint movements (e.g., trunk). Additionally, the study highlights the advantages and disadvantages of individual sensors and suggests the potential of integrated sensor technologies to improve the accuracy and reliability of motion analysis in clinical and biomedical research settings.
Submitted 28 November, 2024; v1 submitted 21 November, 2024; originally announced November 2024.
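Joint angles of the kind used here to segment STS motion are, at their simplest, angles between vectors joining three skeleton keypoints. The helper below is that generic computation, not the authors' IK pipeline.

    import numpy as np

    def joint_angle(a, b, c):
        """Angle at keypoint b (degrees) formed by 3-D keypoints a-b-c,
        e.g., hip-knee-ankle for a knee angle over a sit-to-stand cycle."""
        u = np.asarray(a, float) - np.asarray(b, float)
        v = np.asarray(c, float) - np.asarray(b, float)
        cosang = u @ v / (np.linalg.norm(u) * np.linalg.norm(v))
        return np.degrees(np.arccos(np.clip(cosang, -1.0, 1.0)))

    print(joint_angle([0, 1, 0], [0, 0, 0], [1, 0, 0]))   # right angle: 90.0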

11. arXiv:2411.13766 [pdf, other]
Subjects: cs.SD (Sound); cs.AI (Artificial Intelligence); eess.AS (Audio and Speech Processing)
Title: Tiny-Align: Bridging Automatic Speech Recognition and Large Language Model on the Edge
Authors: Ruiyang Qin, Dancheng Liu, Gelei Xu, Zheyu Yan, Chenhui Xu, Yuting Hu, X. Sharon Hu, Jinjun Xiong, Yiyu Shi
Abstract: The combination of Large Language Models (LLMs) and Automatic Speech Recognition (ASR), when deployed on edge devices (called edge ASR-LLM), can serve as a powerful personalized assistant enabling audio-based interaction for users. Compared to text-based interaction, edge ASR-LLM allows accessible and natural audio interactions. Unfortunately, existing ASR-LLM models are mainly trained in high-performance computing environments and produce substantial model weights, making them difficult to deploy on edge devices. More importantly, to better serve users' personalized needs, the ASR-LLM must be able to learn from each distinct user, given that audio input often contains highly personalized characteristics that necessitate personalized on-device training. Since individually fine-tuning the ASR or LLM often leads to suboptimal results due to modality-specific limitations, end-to-end training ensures seamless integration of audio features and language understanding (cross-modal alignment), ultimately enabling a more personalized and efficient adaptation on edge devices. However, due to the complex training requirements and substantial computational demands of existing approaches, cross-modal alignment between ASR audio and LLM can be challenging on edge devices. In this work, we propose a resource-efficient cross-modal alignment framework that bridges ASR and LLMs on edge devices to handle personalized audio input. Our framework enables efficient ASR-LLM alignment on resource-constrained devices like the NVIDIA Jetson Orin (8GB RAM), achieving a 50x training time speedup while improving alignment quality by more than 50%. To the best of our knowledge, this is the first work to study efficient ASR-LLM alignment on resource-constrained edge devices.
Submitted 26 November, 2024; v1 submitted 20 November, 2024; originally announced November 2024.
Comments: 7 pages, 8 figures
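Cross-modal ASR-LLM alignment of the kind described here is commonly realized as a small trainable projector that maps frozen ASR encoder states into the LLM's embedding space, so only the projector needs on-device training. The sketch below shows that generic pattern; the dimensions and two-layer design are assumptions, not Tiny-Align's architecture.

    import torch
    import torch.nn as nn

    class AudioToLLMProjector(nn.Module):
        """Map frozen ASR encoder states (B, T, d_asr) into the LLM embedding
        space (B, T, d_llm); only this module is trained on the edge device."""
        def __init__(self, d_asr=512, d_llm=2048):
            super().__init__()
            self.proj = nn.Sequential(
                nn.Linear(d_asr, d_llm), nn.GELU(), nn.Linear(d_llm, d_llm))

        def forward(self, asr_states):
            # Output is consumed by the LLM in place of text token embeddings.
            return self.proj(asr_states)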
arXiv:2411.03099 [pdf] (https://arxiv.org/abs/2411.03099)
Subjects: eess.SY
Optimized Cryo-CMOS Technology with VTH<0.2V and Ion>1.2mA/um for High-Performance Computing
Authors: Chang He, Yue Xin, Longfei Yang, Zewei Wang, Zhidong Tang, Xin Luo, Renhe Chen, Zirui Wang, Shuai Kong, Jianli Wang, Jianshi Tang, Xiaoxu Kang, Shoumian Chen, Yuhang Zhao, Shaojian Hu, Xufeng Kou
Abstract: We report the design-technology co-optimization (DTCO) scheme to develop a 28-nm cryogenic CMOS (Cryo-CMOS) technology for high-performance computing (HPC). The precise adjustment of halo implants manages to compensate the threshold voltage (VTH) shift at low temperatures. The optimized NMOS and PMOS transistors, featured by VTH<0.2V, sub-threshold swing (SS)<30 mV/dec, and on-state current (Ion)>1.2mA/um at 77K, warrant a reliable sub-0.6V operation. Moreover, the enhanced driving strength of Cryo-CMOS, inherited from a higher transconductance, leads to marked improvements in elevating the ring oscillator frequency by 20% while reducing the power consumption of the compute-intensive cryogenic IC system by 37% at 77K.
Submitted 5 November, 2024; originally announced November 2024.

arXiv:2410.02592 [pdf, other] (https://arxiv.org/abs/2410.02592)
Subjects: cs.CV; cs.AI; cs.LG; eess.SY
IC3M: In-Car Multimodal Multi-object Monitoring for Abnormal Status of Both Driver and Passengers
Authors: Zihan Fang, Zheng Lin, Senkang Hu, Hangcheng Cao, Yiqin Deng, Xianhao Chen, Yuguang Fang
Abstract: Recently, in-car monitoring has emerged as a promising technology for detecting early-stage abnormal status of the driver and providing timely alerts to prevent traffic accidents. Although training models with multimodal data enhances the reliability of abnormal status detection, the scarcity of labeled data and the imbalance of class distribution impede the extraction of critical abnormal state features, significantly deteriorating training performance. Furthermore, missing modalities due to environment and hardware limitations further exacerbate the challenge of abnormal status identification. More importantly, monitoring abnormal health conditions of passengers, particularly in elderly care, is of paramount importance but remains underexplored. To address these challenges, we introduce IC3M, an efficient camera-rotation-based multimodal framework for monitoring both driver and passengers in a car. IC3M comprises two key modules: an adaptive threshold pseudo-labeling strategy and a missing modality reconstruction module. The former customizes pseudo-labeling thresholds for different classes based on the class distribution, generating class-balanced pseudo labels to guide model training effectively, while the latter leverages cross-modality relationships learned from limited labels to accurately recover missing modalities by transferring distributions from available modalities. Extensive experimental results demonstrate that IC3M outperforms state-of-the-art benchmarks in accuracy, precision, and recall, while exhibiting superior robustness under limited labeled data and severe missing modality.
Submitted 21 November, 2024; v1 submitted 3 October, 2024; originally announced October 2024.
Comments: 16 pages, 17 figures
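The adaptive-threshold pseudo-labeling module described above can be illustrated with a simple class-balanced rule: classes that currently receive fewer predictions get a lower confidence threshold, so minority classes still obtain pseudo-labels. The scaling rule below is an assumption for illustration, not IC3M's exact formulation:

```python
import numpy as np

def adaptive_thresholds(probs: np.ndarray, base_tau: float = 0.9) -> np.ndarray:
    """Per-class thresholds scaled by how often each class is predicted."""
    pred = probs.argmax(axis=1)
    counts = np.bincount(pred, minlength=probs.shape[1]).astype(float)
    ratio = counts / max(counts.max(), 1.0)   # 1.0 for the majority class
    return base_tau * (0.5 + 0.5 * ratio)     # in [base_tau/2, base_tau]

def pseudo_label(probs: np.ndarray, base_tau: float = 0.9):
    """Keep only samples whose confidence clears their class's threshold."""
    tau = adaptive_thresholds(probs, base_tau)
    conf, pred = probs.max(axis=1), probs.argmax(axis=1)
    keep = conf >= tau[pred]
    return pred[keep], np.flatnonzero(keep)

rng = np.random.default_rng(0)
p = rng.dirichlet(alpha=[2, 1, 1], size=1000)  # imbalanced 3-class softmax outputs
labels, idx = pseudo_label(p)
print(len(idx), np.bincount(labels, minlength=3))
```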
arXiv:2409.13167 [pdf, ps, other] (https://arxiv.org/abs/2409.13167)
Subjects: eess.SP; cs.AI
Unsupervised Attention-Based Multi-Source Domain Adaptation Framework for Drift Compensation in Electronic Nose Systems
Authors: Wenwen Zhang, Shuhao Hu, Zhengyuan Zhang, Yuanjin Zheng, Qi Jie Wang, Zhiping Lin
Abstract: Continuous, long-term monitoring of hazardous, noxious, explosive, and flammable gases in industrial environments using electronic nose (E-nose) systems faces the significant challenge of reduced gas identification accuracy due to time-varying drift in gas sensors. To address this issue, we propose a novel unsupervised attention-based multi-source domain shared-private feature fusion adaptation (AMDS-PFFA) framework for gas identification with drift compensation in E-nose systems. The AMDS-PFFA model effectively leverages labeled data from multiple source domains collected during the initial stage to accurately identify gases in unlabeled gas sensor array drift signals from the target domain. To validate the model's effectiveness, extensive experimental evaluations were conducted using both the University of California, Irvine (UCI) standard drift gas dataset, collected over 36 months, and drift signal data from our self-developed E-nose system, spanning 30 months. Compared to recent drift compensation methods, the AMDS-PFFA model achieves the highest average gas recognition accuracy with strong convergence, attaining 83.20% on the UCI dataset and 93.96% on data from our self-developed E-nose system across all target domain batches. These results demonstrate the superior performance of the AMDS-PFFA model in gas identification with drift compensation, significantly outperforming existing methods.
Submitted 19 September, 2024; originally announced September 2024.

arXiv:2409.08797 [pdf, other] (https://arxiv.org/abs/2409.08797)
Subjects: cs.CL; cs.SD; eess.AS
Exploring SSL Discrete Speech Features for Zipformer-based Contextual ASR
Authors: Mingyu Cui, Yifan Yang, Jiajun Deng, Jiawen Kang, Shujie Hu, Tianzi Wang, Zhaoqing Li, Shiliang Zhang, Xie Chen, Xunying Liu
Abstract: Self-supervised learning (SSL) based discrete speech representations are highly compact and domain adaptable. In this paper, SSL discrete speech features extracted from WavLM models are used as additional cross-utterance acoustic context features in Zipformer-Transducer ASR systems. The efficacy of replacing Fbank features with discrete token features, for modelling either cross-utterance contexts (from preceding and future segments), or the current utterance's internal context alone, or both at the same time, is demonstrated thoroughly on the GigaSpeech 1000-hr corpus. The best Zipformer-Transducer system using discrete-token-based cross-utterance context features outperforms the baseline using utterance-internal context only, with statistically significant word error rate (WER) reductions of 0.32% to 0.41% absolute (2.78% to 3.54% relative) on the dev and test data. The lowest published WERs of 11.15% and 11.14% were obtained on the dev and test sets. Our work is open-source and publicly available at https://github.com/open-creator/icefall/tree/master/egs/gigaspeech/Context_ASR.
Submitted 13 September, 2024; originally announced September 2024.
Comments: Submitted to ICASSP 2025
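A minimal sketch of using discrete speech tokens as cross-utterance context, as described above: token IDs from the preceding utterance are embedded and prepended to the current utterance's frame features before the encoder. Vocabulary size and dimensions are illustrative assumptions, not the paper's Zipformer-Transducer configuration:

```python
import torch
import torch.nn as nn

class CrossUtteranceContext(nn.Module):
    """Sketch: embed the previous utterance's discrete token IDs and prepend
    them to the current utterance's frame features as acoustic context."""
    def __init__(self, vocab_size: int = 1024, dim: int = 256):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, dim)

    def forward(self, prev_tokens: torch.Tensor, cur_feats: torch.Tensor) -> torch.Tensor:
        # prev_tokens: (batch, n_tokens) int64; cur_feats: (batch, frames, dim)
        ctx = self.token_emb(prev_tokens)           # (batch, n_tokens, dim)
        return torch.cat([ctx, cur_feats], dim=1)   # context-augmented input

m = CrossUtteranceContext()
out = m(torch.randint(0, 1024, (2, 50)), torch.randn(2, 200, 256))
print(out.shape)  # torch.Size([2, 250, 256])
```

Because the context is a short sequence of compact token embeddings rather than raw frames, it adds relatively little encoder cost.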
arXiv:2409.08596 [pdf, other] (https://arxiv.org/abs/2409.08596)
Subjects: cs.CL; cs.AI; cs.SD; eess.AS
Large Language Model Can Transcribe Speech in Multi-Talker Scenarios with Versatile Instructions
Authors: Lingwei Meng, Shujie Hu, Jiawen Kang, Zhaoqing Li, Yuejiao Wang, Wenxuan Wu, Xixin Wu, Xunying Liu, Helen Meng
Abstract: Recent advancements in large language models (LLMs) have revolutionized various domains, bringing significant progress and new opportunities. Despite progress in speech-related tasks, LLMs have not been sufficiently explored in multi-talker scenarios. In this work, we present a pioneering effort to investigate the capability of LLMs in transcribing speech in multi-talker environments, following versatile instructions related to multi-talker automatic speech recognition (ASR), target-talker ASR, and ASR based on specific talker attributes such as sex, occurrence order, language, and keyword spoken. Our approach utilizes WavLM and Whisper encoders to extract multi-faceted speech representations that are sensitive to speaker characteristics and semantic context. These representations are then fed into an LLM fine-tuned using LoRA, enabling the capabilities for speech comprehension and transcription. Comprehensive experiments reveal the promising performance of our proposed system, MT-LLM, in cocktail party scenarios, highlighting the potential of LLMs to handle speech-related tasks based on user instructions in such complex settings.
Submitted 13 September, 2024; originally announced September 2024.
arXiv:2408.08881 [pdf, other] (https://arxiv.org/abs/2408.08881)
Subjects: eess.IV; cs.AI; cs.CV
Challenge Summary U-MedSAM: Uncertainty-aware MedSAM for Medical Image Segmentation
Authors: Xin Wang, Xiaoyu Liu, Peng Huang, Pu Huang, Shu Hu, Hongtu Zhu
Abstract: Medical Image Foundation Models have proven to be powerful tools for mask prediction across various datasets. However, accurately assessing the uncertainty of their predictions remains a significant challenge. To address this, we propose a new model, U-MedSAM, which integrates the MedSAM model with an uncertainty-aware loss function and the Sharpness-Aware Minimization (SharpMin) optimizer. The uncertainty-aware loss function automatically combines region-based, distribution-based, and pixel-based loss designs to enhance segmentation accuracy and robustness. SharpMin improves generalization by finding flat minima in the loss landscape, thereby reducing overfitting. Our method was evaluated in the CVPR24 MedSAM on Laptop challenge, where U-MedSAM demonstrated promising performance.
Submitted 16 January, 2025; v1 submitted 3 August, 2024; originally announced August 2024.
Comments: arXiv admin note: text overlap with arXiv:2405.17496
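One plausible reading of the uncertainty-aware combination described above is a learned weighting of region-, distribution-, and pixel-based terms. The sketch below uses Kendall-style homoscedastic-uncertainty weighting as an assumption; the paper's exact combination rule may differ:

```python
import torch
import torch.nn.functional as F

def dice_loss(probs: torch.Tensor, target: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    """Region-based term: soft Dice over the foreground."""
    inter = (probs * target).sum(dim=(1, 2, 3))
    denom = probs.sum(dim=(1, 2, 3)) + target.sum(dim=(1, 2, 3))
    return (1.0 - (2.0 * inter + eps) / (denom + eps)).mean()

def combined_seg_loss(logits, target, log_vars):
    """Uncertainty-weighted sum of region/distribution/pixel terms:
    sum_i loss_i * exp(-s_i) + s_i, with s_i learned log-variances."""
    probs = torch.sigmoid(logits)
    terms = torch.stack([
        dice_loss(probs, target),                            # region-based
        F.binary_cross_entropy_with_logits(logits, target),  # distribution-based
        F.l1_loss(probs, target),                            # pixel-based
    ])
    return (terms * torch.exp(-log_vars) + log_vars).sum()

logits = torch.randn(2, 1, 64, 64, requires_grad=True)
target = (torch.rand(2, 1, 64, 64) > 0.5).float()
log_vars = torch.zeros(3, requires_grad=True)  # optimized jointly with the model
print(combined_seg_loss(logits, target, log_vars).item())
```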
arXiv:2408.04300 [pdf, other] (https://arxiv.org/abs/2408.04300)
Subjects: eess.IV; cs.CV
An Explainable Non-local Network for COVID-19 Diagnosis
Authors: Jingfu Yang, Peng Huang, Jing Hu, Shu Hu, Siwei Lyu, Xin Wang, Jun Guo, Xi Wu
Abstract: CNNs have achieved excellent results in the automatic classification of medical images. In this study, we propose a novel deep residual 3D attention non-local network (NL-RAN) to classify CT images as COVID-19, common pneumonia, or normal, performing rapid and explainable COVID-19 diagnosis. The network is trained end-to-end: a non-local module is embedded to capture global information, while a 3D attention module focuses on the details of the lesion, so the network can directly analyze 3D lung CT and output classification results. The output of the attention module can be used as a heat map to increase the interpretability of the model. 4,079 3D CT scans were included in this study, each with a unique label (novel coronavirus pneumonia, common pneumonia, or normal). The cohort was randomly split into a training set of 3,263 scans, a validation set of 408 scans, and a testing set of 408 scans. We compared NL-RAN with existing mainstream classification methods such as CovNet, CBAM, and ResNet, and compared its visualizations with methods such as CAM. Model performance was evaluated using the area under the ROC curve (AUC), precision, and F1-score. NL-RAN achieved an AUC of 0.9903, a precision of 0.9473, and an F1-score of 0.9462, surpassing all compared classification methods, and the heat map output by the attention module is clearer than that of CAM. Our experimental results indicate that the proposed method performs significantly better than existing methods. In addition, the first attention module outputs a heat map containing detailed outline information, increasing the interpretability of the model. Inference is fast, so the model can provide real-time assistance with diagnosis.
Submitted 8 August, 2024; originally announced August 2024.
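The non-local module mentioned above captures global pairwise interactions. A minimal embedded-Gaussian non-local block (after Wang et al., 2018) is sketched below in 2D for brevity, whereas NL-RAN embeds a 3D variant in a residual CNN; channel sizes are illustrative:

```python
import torch
import torch.nn as nn

class NonLocalBlock(nn.Module):
    """Embedded-Gaussian non-local block: every position attends to every
    other position, then the result is added back residually."""
    def __init__(self, channels: int = 64):
        super().__init__()
        inter = channels // 2
        self.theta = nn.Conv2d(channels, inter, 1)  # query projection
        self.phi = nn.Conv2d(channels, inter, 1)    # key projection
        self.g = nn.Conv2d(channels, inter, 1)      # value projection
        self.out = nn.Conv2d(inter, channels, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        b, c, h, w = x.shape
        q = self.theta(x).flatten(2).transpose(1, 2)  # (b, hw, c')
        k = self.phi(x).flatten(2)                    # (b, c', hw)
        v = self.g(x).flatten(2).transpose(1, 2)      # (b, hw, c')
        attn = torch.softmax(q @ k, dim=-1)           # global pairwise weights
        y = (attn @ v).transpose(1, 2).reshape(b, -1, h, w)
        return x + self.out(y)                        # residual connection

print(NonLocalBlock()(torch.randn(1, 64, 16, 16)).shape)
```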
arXiv:2408.02943 [pdf, other] (https://arxiv.org/abs/2408.02943)
Subjects: eess.SP
Recent Advances in Data-driven Intelligent Control for Wireless Communication: A Comprehensive Survey
Authors: Wei Huo, Huiwen Yang, Nachuan Yang, Zhaohua Yang, Jiuzhou Zhang, Fuhai Nan, Xingzhou Chen, Yifan Mao, Suyang Hu, Pengyu Wang, Xuanyu Zheng, Mingming Zhao, Ling Shi
Abstract: The advent of next-generation wireless communication systems heralds an era characterized by high data rates, low latency, massive connectivity, and superior energy efficiency. These systems necessitate innovative and adaptive strategies for resource allocation and device behavior control in wireless networks. Traditional optimization-based methods have been found inadequate in meeting the complex demands of these emerging systems. As the volume of data continues to escalate, the integration of data-driven methods has become indispensable for enabling adaptive and intelligent control mechanisms in future wireless communication systems. This comprehensive survey explores recent advancements in data-driven methodologies applied to wireless communication networks. It focuses on developments over the past five years and their application to various control objectives within wireless cyber-physical systems, encompassing critical areas such as link adaptation, user scheduling, spectrum allocation, beam management, power control, and the co-design of communication and control systems. We provide an in-depth exploration of the technical underpinnings that support these data-driven approaches, including the algorithms, models, and frameworks developed to enhance network performance and efficiency. We also examine the challenges that current data-driven algorithms face, particularly in the context of the dynamic and heterogeneous nature of next-generation wireless networks. The paper provides a critical analysis of these challenges and offers insights into potential solutions and future research directions, including the adaptability, integration with 6G, and security of data-driven methods in the face of increasing network complexity and data volume.
Submitted 6 August, 2024; originally announced August 2024.

arXiv:2407.13782 [pdf, other] (https://arxiv.org/abs/2407.13782)
Subjects: eess.AS; cs.AI; cs.SD
Self-supervised ASR Models and Features For Dysarthric and Elderly Speech Recognition
Authors: Shujie Hu, Xurong Xie, Mengzhe Geng, Zengrui Jin, Jiajun Deng, Guinan Li, Yi Wang, Mingyu Cui, Tianzi Wang, Helen Meng, Xunying Liu
Abstract: Self-supervised learning (SSL) based speech foundation models have been applied to a wide range of ASR tasks. However, their application to dysarthric and elderly speech via data-intensive parameter fine-tuning is confronted by in-domain data scarcity and mismatch. To this end, this paper explores a series of approaches to integrate domain fine-tuned SSL pre-trained models and their features into TDNN and Conformer ASR systems for dysarthric and elderly speech recognition. These include: a) input feature fusion between standard acoustic frontends and domain fine-tuned SSL speech representations; b) frame-level joint decoding between TDNN systems separately trained using standard acoustic features alone and those with additional domain fine-tuned SSL features; and c) multi-pass decoding in which TDNN/Conformer system outputs are rescored using domain fine-tuned pre-trained ASR models. In addition, fine-tuned SSL speech features are used in acoustic-to-articulatory (A2A) inversion to construct multi-modal ASR systems. Experiments are conducted on four tasks: the English UASpeech and TORGO dysarthric speech corpora, and the English DementiaBank Pitt and Cantonese JCCOCC MoCA elderly speech datasets. The TDNN systems constructed by integrating domain-adapted HuBERT, wav2vec2-conformer or multi-lingual XLSR models and their features consistently outperform the standalone fine-tuned SSL pre-trained models. These systems produced statistically significant WER or CER reductions of 6.53%, 1.90%, 2.04% and 7.97% absolute (24.10%, 23.84%, 10.14% and 31.39% relative) on the four tasks respectively. Consistent improvements in Alzheimer's disease detection accuracy are also obtained using the DementiaBank Pitt elderly speech recognition outputs.
Submitted 3 July, 2024; originally announced July 2024.
Comments: IEEE/ACM Transactions on Audio, Speech, and Language Processing
arXiv:2407.08551 [pdf, other] (https://arxiv.org/abs/2407.08551)
Subjects: cs.CL; cs.SD; eess.AS
Autoregressive Speech Synthesis without Vector Quantization
Authors: Lingwei Meng, Long Zhou, Shujie Liu, Sanyuan Chen, Bing Han, Shujie Hu, Yanqing Liu, Jinyu Li, Sheng Zhao, Xixin Wu, Helen Meng, Furu Wei
Abstract: We present MELLE, a novel continuous-valued token language modeling approach for text-to-speech synthesis (TTS). MELLE autoregressively generates continuous mel-spectrogram frames directly from the text condition, bypassing the need for vector quantization, which was originally designed for audio compression and sacrifices fidelity compared to mel-spectrograms. Specifically, (i) instead of cross-entropy loss, we apply regression loss with a proposed spectrogram flux loss function to model the probability distribution of the continuous-valued tokens; and (ii) we incorporate variational inference into MELLE to facilitate sampling mechanisms, thereby enhancing output diversity and model robustness. Experiments demonstrate that, compared to the two-stage codec language model VALL-E and its variants, the single-stage MELLE mitigates robustness issues by avoiding the inherent flaws of sampling discrete codes, achieves superior performance across multiple metrics, and, most importantly, offers a more streamlined paradigm. See https://aka.ms/melle for demos of our work.
Submitted 11 July, 2024; originally announced July 2024.
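A minimal sketch of regression training for continuous mel frames, in the spirit of the loss described above: an L1+L2 reconstruction term plus a penalty on frame-to-frame differences ("flux"). Whether this matches MELLE's spectrogram flux loss exactly is an assumption; it only illustrates supervising spectral dynamics alongside frame values:

```python
import torch
import torch.nn.functional as F

def mel_regression_loss(pred: torch.Tensor, target: torch.Tensor,
                        lam: float = 1.0) -> torch.Tensor:
    """Regress predicted mel frames onto targets and additionally match
    their frame-to-frame differences (spectral 'flux')."""
    rec = F.l1_loss(pred, target) + F.mse_loss(pred, target)
    flux_pred = pred[:, 1:] - pred[:, :-1]      # (batch, frames-1, mels)
    flux_tgt = target[:, 1:] - target[:, :-1]
    return rec + lam * F.l1_loss(flux_pred, flux_tgt)

pred, tgt = torch.randn(2, 120, 80), torch.randn(2, 120, 80)
print(mel_regression_loss(pred, tgt).item())
```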
arXiv:2407.06310 [pdf, other] (https://arxiv.org/abs/2407.06310)
Subjects: cs.SD; cs.AI; cs.HC; cs.LG; eess.AS
Homogeneous Speaker Features for On-the-Fly Dysarthric and Elderly Speaker Adaptation
Authors: Mengzhe Geng, Xurong Xie, Jiajun Deng, Zengrui Jin, Guinan Li, Tianzi Wang, Shujie Hu, Zhaoqing Li, Helen Meng, Xunying Liu
Abstract: The application of data-intensive automatic speech recognition (ASR) technologies to dysarthric and elderly adult speech is confronted by their mismatch against healthy and nonaged voices, data scarcity, and large speaker-level variability. To this end, this paper proposes two novel data-efficient methods to learn homogeneous dysarthric and elderly speaker-level features for rapid, on-the-fly test-time adaptation of DNN/TDNN and Conformer ASR models. These include: 1) speaker-level variance-regularized spectral basis embedding (VR-SBE) features that exploit a special regularization term to enforce homogeneity of speaker features in adaptation; and 2) feature-based learning hidden unit contributions (f-LHUC) transforms that are conditioned on VR-SBE features. Experiments are conducted on four tasks across two languages: the English UASpeech and TORGO dysarthric speech datasets, and the English DementiaBank Pitt and Cantonese JCCOCC MoCA elderly speech corpora. The proposed on-the-fly speaker adaptation techniques consistently outperform baseline iVector and xVector adaptation by statistically significant word or character error rate reductions of up to 5.32% absolute (18.57% relative), and batch-mode LHUC speaker adaptation by 2.24% absolute (9.20% relative), while operating with real-time factors speeding up to 33.6 times against xVectors during adaptation. The efficacy of the proposed adaptation techniques is demonstrated in a comparison against current ASR technologies, including SSL pre-trained systems, on UASpeech, where our best system produces a state-of-the-art WER of 23.33%. Analyses show VR-SBE features and f-LHUC transforms are insensitive to speaker-level data quantity in test-time adaptation. T-SNE visualization reveals they have stronger speaker-level homogeneity than baseline iVectors, xVectors and batch-mode LHUC transforms.
Submitted 8 July, 2024; originally announced July 2024.
Comments: In submission to IEEE/ACM Transactions on Audio, Speech, and Language Processing
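The variance-regularization idea above (enforcing homogeneity of speaker-level features) can be sketched as a penalty on the within-speaker variance of embeddings, added to the training loss; the actual VR-SBE regularizer may be formulated differently:

```python
import torch

def speaker_variance_penalty(emb: torch.Tensor, spk: torch.Tensor) -> torch.Tensor:
    """Mean within-speaker variance of utterance-level embeddings; lower
    values mean more homogeneous speaker features."""
    penalty = emb.new_zeros(())
    speakers = spk.unique()
    for s in speakers:
        e = emb[spk == s]
        if e.shape[0] > 1:
            penalty = penalty + e.var(dim=0, unbiased=False).mean()
    return penalty / speakers.numel()

emb = torch.randn(16, 128)        # utterance-level speaker embeddings
spk = torch.randint(0, 4, (16,))  # speaker IDs
# total_loss = asr_loss + lam * speaker_variance_penalty(emb, spk)
print(speaker_variance_penalty(emb, spk).item())
```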
arXiv:2406.17338 [pdf, other] (https://arxiv.org/abs/2406.17338)
Subjects: eess.IV; cs.CV; cs.LG
Robustly Optimized Deep Feature Decoupling Network for Fatty Liver Diseases Detection
Authors: Peng Huang, Shu Hu, Bo Peng, Jiashu Zhang, Xi Wu, Xin Wang
Abstract: Current medical image classification efforts mainly aim for higher average performance, often neglecting the balance between different classes. This can lead to significant differences in recognition accuracy between classes and obvious recognition weaknesses. Without the support of massive data, deep learning faces challenges in the fine-grained classification of fatty liver. In this paper, we propose an innovative deep learning framework that combines feature decoupling and adaptive adversarial training. First, we employ two iteratively compressed decouplers to decouple, in a supervised manner, common features and specific features related to fatty liver in abdominal ultrasound images. The decoupled features are then concatenated with the original image after a color-space transformation and fed into the classifier. During adversarial training, we adaptively adjust the perturbation and balance the adversarial strength according to the accuracy of each class. The model eliminates recognition weaknesses by correctly classifying adversarial samples, thus improving recognition robustness. Finally, the accuracy of our method improved by 4.16%, reaching 82.95%. As demonstrated by extensive experiments, our method is a generalized learning framework that can be directly used to eliminate the recognition weaknesses of any classifier while improving its average performance. Code is available at https://github.com/HP-ML/MICCAI2024.
Submitted 25 June, 2024; originally announced June 2024.
Comments: MICCAI 2024
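A minimal sketch of the class-adaptive adversarial training described above: per-sample FGSM perturbation budgets grow as a class's running accuracy drops, so weaker classes receive stronger training pressure. The scaling schedule is an illustrative assumption, not the paper's exact rule:

```python
import torch
import torch.nn.functional as F

def class_adaptive_fgsm(model, x, y, class_acc, eps_max: float = 0.03):
    """FGSM attack whose per-sample epsilon scales with (1 - class accuracy),
    so currently weak classes get stronger perturbations."""
    x = x.clone().detach().requires_grad_(True)
    loss = F.cross_entropy(model(x), y)
    grad, = torch.autograd.grad(loss, x)
    eps = eps_max * (1.0 - class_acc[y]).view(-1, 1, 1, 1)  # per-sample budget
    return (x + eps * grad.sign()).detach()

model = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(3 * 32 * 32, 5))
x, y = torch.randn(8, 3, 32, 32), torch.randint(0, 5, (8,))
class_acc = torch.tensor([0.9, 0.6, 0.8, 0.5, 0.7])  # running per-class accuracy
x_adv = class_adaptive_fgsm(model, x, y, class_acc)
print((x_adv - x).abs().max().item())  # bounded by eps_max
```

Training on these samples, as the entry describes, pushes the classifier to fix its weakest classes rather than only its average error.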
However, these schemes are either susceptible to adaptive attacks, bui… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.15093v2-abstract-full').style.display = 'inline'; document.getElementById('2406.15093v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.15093v2-abstract-full" style="display: none;"> Clean-label indiscriminate poisoning attacks add invisible perturbations to correctly labeled training images, thus dramatically reducing the generalization capability of the victim models. Recently, some defense mechanisms have been proposed such as adversarial training, image transformation techniques, and image purification. However, these schemes are either susceptible to adaptive attacks, built on unrealistic assumptions, or only effective against specific poison types, limiting their universal applicability. In this research, we propose a more universally effective, practical, and robust defense scheme called ECLIPSE. We first investigate the impact of Gaussian noise on the poisons and theoretically prove that any kind of poison will be largely assimilated when imposing sufficient random noise. In light of this, we assume the victim has access to an extremely limited number of clean images (a more practical scene) and subsequently enlarge this sparse set for training a denoising probabilistic model (a universal denoising tool). We then begin by introducing Gaussian noise to absorb the poisons and then apply the model for denoising, resulting in a roughly purified dataset. Finally, to address the trade-off of the inconsistency in the assimilation sensitivity of different poisons by Gaussian noise, we propose a lightweight corruption compensation module to effectively eliminate residual poisons, providing a more universal defense approach. Extensive experiments demonstrate that our defense approach outperforms 10 state-of-the-art defenses. We also propose an adaptive attack against ECLIPSE and verify the robustness of our defense scheme. Our code is available at https://github.com/CGCL-codes/ECLIPSE. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.15093v2-abstract-full').style.display = 'none'; document.getElementById('2406.15093v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ESORICS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.10160">arXiv:2406.10160</a> <span> [<a href="https://arxiv.org/pdf/2406.10160">pdf</a>, <a href="https://arxiv.org/format/2406.10160">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> One-pass Multiple Conformer and Foundation Speech Systems Compression and Quantization Using An All-in-one Neural Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+Z">Zhaoqing Li</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+H">Haoning Xu</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+T">Tianzi Wang</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+S">Shoukang Hu</a>, <a href="/search/eess?searchtype=author&query=Jin%2C+Z">Zengrui Jin</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+S">Shujie Hu</a>, <a href="/search/eess?searchtype=author&query=Deng%2C+J">Jiajun Deng</a>, <a href="/search/eess?searchtype=author&query=Cui%2C+M">Mingyu Cui</a>, <a href="/search/eess?searchtype=author&query=Geng%2C+M">Mengzhe Geng</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+X">Xunying Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.10160v1-abstract-short" style="display: inline;"> We propose a novel one-pass multiple ASR systems joint compression and quantization approach using an all-in-one neural model. A single compression cycle allows multiple nested systems with varying Encoder depths, widths, and quantization precision settings to be simultaneously constructed without the need to train and store individual target systems separately. Experiments consistently demonstrat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.10160v1-abstract-full').style.display = 'inline'; document.getElementById('2406.10160v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.10160v1-abstract-full" style="display: none;"> We propose a novel one-pass multiple ASR systems joint compression and quantization approach using an all-in-one neural model. A single compression cycle allows multiple nested systems with varying Encoder depths, widths, and quantization precision settings to be simultaneously constructed without the need to train and store individual target systems separately. Experiments consistently demonstrate the multiple ASR systems compressed in a single all-in-one model produced a word error rate (WER) comparable to, or lower by up to 1.01\% absolute (6.98\% relative) than individually trained systems of equal complexity. A 3.4x overall system compression and training time speed-up was achieved. 
arXiv:2406.10152 [pdf, other] cs.SD eess.AS
Joint Speaker Features Learning for Audio-visual Multichannel Speech Separation and Recognition
Authors: Guinan Li, Jiajun Deng, Youjun Chen, Mengzhe Geng, Shujie Hu, Zhe Li, Zengrui Jin, Tianzi Wang, Xurong Xie, Helen Meng, Xunying Liu
Abstract: This paper proposes joint speaker feature learning methods for zero-shot adaptation of audio-visual multichannel speech separation and recognition systems. xVector and ECAPA-TDNN speaker encoders are connected using purpose-built fusion blocks and tightly integrated with the complete system training. Experiments conducted on multichannel overlapped speech simulated from LRS3-TED data suggest that joint speaker feature learning consistently improves speech separation and recognition performance over baselines without joint speaker feature estimation. Further analyses reveal that the performance improvements are strongly correlated with increased inter-speaker discrimination, measured using cosine similarity. The best-performing adapted system outperformed the baseline fine-tuned WavLM model by statistically significant WER reductions of 21.6% and 25.3% absolute (67.5% and 83.5% relative) on the Dev and Test sets after incorporating WavLM features and the video modality.
Submitted 14 June, 2024; originally announced June 2024.
Comments: Accepted by Interspeech 2024
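One plausible shape for a purpose-built fusion block joining the two speaker encoders; the dimensions and the gated-concatenation design are assumptions for illustration, since the abstract does not spell the blocks out.

    import torch
    import torch.nn as nn

    class SpeakerFusionBlock(nn.Module):
        """Fuse an xVector embedding with an ECAPA-TDNN embedding via a learned
        gate, producing one joint speaker feature for the separator/ASR stack."""
        def __init__(self, xvec_dim=512, ecapa_dim=192, out_dim=256):
            super().__init__()
            self.proj = nn.Linear(xvec_dim + ecapa_dim, out_dim)
            self.gate = nn.Sequential(nn.Linear(xvec_dim + ecapa_dim, out_dim),
                                      nn.Sigmoid())

        def forward(self, xvec, ecapa):
            h = torch.cat([xvec, ecapa], dim=-1)
            return self.gate(h) * torch.tanh(self.proj(h))  # gated joint embedding

    fusion = SpeakerFusionBlock()
    joint = fusion(torch.randn(4, 512), torch.randn(4, 192))  # -> (4, 256)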
arXiv:2406.10034 [pdf, other] cs.SD cs.AI eess.AS
Towards Effective and Efficient Non-autoregressive Decoding Using Block-based Attention Mask
Authors: Tianzi Wang, Xurong Xie, Zhaoqing Li, Shoukang Hu, Zengrui Jin, Jiajun Deng, Mingyu Cui, Shujie Hu, Mengzhe Geng, Guinan Li, Helen Meng, Xunying Liu
Abstract: This paper proposes a novel non-autoregressive (NAR) block-based Attention Mask Decoder (AMD) that flexibly balances performance-efficiency trade-offs for Conformer ASR systems. AMD performs parallel NAR inference within contiguous blocks of output labels that are concealed using attention masks, while conducting left-to-right AR prediction and history context amalgamation between blocks. A beam search algorithm is designed to leverage a dynamic fusion of CTC, AR decoder, and AMD probabilities. Experiments on the LibriSpeech-100hr corpus suggest that the tripartite decoder incorporating the AMD module produces a maximum decoding speed-up ratio of 1.73x over the baseline CTC+AR decoding, while incurring no statistically significant word error rate (WER) increase on the test sets. When operating with the same decoding real-time factors, statistically significant WER reductions of up to 0.7% and 0.3% absolute (5.3% and 6.1% relative) were obtained over the CTC+AR baseline.
Submitted 30 August, 2024; v1 submitted 14 June, 2024; originally announced June 2024.
Comments: 5 pages, 2 figures, 2 tables, Interspeech24 conference
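A sketch of the block-wise attention-mask idea: labels inside the current block are predicted in parallel and cannot see each other, while all earlier blocks remain visible, giving AR behavior between blocks. The exact masking scheme in the paper may differ.

    import torch

    def amd_mask(seq_len: int, block: int) -> torch.Tensor:
        """Boolean mask (True = may attend). A position sees every label in
        earlier, already-decoded blocks (AR between blocks) but none of the
        concealed labels inside its own block (parallel NAR within a block)."""
        idx = torch.arange(seq_len)
        block_id = idx // block
        # rows are queries, columns are keys: attend iff the key's block
        # strictly precedes the query's block, plus self-attention
        mask = block_id.unsqueeze(1) > block_id.unsqueeze(0)
        mask |= torch.eye(seq_len, dtype=torch.bool)
        return mask

    print(amd_mask(6, block=3).int())  # two blocks of three parallel labels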
arXiv:2405.19527 [pdf] eess.SY
Flexible Agent-based Modeling Framework to Evaluate Integrated Microtransit and Fixed-route Transit Designs: Mode Choice, Supernetworks, and Fleet Simulation
Authors: Siwei Hu, Michael F. Hyland, Ritun Saha, Jacob J. Berkel, Geoffrey Vander Veen
Abstract: The integration of traditional fixed-route transit (FRT) and more flexible microtransit has been touted as a means of improving mobility and access to opportunity, increasing transit ridership, and promoting environmental sustainability. To help evaluate integrated FRT and microtransit public transit (PT) system (henceforth "integrated fixed-flex PT system") designs, we propose a high-fidelity modeling framework that provides reliable estimates for a wide range of (i) performance metrics and (ii) integrated fixed-flex PT system designs. We formulate the mode choice equilibrium problem as a fixed-point problem wherein microtransit demand is a function of microtransit performance, and microtransit performance depends on microtransit demand. We propose a detailed agent-based simulation modeling framework that includes (i) a binary logit mode choice model (private auto vs. transit), (ii) a supernetwork-based model and pathfinding algorithm for multi-modal transit path choice, where the supernetwork includes pedestrian, FRT, and microtransit layers, and (iii) a detailed mobility-on-demand fleet simulator called FleetPy to model the supply-demand dynamics of the microtransit service. In this paper, we illustrate the capabilities of the modeling framework by analyzing integrated fixed-flex PT system designs that vary the following design parameters: FRT frequencies and microtransit fleet size, service region structure, virtual stop coverage, and operating hours. We include case studies in downtown San Diego and Lemon Grove, California. The computational results show that the proposed modeling framework converges to a mode choice equilibrium. Moreover, the scenario results imply that introducing a new microtransit service decreases FRT ridership and requires additional subsidies, but it significantly increases job accessibility and slightly reduces total VMT.
Submitted 29 May, 2024; originally announced May 2024.
Comments: 49 pages, 25 figures, 8 tables; submitted to Transportation Research Part C: Emerging Technologies on May 1st, 2024
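The mode-choice fixed point can be illustrated with a damped iteration between a binary logit split and a congestion-style performance function; both functions below are hypothetical stand-ins for the full agent-based simulator.

    import math

    def logit_share(wait_min: float, asc: float = 1.0, beta: float = 0.15) -> float:
        """Binary logit: probability of choosing transit given microtransit wait."""
        u_transit = asc - beta * wait_min
        return 1.0 / (1.0 + math.exp(-u_transit))

    def wait_time(demand: float, fleet: int = 20) -> float:
        """Toy performance model: waits grow as demand saturates the fleet."""
        return 5.0 + 30.0 * demand / (demand + fleet)

    demand, trips = 100.0, 1000.0
    for it in range(50):                     # damped fixed-point iteration
        target = trips * logit_share(wait_time(demand))
        new_demand = 0.5 * demand + 0.5 * target
        if abs(new_demand - demand) < 1e-6:  # demand consistent with performance
            break
        demand = new_demand
    print(f"equilibrium microtransit demand = {demand:.1f} trips after {it + 1} iterations")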
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">49 pages, 25 figures, 8 tables; Submitted To: Transportation Research Part C: Emerging Technologies on May 1st, 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.17496">arXiv:2405.17496</a> <span> [<a href="https://arxiv.org/pdf/2405.17496">pdf</a>, <a href="https://arxiv.org/format/2405.17496">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> UU-Mamba: Uncertainty-aware U-Mamba for Cardiac Image Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Tsai%2C+T+Y">Ting Yu Tsai</a>, <a href="/search/eess?searchtype=author&query=Lin%2C+L">Li Lin</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+S">Shu Hu</a>, <a href="/search/eess?searchtype=author&query=Chang%2C+M">Ming-Ching Chang</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+H">Hongtu Zhu</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+X">Xin Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.17496v4-abstract-short" style="display: inline;"> Biomedical image segmentation is critical for accurate identification and analysis of anatomical structures in medical imaging, particularly in cardiac MRI. Manual segmentation is labor-intensive, time-consuming, and prone to errors, highlighting the need for automated methods. However, current machine learning approaches face challenges like overfitting and data demands. To tackle these issues, w… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.17496v4-abstract-full').style.display = 'inline'; document.getElementById('2405.17496v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.17496v4-abstract-full" style="display: none;"> Biomedical image segmentation is critical for accurate identification and analysis of anatomical structures in medical imaging, particularly in cardiac MRI. Manual segmentation is labor-intensive, time-consuming, and prone to errors, highlighting the need for automated methods. However, current machine learning approaches face challenges like overfitting and data demands. To tackle these issues, we propose a new UU-Mamba model, integrating the U-Mamba model with the Sharpness-Aware Minimization (SAM) optimizer and an uncertainty-aware loss function. SAM enhances generalization by locating flat minima in the loss landscape, thus reducing overfitting. The uncertainty-aware loss combines region-based, distribution-based, and pixel-based loss designs to improve segmentation accuracy and robustness. Evaluation of our method is performed on the ACDC cardiac dataset, outperforming state-of-the-art models including TransUNet, Swin-Unet, nnUNet, and nnFormer. Our approach achieves Dice Similarity Coefficient (DSC) and Mean Squared Error (MSE) scores, demonstrating its effectiveness in cardiac MRI segmentation. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.17496v4-abstract-full').style.display = 'none'; document.getElementById('2405.17496v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.09443">arXiv:2405.09443</a> <span> [<a href="https://arxiv.org/pdf/2405.09443">pdf</a>, <a href="https://arxiv.org/format/2405.09443">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Low-Complexity Joint Azimuth-Range-Velocity Estimation for Integrated Sensing and Communication with OFDM Waveform </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+J">Jun Zhang</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+G">Gang Yang</a>, <a href="/search/eess?searchtype=author&query=Ye%2C+Q">Qibin Ye</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+Y">Yixuan Huang</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+S">Su Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.09443v1-abstract-short" style="display: inline;"> Integrated sensing and communication (ISAC) is a main application scenario of the sixth-generation mobile communication systems. Due to the fast-growing number of antennas and subcarriers in cellular systems, the computational complexity of joint azimuth-range-velocity estimation (JARVE) in ISAC systems is extremely high. This paper studies the JARVE problem for a monostatic ISAC system with ortho… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.09443v1-abstract-full').style.display = 'inline'; document.getElementById('2405.09443v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.09443v1-abstract-full" style="display: none;"> Integrated sensing and communication (ISAC) is a main application scenario of the sixth-generation mobile communication systems. Due to the fast-growing number of antennas and subcarriers in cellular systems, the computational complexity of joint azimuth-range-velocity estimation (JARVE) in ISAC systems is extremely high. This paper studies the JARVE problem for a monostatic ISAC system with orthogonal frequency division multiplexing (OFDM) waveform, in which a base station receives the echos of its transmitted cellular OFDM signals to sense multiple targets. The Cramer-Rao bounds are first derived for JARVE. A low-complexity algorithm is further designed for super-resolution JARVE, which utilizes the proposed iterative subspace update scheme and Levenberg-Marquardt optimization method to replace the exhaustive search of spatial spectrum in multiple-signal-classification (MUSIC) algorithm. 
arXiv:2405.09443 [pdf, other] cs.IT eess.SP
Low-Complexity Joint Azimuth-Range-Velocity Estimation for Integrated Sensing and Communication with OFDM Waveform
Authors: Jun Zhang, Gang Yang, Qibin Ye, Yixuan Huang, Su Hu
Abstract: Integrated sensing and communication (ISAC) is a main application scenario of the sixth-generation mobile communication systems. Due to the fast-growing number of antennas and subcarriers in cellular systems, the computational complexity of joint azimuth-range-velocity estimation (JARVE) in ISAC systems is extremely high. This paper studies the JARVE problem for a monostatic ISAC system with an orthogonal frequency division multiplexing (OFDM) waveform, in which a base station receives the echoes of its transmitted cellular OFDM signals to sense multiple targets. The Cramér-Rao bounds are first derived for JARVE. A low-complexity algorithm is further designed for super-resolution JARVE, which utilizes the proposed iterative subspace update scheme and the Levenberg-Marquardt optimization method to replace the exhaustive spatial-spectrum search of the multiple-signal-classification (MUSIC) algorithm. Finally, with practical 5G New Radio parameters, simulation results verify that the proposed algorithm reduces computational complexity by three and two orders of magnitude compared to the existing three-dimensional MUSIC and estimation-of-signal-parameters-using-rotational-invariance-techniques (ESPRIT) algorithms, respectively, while also improving estimation performance.
Submitted 15 May, 2024; originally announced May 2024.
Comments: 16 pages, 12 figures, submitted to IEEE journal
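The coarse-search-then-Levenberg-Marquardt pattern that replaces a dense MUSIC spectrum scan is easy to show on a toy one-parameter frequency estimate; the actual algorithm refines azimuth, range, and velocity jointly with subspace updates.

    import numpy as np
    from scipy.optimize import least_squares

    # Toy signal: one complex exponential standing in for one target dimension.
    n = np.arange(64)
    f_true = 0.2317
    y = np.exp(2j * np.pi * f_true * n) \
        + 0.05 * (np.random.randn(64) + 1j * np.random.randn(64))

    # Coarse stage: FFT peak on a small grid (replaces an exhaustive fine scan).
    spec = np.abs(np.fft.fft(y, 256))
    f0 = np.argmax(spec[:128]) / 256.0

    # Fine stage: Levenberg-Marquardt refinement around the coarse estimate.
    def residual(f):
        r = y - np.exp(2j * np.pi * f[0] * n)
        return np.concatenate([r.real, r.imag])

    f_hat = least_squares(residual, x0=[f0], method="lm").x[0]
    print(f"coarse {f0:.4f} -> refined {f_hat:.4f} (true {f_true:.4f})")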
arXiv:2404.15212 [pdf, other] cs.CV eess.IV
Real-time Lane-wise Traffic Monitoring in Optimal ROIs
Authors: Mei Qiu, Wei Lin, Lauren Ann Christopher, Stanley Chien, Yaobin Chen, Shu Hu
Abstract: In the US, thousands of Pan, Tilt, and Zoom (PTZ) traffic cameras monitor highway conditions. There is great interest in using these highway cameras to gather valuable road traffic data to support traffic analysis and decision-making for highway safety and efficient traffic management. However, there are too many cameras for a few human traffic operators to monitor effectively, so a fully automated solution is desired. This paper introduces a novel system that automatically learns the locations of highway lanes and traffic directions from these camera feeds. It collects real-time, lane-specific traffic data continuously, even adjusting for changes in camera angle or zoom. This facilitates efficient traffic analysis, decision-making, and improved highway safety.
Submitted 28 March, 2024; originally announced April 2024.
arXiv:2404.12908 [pdf, other] cs.CV cs.LG eess.IV
Robust CLIP-Based Detector for Exposing Diffusion Model-Generated Images
Authors: Santosh, Li Lin, Irene Amerini, Xin Wang, Shu Hu
Abstract: Diffusion models (DMs) have revolutionized image generation, producing high-quality images with applications spanning various fields. However, their ability to create hyper-realistic images poses significant challenges in distinguishing between real and synthetic content, raising concerns about digital authenticity and potential misuse in creating deepfakes. This work introduces a robust detection framework that integrates image and text features extracted by the CLIP model with a Multilayer Perceptron (MLP) classifier. We propose a novel loss that improves the detector's robustness and handles imbalanced datasets. Additionally, we flatten the loss landscape during model training to improve the detector's generalization capabilities. The effectiveness of our method, which outperforms traditional detection techniques, is demonstrated through extensive experiments, underscoring its potential to set a new state of the art in DM-generated image detection. The code is available at https://github.com/Purdue-M2/Robust_DM_Generated_Image_Detection.
Submitted 8 September, 2024; v1 submitted 19 April, 2024; originally announced April 2024.
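The detector's skeleton is a small MLP head over frozen CLIP features. The sketch below assumes the features are already extracted (any CLIP implementation works) and uses plain binary cross-entropy rather than the paper's robust loss; the dimensions are hypothetical.

    import torch
    import torch.nn as nn

    class DetectorHead(nn.Module):
        """MLP classifier over (frozen) CLIP image+text features."""
        def __init__(self, feat_dim=768, hidden=256):
            super().__init__()
            self.mlp = nn.Sequential(
                nn.Linear(feat_dim, hidden), nn.ReLU(),
                nn.Linear(hidden, 1))            # logit: DM-generated vs. real

        def forward(self, feats):
            return self.mlp(feats).squeeze(-1)

    head = DetectorHead()
    feats = torch.randn(16, 768)                 # stand-in CLIP features
    labels = torch.randint(0, 2, (16,)).float()  # 1 = DM-generated
    loss = nn.functional.binary_cross_entropy_with_logits(head(feats), labels)
    loss.backward()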
arXiv:2404.09729 [pdf] eess.SP cs.IT cs.LG stat.ME
Amplitude-Phase Fusion for Enhanced Electrocardiogram Morphological Analysis
Authors: Shuaicong Hu, Yanan Wang, Jian Liu, Jingyu Lin, Shengmei Qin, Zhenning Nie, Zhifeng Yao, Wenjie Cai, Cuiwei Yang
Abstract: Considering the variability of amplitude and phase patterns in electrocardiogram (ECG) signals due to cardiac activity and individual differences, existing entropy-based studies have not fully utilized these two patterns and lack their integration. To address this gap, this paper proposes a novel fusion entropy metric, morphological ECG entropy (MEE), designed specifically for ECG morphology, to comprehensively describe the fusion of amplitude and phase patterns. MEE is computed from beat-level samples, enabling detailed analysis of each cardiac cycle. Experimental results demonstrate that MEE achieves rapid, accurate, and label-free localization of abnormal ECG arrhythmia regions. Furthermore, MEE provides a method for assessing sample diversity, facilitating compression of imbalanced training sets (via representative sample selection), and outperforms random pruning. Additionally, MEE can describe areas of poor signal quality. A further discussion demonstrates the robustness of the MEE calculation to noise interference and its low computational complexity. Finally, we integrate this method into a clinical interactive interface to provide a more convenient and intuitive user experience. These findings indicate that MEE serves as a valuable clinical descriptor for ECG characterization. The implementation code can be referenced at: https://github.com/fdu-harry/ECG-MEE-metric.
Submitted 15 April, 2024; originally announced April 2024.
Comments: 16 pages, 12 figures
ACM Class: I.5.2
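The abstract does not give the MEE formula, so the sketch below is only a rough intuition for an amplitude-phase fusion entropy: extract both patterns from the analytic signal and combine per-pattern histogram entropies. The weighting and binning are invented for illustration and are not the published metric.

    import numpy as np
    from scipy.signal import hilbert

    def hist_entropy(x, bins=32):
        """Shannon entropy (nats) of a signal's histogram distribution."""
        p, _ = np.histogram(x, bins=bins)
        p = p / p.sum()
        p = p[p > 0]
        return float(-(p * np.log(p)).sum())

    def fusion_entropy(beat, w=0.5):
        """Illustrative amplitude-phase fusion: entropy of the envelope plus
        entropy of instantaneous-phase increments from the analytic signal."""
        analytic = hilbert(beat)
        amp = np.abs(analytic)                 # amplitude pattern
        phase = np.unwrap(np.angle(analytic))  # phase pattern
        return w * hist_entropy(amp) + (1 - w) * hist_entropy(np.diff(phase))

    t = np.linspace(0, 1, 360)                 # one toy "beat"
    beat = np.sin(2 * np.pi * 7 * t) * np.exp(-4 * (t - 0.5) ** 2)
    print(f"fusion entropy = {fusion_entropy(beat):.3f} nats")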
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">16 pages, 12 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.5.2 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.00656">arXiv:2404.00656</a> <span> [<a href="https://arxiv.org/pdf/2404.00656">pdf</a>, <a href="https://arxiv.org/format/2404.00656">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> WavLLM: Towards Robust and Adaptive Speech Large Language Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Hu%2C+S">Shujie Hu</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+L">Long Zhou</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+S">Shujie Liu</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+S">Sanyuan Chen</a>, <a href="/search/eess?searchtype=author&query=Meng%2C+L">Lingwei Meng</a>, <a href="/search/eess?searchtype=author&query=Hao%2C+H">Hongkun Hao</a>, <a href="/search/eess?searchtype=author&query=Pan%2C+J">Jing Pan</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+X">Xunying Liu</a>, <a href="/search/eess?searchtype=author&query=Li%2C+J">Jinyu Li</a>, <a href="/search/eess?searchtype=author&query=Sivasankaran%2C+S">Sunit Sivasankaran</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+L">Linquan Liu</a>, <a href="/search/eess?searchtype=author&query=Wei%2C+F">Furu Wei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.00656v3-abstract-short" style="display: inline;"> The recent advancements in large language models (LLMs) have revolutionized the field of natural language processing, progressively broadening their scope to multimodal perception and generation. However, effectively integrating listening capabilities into LLMs poses significant challenges, particularly with respect to generalizing across varied contexts and executing complex auditory tasks. In th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.00656v3-abstract-full').style.display = 'inline'; document.getElementById('2404.00656v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.00656v3-abstract-full" style="display: none;"> The recent advancements in large language models (LLMs) have revolutionized the field of natural language processing, progressively broadening their scope to multimodal perception and generation. However, effectively integrating listening capabilities into LLMs poses significant challenges, particularly with respect to generalizing across varied contexts and executing complex auditory tasks. 
In this work, we introduce WavLLM, a robust and adaptive speech large language model with dual encoders, and a prompt-aware LoRA weight adapter, optimized by a two-stage curriculum learning approach. Leveraging dual encoders, we decouple different types of speech information, utilizing a Whisper encoder to process the semantic content of speech, and a WavLM encoder to capture the unique characteristics of the speaker's identity. Within the curriculum learning framework, WavLLM first builds its foundational capabilities by optimizing on mixed elementary single tasks, followed by advanced multi-task training on more complex tasks such as combinations of the elementary tasks. To enhance the flexibility and adherence to different tasks and instructions, a prompt-aware LoRA weight adapter is introduced in the second advanced multi-task training stage. We validate the proposed model on universal speech benchmarks including tasks such as ASR, ST, SV, ER, and also apply it to specialized datasets like Gaokao English listening comprehension set for SQA, and speech Chain-of-Thought (CoT) evaluation set. Experiments demonstrate that the proposed model achieves state-of-the-art performance across a range of speech tasks on the same model size, exhibiting robust generalization capabilities in executing complex tasks using CoT approach. Furthermore, our model successfully completes Gaokao tasks without specialized training. The codes, models, audio, and Gaokao evaluation set can be accessed at \url{aka.ms/wavllm}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.00656v3-abstract-full').style.display = 'none'; document.getElementById('2404.00656v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. 
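One way to read "prompt-aware LoRA weight adapter" mechanically: a low-rank update whose gate is predicted from a prompt embedding, so the same backbone adapts per instruction. The dimensions and gating form below are assumptions for illustration, not WavLLM's released code.

    import torch
    import torch.nn as nn

    class PromptAwareLoRA(nn.Module):
        """Frozen base linear layer plus a low-rank update gated by the prompt."""
        def __init__(self, dim=1024, rank=8, prompt_dim=256):
            super().__init__()
            self.base = nn.Linear(dim, dim)
            self.base.weight.requires_grad_(False)  # backbone stays frozen
            self.A = nn.Linear(dim, rank, bias=False)
            self.B = nn.Linear(rank, dim, bias=False)
            self.gate = nn.Linear(prompt_dim, 1)    # prompt-conditioned scale

        def forward(self, x, prompt_emb):
            scale = torch.sigmoid(self.gate(prompt_emb))     # (batch, 1)
            return self.base(x) + scale * self.B(self.A(x))  # gated LoRA path

    layer = PromptAwareLoRA()
    out = layer(torch.randn(2, 1024), torch.randn(2, 256))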
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted by EMNLP2024 findings</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.10067">arXiv:2403.10067</a> <span> [<a href="https://arxiv.org/pdf/2403.10067">pdf</a>, <a href="https://arxiv.org/format/2403.10067">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Hybrid Convolutional and Attention Network for Hyperspectral Image Denoising </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Hu%2C+S">Shuai Hu</a>, <a href="/search/eess?searchtype=author&query=Gao%2C+F">Feng Gao</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+X">Xiaowei Zhou</a>, <a href="/search/eess?searchtype=author&query=Dong%2C+J">Junyu Dong</a>, <a href="/search/eess?searchtype=author&query=Du%2C+Q">Qian Du</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.10067v1-abstract-short" style="display: inline;"> Hyperspectral image (HSI) denoising is critical for the effective analysis and interpretation of hyperspectral data. However, simultaneously modeling global and local features is rarely explored to enhance HSI denoising. In this letter, we propose a hybrid convolution and attention network (HCANet), which leverages both the strengths of convolution neural networks (CNNs) and Transformers. To enhan… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.10067v1-abstract-full').style.display = 'inline'; document.getElementById('2403.10067v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.10067v1-abstract-full" style="display: none;"> Hyperspectral image (HSI) denoising is critical for the effective analysis and interpretation of hyperspectral data. However, simultaneously modeling global and local features is rarely explored to enhance HSI denoising. In this letter, we propose a hybrid convolution and attention network (HCANet), which leverages both the strengths of convolution neural networks (CNNs) and Transformers. To enhance the modeling of both global and local features, we have devised a convolution and attention fusion module aimed at capturing long-range dependencies and neighborhood spectral correlations. Furthermore, to improve multi-scale information aggregation, we design a multi-scale feed-forward network to enhance denoising performance by extracting features at different scales. Experimental results on mainstream HSI datasets demonstrate the rationality and effectiveness of the proposed HCANet. The proposed model is effective in removing various types of complex noise. Our codes are available at \url{https://github.com/summitgao/HCANet}. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.10067v1-abstract-full').style.display = 'none'; document.getElementById('2403.10067v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">IEEE GRSL 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.08947">arXiv:2403.08947</a> <span> [<a href="https://arxiv.org/pdf/2403.08947">pdf</a>, <a href="https://arxiv.org/format/2403.08947">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Robust COVID-19 Detection in CT Images with CLIP </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Lin%2C+L">Li Lin</a>, <a href="/search/eess?searchtype=author&query=Krubha%2C+Y+S">Yamini Sri Krubha</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+Z">Zhenhuan Yang</a>, <a href="/search/eess?searchtype=author&query=Ren%2C+C">Cheng Ren</a>, <a href="/search/eess?searchtype=author&query=Le%2C+T+D">Thuc Duy Le</a>, <a href="/search/eess?searchtype=author&query=Amerini%2C+I">Irene Amerini</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+X">Xin Wang</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+S">Shu Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.08947v3-abstract-short" style="display: inline;"> In the realm of medical imaging, particularly for COVID-19 detection, deep learning models face substantial challenges such as the necessity for extensive computational resources, the paucity of well-annotated datasets, and a significant amount of unlabeled data. In this work, we introduce the first lightweight detector designed to overcome these obstacles, leveraging a frozen CLIP image encoder a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.08947v3-abstract-full').style.display = 'inline'; document.getElementById('2403.08947v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.08947v3-abstract-full" style="display: none;"> In the realm of medical imaging, particularly for COVID-19 detection, deep learning models face substantial challenges such as the necessity for extensive computational resources, the paucity of well-annotated datasets, and a significant amount of unlabeled data. In this work, we introduce the first lightweight detector designed to overcome these obstacles, leveraging a frozen CLIP image encoder and a trainable multilayer perception (MLP). 
Enhanced with Conditional Value at Risk (CVaR) for robustness and a loss landscape flattening strategy for improved generalization, our model is tailored for high efficacy in COVID-19 detection. Furthermore, we integrate a teacher-student framework to capitalize on the vast amounts of unlabeled data, enabling our model to achieve superior performance despite the inherent data limitations. Experimental results on the COV19-CT-DB dataset demonstrate the effectiveness of our approach, surpassing baseline by up to 10.6% in `macro' F1 score in supervised learning. The code is available at https://github.com/Purdue-M2/COVID-19_Detection_M2_PURDUE. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.08947v3-abstract-full').style.display = 'none'; document.getElementById('2403.08947v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.17797">arXiv:2402.17797</a> <span> [<a href="https://arxiv.org/pdf/2402.17797">pdf</a>, <a href="https://arxiv.org/format/2402.17797">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Neural Radiance Fields in Medical Imaging: Challenges and Next Steps </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wang%2C+X">Xin Wang</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+S">Shu Hu</a>, <a href="/search/eess?searchtype=author&query=Fan%2C+H">Heng Fan</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+H">Hongtu Zhu</a>, <a href="/search/eess?searchtype=author&query=Li%2C+X">Xin Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.17797v3-abstract-short" style="display: inline;"> Neural Radiance Fields (NeRF), as a pioneering technique in computer vision, offer great potential to revolutionize medical imaging by synthesizing three-dimensional representations from the projected two-dimensional image data. However, they face unique challenges when applied to medical applications. This paper presents a comprehensive examination of applications of NeRFs in medical imaging, hig… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.17797v3-abstract-full').style.display = 'inline'; document.getElementById('2402.17797v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.17797v3-abstract-full" style="display: none;"> Neural Radiance Fields (NeRF), as a pioneering technique in computer vision, offer great potential to revolutionize medical imaging by synthesizing three-dimensional representations from the projected two-dimensional image data. 
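CVaR over per-sample losses amounts to optimizing the mean of the worst (1 - alpha) fraction of the batch; a minimal top-k implementation follows, with alpha a hypothetical setting.

    import torch

    def cvar_loss(per_sample_losses: torch.Tensor, alpha: float = 0.8) -> torch.Tensor:
        """Conditional Value at Risk: mean of the worst (1 - alpha) fraction of
        per-sample losses, focusing training on the hardest examples."""
        n = per_sample_losses.numel()
        k = max(1, int(round((1.0 - alpha) * n)))
        worst, _ = torch.topk(per_sample_losses, k)
        return worst.mean()

    losses = torch.rand(32)              # e.g. per-sample BCE losses
    print(cvar_loss(losses, alpha=0.8))  # mean of the ~6 hardest samples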
arXiv:2402.17797 [pdf, other] eess.IV cs.CV
Neural Radiance Fields in Medical Imaging: Challenges and Next Steps
Authors: Xin Wang, Shu Hu, Heng Fan, Hongtu Zhu, Xin Li
Abstract: Neural Radiance Fields (NeRF), as a pioneering technique in computer vision, offer great potential to revolutionize medical imaging by synthesizing three-dimensional representations from projected two-dimensional image data. However, they face unique challenges when applied to medical applications. This paper presents a comprehensive examination of the applications of NeRFs in medical imaging, highlighting four imminent challenges: fundamental imaging principles, inner-structure requirements, object boundary definition, and color density significance. We discuss current methods applied to different organs and their limitations. We also review several datasets and evaluation metrics and propose promising directions for future research.
Submitted 21 March, 2024; v1 submitted 26 February, 2024; originally announced February 2024.
arXiv:2402.08743 [pdf, other] cs.CV cs.LG eess.IV
ADS: Approximate Densest Subgraph for Novel Image Discovery
Authors: Shanfeng Hu
Abstract: The volume of image repositories continues to grow. Despite the availability of content-based addressing, we still lack a lightweight tool for discovering images of distinct characteristics in a large collection. In this paper, we propose a fast and training-free algorithm for novel image discovery. The key to our algorithm is formulating a collection of images as a perceptual-distance-weighted graph, within which our task is to locate the K-densest subgraph corresponding to a subset of the most unique images. While solving this problem is not only NP-hard but also requires a full computation of the potentially huge distance matrix, we propose to relax it into a K-sparse eigenvector problem that we can efficiently solve using stochastic gradient descent (SGD) without explicitly computing the distance matrix. We compare our algorithm against the state of the art on both synthetic and real datasets, showing that it is considerably faster to run with a smaller memory footprint, while mining novel images more accurately.
Submitted 13 February, 2024; originally announced February 2024.
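The relaxation can be illustrated with a tiny projected-gradient loop that maximizes x^T W x over unit vectors with at most K nonzero entries. W is materialized explicitly here for clarity on a toy scale, which is exactly what the paper avoids doing at real scale.

    import numpy as np

    rng = np.random.default_rng(0)
    feats = rng.normal(size=(100, 16))  # toy image features
    W = np.linalg.norm(feats[:, None] - feats[None, :], axis=-1)  # distance graph

    def k_sparse_top_eigvec(W, K=10, steps=200, lr=0.05):
        """Projected gradient ascent on x^T W x with a hard K-sparsity
        projection and unit-norm constraint; the K surviving indices
        mark the most mutually distant ('novel') items."""
        x = np.abs(rng.normal(size=W.shape[0]))
        for _ in range(steps):
            x = x + lr * (W @ x)  # gradient of 0.5 * x^T W x
            keep = np.argpartition(np.abs(x), -K)[-K:]
            sparse = np.zeros_like(x)
            sparse[keep] = x[keep]                        # K-sparse projection
            x = sparse / (np.linalg.norm(sparse) + 1e-12) # unit norm
        return np.flatnonzero(x)

    print("candidate novel images:", k_sparse_top_eigvec(W, K=10))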
Currently, the prevailing assumption is that unaltered human speech can be considered genuine, while fake speech comes from TTS synthesis. We argue that this binary distinction is oversimplified. For instance, altered playback speeds can be used for malicious purposes, like in the 'Drunken Nancy Pelosi' incident. Similarly, editing of audio clips can be done ethically, e.g., for brevity or summarization in news reporting or podcasts, but editing can also create misleading narratives. In this paper, we propose a conceptual shift away from the binary paradigm of audio being either 'fake' or 'real'. Instead, our focus is on pinpointing 'voice edits', which encompass traditional modifications like filters and cuts, as well as TTS synthesis and voice conversion (VC) systems. We delineate six categories and curate a new challenge dataset rooted in the M-AILABS corpus, for which we present baseline detection systems. Most importantly, we argue that merely categorizing audio as fake or real is a dangerous over-simplification that will fail to move the field of speech technology forward. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.06304v1-abstract-full').style.display = 'none'; document.getElementById('2402.06304v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.04448">arXiv:2402.04448</a> <span> [<a href="https://arxiv.org/pdf/2402.04448">pdf</a>, <a href="https://arxiv.org/format/2402.04448">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Failure Analysis in Next-Generation Critical Cellular Communication Infrastructures </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Bi%2C+S">Siguo Bi</a>, <a href="/search/eess?searchtype=author&query=Yuan%2C+X">Xin Yuan</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+S">Shuyan Hu</a>, <a href="/search/eess?searchtype=author&query=Li%2C+K">Kai Li</a>, <a href="/search/eess?searchtype=author&query=Ni%2C+W">Wei Ni</a>, <a href="/search/eess?searchtype=author&query=Hossain%2C+E">Ekram Hossain</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+X">Xin Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.04448v1-abstract-short" style="display: inline;"> The advent of communication technologies marks a transformative phase in critical infrastructure construction, where the meticulous analysis of failures becomes paramount in achieving the fundamental objectives of continuity, security, and availability.
This survey enriches the discourse on failures, failure analysis, and countermeasures in the context of the next-generation critical communication… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.04448v1-abstract-full').style.display = 'inline'; document.getElementById('2402.04448v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.04448v1-abstract-full" style="display: none;"> The advent of communication technologies marks a transformative phase in critical infrastructure construction, where the meticulous analysis of failures becomes paramount in achieving the fundamental objectives of continuity, security, and availability. This survey enriches the discourse on failures, failure analysis, and countermeasures in the context of the next-generation critical communication infrastructures. Through an exhaustive examination of existing literature, we discern and categorize prominent research orientations, namely resource depletion, security vulnerabilities, and system availability concerns. We also analyze constructive countermeasures tailored to the identified failure scenarios and their prevention. Furthermore, the survey emphasizes the imperative for standardization in addressing failures related to Artificial Intelligence (AI) within the ambit of sixth-generation (6G) networks, accounting for the forward-looking perspective on the envisioned intelligence of the 6G network architecture. By identifying new challenges and delineating future research directions, this survey can help guide stakeholders toward unexplored territories, fostering innovation and resilience in critical communication infrastructure development and failure prevention. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.04448v1-abstract-full').style.display = 'none'; document.getElementById('2402.04448v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.16141">arXiv:2401.16141</a> <span> [<a href="https://arxiv.org/pdf/2401.16141">pdf</a>, <a href="https://arxiv.org/ps/2401.16141">ps</a>, <a href="https://arxiv.org/format/2401.16141">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Reconfigurable AI Modules Aided Channel Estimation and MIMO Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Qin%2C+X">Xiangzhao Qin</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+S">Sha Hu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+J">Jiankun Zhang</a>, <a href="/search/eess?searchtype=author&query=Qian%2C+J">Jing Qian</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+H">Hao Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.16141v1-abstract-short" style="display: inline;"> Deep learning (DL) based channel estimation (CE) and multiple-input multiple-output detection (MIMODet), as two separate research topics, have provided convincing evidence of the effectiveness and robustness of artificial intelligence (AI) for receiver design. However, the problem remains of how to unify CE and MIMODet by optimizing the AI's structure to achieve near-optimal detection p… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.16141v1-abstract-full').style.display = 'inline'; document.getElementById('2401.16141v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.16141v1-abstract-full" style="display: none;"> Deep learning (DL) based channel estimation (CE) and multiple-input multiple-output detection (MIMODet), as two separate research topics, have provided convincing evidence of the effectiveness and robustness of artificial intelligence (AI) for receiver design. However, the problem remains of how to unify CE and MIMODet by optimizing the AI's structure to achieve near-optimal detection performance, such as that of the widely considered QR decomposition with M-algorithm (QRM), which can perform close to the maximum likelihood (ML) detector. In this paper, we propose an AI receiver that connects CE and MIMODet as a unified architecture. As a merit, CE and MIMODet adopt only structural input features and conventional neural networks (NNs) to perform end-to-end (E2E) training offline. Numerical results show that, by adopting a simple super-resolution based convolutional neural network (SRCNN) as the channel estimator and a domain-knowledge-enhanced graph neural network (GNN) as the detector, the proposed QRM-enhanced GNN receiver (QRMNet) achieves comparable block error rate (BLER) performance to near-optimal baseline detectors.
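<!--
A rough PyTorch sketch of the receiver structure this abstract describes: an SRCNN-style network refines a coarse least-squares channel estimate, and its output feeds a detector trained end-to-end. The layer sizes and the plain MLP detector are illustrative stand-ins, not the paper's QRM-enhanced GNN.

import torch
import torch.nn as nn

class SRCNNEstimator(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(                  # real/imag as 2 input channels
            nn.Conv2d(2, 64, 9, padding=4), nn.ReLU(),
            nn.Conv2d(64, 32, 5, padding=2), nn.ReLU(),
            nn.Conv2d(32, 2, 5, padding=2),
        )
    def forward(self, h_ls):                       # (B, 2, subcarriers, symbols)
        return self.net(h_ls)                      # refined channel estimate

class Receiver(nn.Module):
    def __init__(self, grid_size, n_out):          # grid_size = subcarriers * symbols
        super().__init__()
        self.ce = SRCNNEstimator()
        self.det = nn.Sequential(nn.Linear(4 * grid_size, 256), nn.ReLU(),
                                 nn.Linear(256, n_out))
    def forward(self, h_ls, y):                    # y: received grid, same shape as h_ls
        h = self.ce(h_ls)
        z = torch.cat([h.flatten(1), y.flatten(1)], dim=1)
        return self.det(z)                         # soft symbol estimates
-->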
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.16141v1-abstract-full').style.display = 'none'; document.getElementById('2401.16141v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.12826">arXiv:2401.12826</a> <span> [<a href="https://arxiv.org/pdf/2401.12826">pdf</a>, <a href="https://arxiv.org/format/2401.12826">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Digital Twin-Based Network Management for Better QoE in Multicast Short Video Streaming </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Huang%2C+X">Xinyu Huang</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+S">Shisheng Hu</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+H">Haojun Yang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+X">Xinghan Wang</a>, <a href="/search/eess?searchtype=author&query=Pei%2C+Y">Yingying Pei</a>, <a href="/search/eess?searchtype=author&query=Shen%2C+X">Xuemin Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.12826v1-abstract-short" style="display: inline;"> Multicast short video streaming can enhance bandwidth utilization by enabling simultaneous video transmission to multiple users over shared wireless channels. The existing network management schemes mainly rely on the sequential buffering principle and general quality of experience (QoE) model, which may deteriorate QoE when users' swipe behaviors exhibit distinct spatiotemporal variation. In this… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.12826v1-abstract-full').style.display = 'inline'; document.getElementById('2401.12826v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.12826v1-abstract-full" style="display: none;"> Multicast short video streaming can enhance bandwidth utilization by enabling simultaneous video transmission to multiple users over shared wireless channels. The existing network management schemes mainly rely on the sequential buffering principle and general quality of experience (QoE) model, which may deteriorate QoE when users' swipe behaviors exhibit distinct spatiotemporal variation. In this paper, we propose a digital twin (DT)-based network management scheme to enhance QoE. Firstly, user status emulated by the DT is utilized to estimate the transmission capabilities and watching probability distributions of sub-multicast groups (SMGs) for an adaptive segment buffering. The SMGs' buffers are aligned to the unique virtual buffers managed by the DT for a fine-grained buffer update. 
Then, a multicast QoE model consisting of rebuffering time, video quality, and quality variation is developed by considering the mutual influence of segment buffering among SMGs. Finally, a joint optimization problem of segment version selection and slot division is formulated to maximize QoE. To efficiently solve the problem, a data-model-driven algorithm is proposed by integrating a convex optimization method and a deep reinforcement learning algorithm. Simulation results based on a real-world dataset demonstrate that the proposed DT-based network management scheme outperforms benchmark schemes in terms of QoE improvement. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.12826v1-abstract-full').style.display = 'none'; document.getElementById('2401.12826v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 12 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.08913">arXiv:2401.08913</a> <span> [<a href="https://arxiv.org/pdf/2401.08913">pdf</a>, <a href="https://arxiv.org/format/2401.08913">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Efficient Image Super-Resolution via Symmetric Visual Attention Network </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wu%2C+C">Chengxu Wu</a>, <a href="/search/eess?searchtype=author&query=Fan%2C+Q">Qinrui Fan</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+S">Shu Hu</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+X">Xi Wu</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+X">Xin Wang</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+J">Jing Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.08913v1-abstract-short" style="display: inline;"> An important development direction for Single-Image Super-Resolution (SISR) algorithms is to improve their efficiency. Recent efficient Super-Resolution (SR) research has focused on reducing model complexity and improving efficiency through improved deep small-kernel convolutions, which lead to a small receptive field.
The large receptive field obtained by large kernel convolution ca… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.08913v1-abstract-full').style.display = 'inline'; document.getElementById('2401.08913v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.08913v1-abstract-full" style="display: none;"> An important development direction for Single-Image Super-Resolution (SISR) algorithms is to improve their efficiency. Recent efficient Super-Resolution (SR) research has focused on reducing model complexity and improving efficiency through improved deep small-kernel convolutions, which lead to a small receptive field. The large receptive field obtained by large kernel convolution can significantly improve image quality, but the computational cost is too high. To improve the reconstruction detail of efficient super-resolution, we propose a Symmetric Visual Attention Network (SVAN) that applies large receptive fields. The SVAN decomposes a large kernel convolution into three different combinations of convolution operations and combines them with an attention mechanism to form the Symmetric Large Kernel Attention Block (SLKAB), the basic component of the SVAN: the receptive-field sizes of the combined convolutions give the block a symmetric bottleneck structure that extracts deep features effectively. Our network obtains a large receptive field while minimizing the number of parameters and improving the perceptual ability of the model. The experimental results show that the proposed SVAN can obtain high-quality super-resolution reconstruction results using only about 30% of the parameters of existing SOTA methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.08913v1-abstract-full').style.display = 'none'; document.getElementById('2401.08913v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024.
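<!--
A hedged PyTorch sketch of the decomposition named in this abstract: a large-kernel convolution approximated by three cheaper ones (depthwise, depthwise-dilated, pointwise) whose output gates the input as an attention map. The exact kernel sizes and block layout of SVAN's SLKAB may differ.

import torch.nn as nn

class LargeKernelAttention(nn.Module):
    def __init__(self, ch):
        super().__init__()
        self.dw = nn.Conv2d(ch, ch, 5, padding=2, groups=ch)                 # local context
        self.dw_d = nn.Conv2d(ch, ch, 7, padding=9, dilation=3, groups=ch)   # long-range context
        self.pw = nn.Conv2d(ch, ch, 1)                                       # channel mixing
    def forward(self, x):
        attn = self.pw(self.dw_d(self.dw(x)))      # ~23x23 effective receptive field
        return x * attn                            # attention-gated features
-->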
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages,4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.01544">arXiv:2401.01544</a> <span> [<a href="https://arxiv.org/pdf/2401.01544">pdf</a>, <a href="https://arxiv.org/format/2401.01544">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Collaborative Perception for Connected and Autonomous Driving: Challenges, Possible Solutions and Opportunities </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Hu%2C+S">Senkang Hu</a>, <a href="/search/eess?searchtype=author&query=Fang%2C+Z">Zhengru Fang</a>, <a href="/search/eess?searchtype=author&query=Deng%2C+Y">Yiqin Deng</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+X">Xianhao Chen</a>, <a href="/search/eess?searchtype=author&query=Fang%2C+Y">Yuguang Fang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.01544v1-abstract-short" style="display: inline;"> Autonomous driving has attracted significant attention from both academia and industries, which is expected to offer a safer and more efficient driving system. However, current autonomous driving systems are mostly based on a single vehicle, which has significant limitations which still poses threats to driving safety. Collaborative perception with connected and autonomous vehicles (CAVs) shows a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.01544v1-abstract-full').style.display = 'inline'; document.getElementById('2401.01544v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.01544v1-abstract-full" style="display: none;"> Autonomous driving has attracted significant attention from both academia and industries, which is expected to offer a safer and more efficient driving system. However, current autonomous driving systems are mostly based on a single vehicle, which has significant limitations which still poses threats to driving safety. Collaborative perception with connected and autonomous vehicles (CAVs) shows a promising solution to overcoming these limitations. In this article, we first identify the challenges of collaborative perception, such as data sharing asynchrony, data volume, and pose errors. Then, we discuss the possible solutions to address these challenges with various technologies, where the research opportunities are also elaborated. Furthermore, we propose a scheme to deal with communication efficiency and latency problems, which is a channel-aware collaborative perception framework to dynamically adjust the communication graph and minimize latency, thereby improving perception performance while increasing communication efficiency. Finally, we conduct experiments to demonstrate the effectiveness of our proposed scheme. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.01544v1-abstract-full').style.display = 'none'; document.getElementById('2401.01544v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.00662">arXiv:2401.00662</a> <span> [<a href="https://arxiv.org/pdf/2401.00662">pdf</a>, <a href="https://arxiv.org/format/2401.00662">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Pre-trained ASR System Fine-tuning for Dysarthric Speech Recognition using Adversarial Data Augmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wang%2C+H">Huimeng Wang</a>, <a href="/search/eess?searchtype=author&query=Jin%2C+Z">Zengrui Jin</a>, <a href="/search/eess?searchtype=author&query=Geng%2C+M">Mengzhe Geng</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+S">Shujie Hu</a>, <a href="/search/eess?searchtype=author&query=Li%2C+G">Guinan Li</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+T">Tianzi Wang</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+H">Haoning Xu</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+X">Xunying Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.00662v1-abstract-short" style="display: inline;"> Automatic recognition of dysarthric speech remains a highly challenging task to date. Neuro-motor conditions and co-occurring physical disabilities create difficulty in large-scale data collection for ASR system development. Adapting SSL pre-trained ASR models to limited dysarthric speech via data-intensive parameter fine-tuning leads to poor generalization. To this end, this paper presents an ext… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.00662v1-abstract-full').style.display = 'inline'; document.getElementById('2401.00662v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.00662v1-abstract-full" style="display: none;"> Automatic recognition of dysarthric speech remains a highly challenging task to date. Neuro-motor conditions and co-occurring physical disabilities create difficulty in large-scale data collection for ASR system development. Adapting SSL pre-trained ASR models to limited dysarthric speech via data-intensive parameter fine-tuning leads to poor generalization. To this end, this paper presents an extensive comparative study of various data augmentation approaches to improve the robustness of pre-trained ASR model fine-tuning to dysarthric speech. 
These include: a) conventional speaker-independent perturbation of impaired speech; b) speaker-dependent speed perturbation, or GAN-based adversarial perturbation of normal, control speech based on their time alignment against parallel dysarthric speech; c) novel spectral-basis GAN-based adversarial data augmentation operating on non-parallel data. Experiments conducted on the UASpeech corpus suggest that GAN-based data augmentation consistently outperforms fine-tuned Wav2vec2.0 and HuBERT models using no data augmentation or speed perturbation across different data expansion operating points, with statistically significant word error rate (WER) reductions of up to 2.01% and 0.96% absolute (9.03% and 4.63% relative), respectively, on the UASpeech test set of 16 dysarthric speakers. After cross-system output rescoring, the best system produced the lowest published WER of 16.53% (46.47% on very low intelligibility) on UASpeech. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.00662v1-abstract-full').style.display = 'none'; document.getElementById('2401.00662v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear at IEEE ICASSP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.00246">arXiv:2401.00246</a> <span> [<a href="https://arxiv.org/pdf/2401.00246">pdf</a>, <a href="https://arxiv.org/format/2401.00246">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Boosting Large Language Model for Speech Synthesis: An Empirical Study </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Hao%2C+H">Hongkun Hao</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+L">Long Zhou</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+S">Shujie Liu</a>, <a href="/search/eess?searchtype=author&query=Li%2C+J">Jinyu Li</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+S">Shujie Hu</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+R">Rui Wang</a>, <a href="/search/eess?searchtype=author&query=Wei%2C+F">Furu Wei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.00246v1-abstract-short" style="display: inline;"> Large language models (LLMs) have made significant advancements in natural language processing and are concurrently extending their language ability to other modalities, such as speech and vision.
Nevertheless, most of the previous work focuses on prompting LLMs with perception abilities like auditory comprehension, and the effective approach for augmenting LLMs with speech synthesis capabilities re… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.00246v1-abstract-full').style.display = 'inline'; document.getElementById('2401.00246v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.00246v1-abstract-full" style="display: none;"> Large language models (LLMs) have made significant advancements in natural language processing and are concurrently extending their language ability to other modalities, such as speech and vision. Nevertheless, most of the previous work focuses on prompting LLMs with perception abilities like auditory comprehension, and the effective approach for augmenting LLMs with speech synthesis capabilities remains unclear. In this paper, we conduct a comprehensive empirical exploration of boosting LLMs with the ability to generate speech, by combining the pre-trained LLMs LLaMA/OPT and the text-to-speech synthesis model VALL-E. We compare three integration methods between LLMs and speech synthesis models, including directly fine-tuned LLMs, superposed layers of LLMs and VALL-E, and coupled LLMs and VALL-E using LLMs as a powerful text encoder. Experimental results show that using the LoRA method to fine-tune LLMs directly to boost the speech synthesis capability does not work well, while superposing LLMs and VALL-E can improve the quality of generated speech in both speaker similarity and word error rate (WER). Among these three methods, the coupled method leveraging LLMs as the text encoder achieves the best performance, outperforming the original speech synthesis model with consistently better speaker similarity and a significant (10.9%) WER reduction. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.00246v1-abstract-full').style.display = 'none'; document.getElementById('2401.00246v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024.
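<!--
A schematic sketch of the best-performing integration the abstract reports: a frozen LLM acts as a text encoder whose hidden states condition a separate speech-token decoder. Model names, dimensions, and the HuggingFace-style hidden_states interface are placeholder assumptions, not the paper's exact setup.

import torch
import torch.nn as nn

class LLMConditionedTTS(nn.Module):
    def __init__(self, llm, llm_dim=4096, dec_dim=1024, n_audio_tokens=1024):
        super().__init__()
        self.llm = llm                               # frozen pre-trained LLM
        for p in self.llm.parameters():
            p.requires_grad = False
        self.bridge = nn.Linear(llm_dim, dec_dim)    # map LLM states to decoder width
        layer = nn.TransformerDecoderLayer(dec_dim, nhead=16, batch_first=True)
        self.decoder = nn.TransformerDecoder(layer, num_layers=12)
        self.audio_emb = nn.Embedding(n_audio_tokens, dec_dim)
        self.head = nn.Linear(dec_dim, n_audio_tokens)

    def forward(self, text_ids, audio_tokens):
        with torch.no_grad():                        # LLM used as a text encoder only
            txt = self.llm(text_ids, output_hidden_states=True).hidden_states[-1]
        mem = self.bridge(txt)
        tgt = self.audio_emb(audio_tokens)
        return self.head(self.decoder(tgt, mem))     # next audio-token logits
-->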
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.13154">arXiv:2312.13154</a> <span> [<a href="https://arxiv.org/pdf/2312.13154">pdf</a>, <a href="https://arxiv.org/format/2312.13154">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Joint Range-Velocity-Azimuth Estimation for OFDM-Based Integrated Sensing and Communication </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Hu%2C+Z">Zelin Hu</a>, <a href="/search/eess?searchtype=author&query=Ye%2C+Q">Qibin Ye</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+Y">Yixuan Huang</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+S">Su Hu</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+G">Gang Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.13154v1-abstract-short" style="display: inline;"> Orthogonal frequency division multiplexing (OFDM)-based integrated sensing and communication (ISAC) is promising for future sixth-generation mobile communication systems. Existing works focus on the joint estimation of the targets' range and velocity for OFDM-based ISAC systems. In contrast, this paper studies the three-dimensional joint estimation (3DJE) of range, velocity, and azimuth for OFDM-b… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.13154v1-abstract-full').style.display = 'inline'; document.getElementById('2312.13154v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.13154v1-abstract-full" style="display: none;"> Orthogonal frequency division multiplexing (OFDM)-based integrated sensing and communication (ISAC) is promising for future sixth-generation mobile communication systems. Existing works focus on the joint estimation of the targets' range and velocity for OFDM-based ISAC systems. In contrast, this paper studies the three-dimensional joint estimation (3DJE) of range, velocity, and azimuth for OFDM-based ISAC systems with multiple receive antennas. First, we establish the signal model and derive the Cramér-Rao bounds (CRBs) on the 3DJE. Furthermore, an auto-paired super-resolution 3DJE algorithm is proposed by exploiting the reconstructed observation sub-signal's translational invariance property in the time, frequency, and space domains. Finally, with the 5G New Radio parameter setup, simulation results show that the proposed algorithm achieves better estimation performance and its root mean square error is closer to the root of the CRBs than that of existing methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.13154v1-abstract-full').style.display = 'none'; document.getElementById('2312.13154v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023.
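<!--
A textbook-style sketch of the periodogram baseline that OFDM-based ISAC range/velocity estimation builds on (not the paper's auto-paired super-resolution 3DJE algorithm): divide out the communication symbols, then take an IFFT across subcarriers for range and an FFT across OFDM symbols for Doppler.

import numpy as np

def ofdm_range_doppler_map(rx_grid, tx_grid, n_range=None, n_dopp=None):
    """rx_grid, tx_grid: (subcarriers, ofdm_symbols) complex arrays."""
    f = rx_grid / tx_grid                      # strip the transmitted symbols
    r = np.fft.ifft(f, n=n_range, axis=0)      # subcarrier axis -> range bins
    d = np.fft.fft(r, n=n_dopp, axis=1)        # symbol axis -> Doppler bins
    return np.abs(d) ** 2                      # peaks give (range, velocity) estimates
-->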
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This manuscript has been submitted to the IEEE journal in 09-Aug-2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.08641">arXiv:2312.08641</a> <span> [<a href="https://arxiv.org/pdf/2312.08641">pdf</a>, <a href="https://arxiv.org/format/2312.08641">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Towards Automatic Data Augmentation for Disordered Speech Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Jin%2C+Z">Zengrui Jin</a>, <a href="/search/eess?searchtype=author&query=Xie%2C+X">Xurong Xie</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+T">Tianzi Wang</a>, <a href="/search/eess?searchtype=author&query=Geng%2C+M">Mengzhe Geng</a>, <a href="/search/eess?searchtype=author&query=Deng%2C+J">Jiajun Deng</a>, <a href="/search/eess?searchtype=author&query=Li%2C+G">Guinan Li</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+S">Shujie Hu</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+X">Xunying Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.08641v1-abstract-short" style="display: inline;"> Automatic recognition of disordered speech remains a highly challenging task to date due to data scarcity. This paper presents a reinforcement learning (RL) based on-the-fly data augmentation approach for training state-of-the-art PyChain TDNN and end-to-end Conformer ASR systems on such data. The handcrafted temporal and spectral mask operations in the standard SpecAugment method that are task an… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.08641v1-abstract-full').style.display = 'inline'; document.getElementById('2312.08641v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.08641v1-abstract-full" style="display: none;"> Automatic recognition of disordered speech remains a highly challenging task to date due to data scarcity. This paper presents a reinforcement learning (RL) based on-the-fly data augmentation approach for training state-of-the-art PyChain TDNN and end-to-end Conformer ASR systems on such data. The handcrafted temporal and spectral mask operations in the standard SpecAugment method that are task and system dependent, together with additionally introduced minimum and maximum cut-offs of these time-frequency masks, are now automatically learned using an RNN-based policy controller and tightly integrated with ASR system training. Experiments on the UASpeech corpus suggest the proposed RL-based data augmentation approach consistently produced performance superior or comparable that obtained using expert or handcrafted SpecAugment policies. Our RL auto-augmented PyChain TDNN system produced an overall WER of 28.79% on the UASpeech test set of 16 dysarthric speakers. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.08641v1-abstract-full').style.display = 'none'; document.getElementById('2312.08641v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear at IEEE ICASSP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.03787">arXiv:2312.03787</a> <span> [<a href="https://arxiv.org/pdf/2312.03787">pdf</a>, <a href="https://arxiv.org/format/2312.03787">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Detection and Mitigation of Position Spoofing Attacks on Cooperative UAV Swarm Formations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Bi%2C+S">Siguo Bi</a>, <a href="/search/eess?searchtype=author&query=Li%2C+K">Kai Li</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+S">Shuyan Hu</a>, <a href="/search/eess?searchtype=author&query=Ni%2C+W">Wei Ni</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+C">Cong Wang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+X">Xin Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.03787v1-abstract-short" style="display: inline;"> Detecting spoofing attacks on the positions of unmanned aerial vehicles (UAVs) within a swarm is challenging. Traditional methods relying solely on individually reported positions and pairwise distance measurements are ineffective in identifying the misbehavior of malicious UAVs. This paper presents a novel systematic structure designed to detect and mitigate spoofing attacks in UAV swarms. We for… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.03787v1-abstract-full').style.display = 'inline'; document.getElementById('2312.03787v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.03787v1-abstract-full" style="display: none;"> Detecting spoofing attacks on the positions of unmanned aerial vehicles (UAVs) within a swarm is challenging. Traditional methods relying solely on individually reported positions and pairwise distance measurements are ineffective in identifying the misbehavior of malicious UAVs. This paper presents a novel systematic structure designed to detect and mitigate spoofing attacks in UAV swarms. We formulate the problem of detecting malicious UAVs as a localization feasibility problem, leveraging the reported positions and distance measurements. To address this problem, we develop a semidefinite relaxation (SDR) approach, which reformulates the non-convex localization problem into a convex and tractable semidefinite program (SDP). Additionally, we propose two innovative algorithms that leverage the proximity of neighboring UAVs to identify malicious UAVs effectively. 
Simulations demonstrate the superior performance of our proposed approaches compared to existing benchmarks. Our methods exhibit robustness across various swarm networks, showcasing their effectiveness in detecting and mitigating spoofing attacks. Specifically, the detection success rate is improved by up to 65%, 55%, and 51% against distributed, collusion, and mixed attacks, respectively, compared to the benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.03787v1-abstract-full').style.display = 'none'; document.getElementById('2312.03787v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by IEEE TIFS in Dec. 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.15141">arXiv:2311.15141</a> <span> [<a href="https://arxiv.org/pdf/2311.15141">pdf</a>, <a href="https://arxiv.org/format/2311.15141">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> OFDMA-F$^2$L: Federated Learning With Flexible Aggregation Over an OFDMA Air Interface </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Hu%2C+S">Shuyan Hu</a>, <a href="/search/eess?searchtype=author&query=Yuan%2C+X">Xin Yuan</a>, <a href="/search/eess?searchtype=author&query=Ni%2C+W">Wei Ni</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+X">Xin Wang</a>, <a href="/search/eess?searchtype=author&query=Hossain%2C+E">Ekram Hossain</a>, <a href="/search/eess?searchtype=author&query=Poor%2C+H+V">H. Vincent Poor</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.15141v1-abstract-short" style="display: inline;"> Federated learning (FL) can suffer from a communication bottleneck when deployed in mobile networks, limiting participating clients and deterring FL convergence. The impact of practical air interfaces with discrete modulations on FL has not previously been studied in depth. This paper proposes a new paradigm of flexible aggregation-based FL (F$^2$L) over an orthogonal frequency division multiple-acce… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.15141v1-abstract-full').style.display = 'inline'; document.getElementById('2311.15141v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.15141v1-abstract-full" style="display: none;"> Federated learning (FL) can suffer from a communication bottleneck when deployed in mobile networks, limiting participating clients and deterring FL convergence. The impact of practical air interfaces with discrete modulations on FL has not previously been studied in depth.
This paper proposes a new paradigm of flexible aggregation-based FL (F$^2$L) over an orthogonal frequency division multiple-access (OFDMA) air interface, termed "OFDMA-F$^2$L", allowing selected clients to train local models for various numbers of iterations before uploading the models in each aggregation round. We optimize the selections of clients, subchannels, and modulations, adapting to channel conditions and computing power. Specifically, we derive an upper bound on the optimality gap of OFDMA-F$^2$L capturing the impact of the selections, and show that the upper bound is minimized by maximizing the weighted sum rate of the clients per aggregation round. A Lagrange-dual based method is developed to solve this challenging mixed integer program of weighted sum rate maximization, revealing that a "winner-takes-all" policy provides the almost surely optimal client, subchannel, and modulation selections. Experiments on multilayer perceptrons and convolutional neural networks show that OFDMA-F$^2$L with optimal selections can significantly improve the training convergence and accuracy, e.g., by about 18% and 5%, compared to potential alternatives. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.15141v1-abstract-full').style.display = 'none'; document.getElementById('2311.15141v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by IEEE TWC in Nov.
2023</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Hu%2C+S&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Hu%2C+S&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Hu%2C+S&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Hu%2C+S&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 
34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>