Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 328 results for author: <span class="mathjax">Wu, X</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/eess" aria-role="search"> Searching in archive <strong>eess</strong>. <a href="/search/?searchtype=author&query=Wu%2C+X">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Wu, X"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Wu%2C+X&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Wu, X"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
1. arXiv:2411.18853 (https://arxiv.org/abs/2411.18853) [pdf]
   Subjects: eess.SY (Systems and Control)
   Title: Self-Adaptive Active Damping Method for Stability Enhancement of Systems With Black-Box Inverters Considering Operating Points
   Authors: Yang Li, Xiangyang Wu, Zhikang Shuai, Junbin Fang, Lili He, Yi Lei, Z. John Shen
   Abstract: Due to the black-box nature of inverters and the wide variation range of operating points, it is challenging to on-line predict and adaptively enhance the stability of inverter-based systems. To solve this problem, this paper provides a feasible self-adaptive active damping method to eliminate potential small-signal instability of systems with black-box inverters under multiple operating points. First, the framework that includes grid impedance estimation, inverters' admittance identification, and self-adaptive strategy is presented. Second, a widely applicable and engineering-friendly method for inductive-resistive grid impedance estimation is studied, in which a frequency-integral-based dq-axis aligning method is presented to avoid the inaccuracy resulting from the disturbance theta. Then, to give the system a sufficient stability margin under different operating points, a self-adaptive active damper (SAD) as well as its control strategy with lag compensator modification is proposed, in which the SAD's damping compensation mechanism for the system's stability enhancement is investigated and revealed. Finally, the mapping between the system's parameter variations and the SAD's parameters is established based on the artificial neural network (ANN) technique, serving as a computationally light model surrogate that is favorable for on-line parameter tuning of the SAD to compensate the system's damping according to operating points. The effectiveness of the proposed method is verified by simulations in PSCAD/EMTDC and experiments on the RT-Lab platform.
   Submitted 27 November, 2024; originally announced November 2024.

2. arXiv:2411.16119 (https://arxiv.org/abs/2411.16119) [pdf, other]
   Subjects: eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition)
   Title: Learning Optimal Lattice Vector Quantizers for End-to-end Neural Image Compression
   Authors: Xi Zhang, Xiaolin Wu
   Abstract: It is customary to deploy uniform scalar quantization in the end-to-end optimized neural image compression methods, instead of more powerful vector quantization, due to the high complexity of the latter. Lattice vector quantization (LVQ), on the other hand, presents a compelling alternative, which can exploit inter-feature dependencies more effectively while keeping computational efficiency almost the same as scalar quantization. However, traditional LVQ structures are designed and optimized for uniform source distributions, hence nonadaptive and suboptimal for the real source distributions of the latent code space in neural image compression tasks. In this paper, we propose a novel learning method to overcome this weakness by designing rate-distortion optimal lattice vector quantization (OLVQ) codebooks with respect to the sample statistics of the latent features to be compressed. By being able to better fit the LVQ structures to any given latent sample distribution, the proposed OLVQ method improves the rate-distortion performance of existing quantization schemes in neural image compression significantly, while retaining the amenability of uniform scalar quantization.
   Submitted 25 November, 2024; originally announced November 2024.
   Comments: Accepted by NeurIPS 2024

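For readers unfamiliar with lattice vector quantization, the snippet below is a generic illustration (not the paper's OLVQ method): a lattice is defined by a generator matrix B, and a latent vector is quantized by rounding its coordinates in the lattice basis (Babai rounding, exact only for orthogonal generators). With B equal to the identity this reduces to uniform scalar quantization; a learned scheme would instead fit B to the latent statistics. All matrices and values here are illustrative assumptions.

```python
import numpy as np

def lvq_quantize(x, B):
    """Quantize x to a point of the lattice {B @ k : k integer} by rounding the
    lattice coordinates (Babai rounding; exact nearest point when B is orthogonal)."""
    k = np.rint(np.linalg.solve(B, x))     # integer lattice coordinates (the code)
    return B @ k, k.astype(int)

rng = np.random.default_rng(0)
x = rng.normal(size=4)                     # a toy latent vector

# Identity generator = uniform scalar quantization on the cubic lattice.
x_cubic, _ = lvq_quantize(x, np.eye(4))

# A non-diagonal generator couples dimensions; an OLVQ-style codebook would be
# learned from latent statistics (this matrix is arbitrary, for illustration only).
B = np.array([[1.0, 0.5, 0.0, 0.0],
              [0.0, 1.0, 0.5, 0.0],
              [0.0, 0.0, 1.0, 0.5],
              [0.0, 0.0, 0.0, 1.0]])
x_lvq, code = lvq_quantize(x, B)

print("MSE, cubic lattice :", np.mean((x - x_cubic) ** 2))
print("MSE, skewed lattice:", np.mean((x - x_lvq) ** 2))
```
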
3. arXiv:2411.11116 (https://arxiv.org/abs/2411.11116) [pdf, ps, other]
   Subjects: eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition)
   Title: DBF-Net: A Dual-Branch Network with Feature Fusion for Ultrasound Image Segmentation
   Authors: Guoping Xu, Ximing Wu, Wentao Liao, Xinglong Wu, Qing Huang, Chang Li
   Abstract: Accurately segmenting lesions in ultrasound images is challenging due to the difficulty in distinguishing boundaries between lesions and surrounding tissues. While deep learning has improved segmentation accuracy, there is limited focus on boundary quality and its relationship with body structures. To address this, we introduce UBBS-Net, a dual-branch deep neural network that learns the relationship between body and boundary for improved segmentation. We also propose a feature fusion module to integrate body and boundary information. Evaluated on three public datasets, UBBS-Net outperforms existing methods, achieving Dice Similarity Coefficients of 81.05% for breast cancer, 76.41% for brachial plexus nerves, and 87.75% for infantile hemangioma segmentation. Our results demonstrate the effectiveness of UBBS-Net for ultrasound image segmentation. The code is available at https://github.com/apple1986/DBF-Net.
   Submitted 17 November, 2024; originally announced November 2024.

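The Dice Similarity Coefficient reported above is the standard overlap metric for segmentation masks; a minimal reference computation (not the authors' evaluation code, toy masks only) looks like this:

```python
import numpy as np

def dice_coefficient(pred, target, eps=1e-7):
    """Dice = 2 * |A intersect B| / (|A| + |B|) for binary masks."""
    pred, target = pred.astype(bool), target.astype(bool)
    intersection = np.logical_and(pred, target).sum()
    return (2.0 * intersection + eps) / (pred.sum() + target.sum() + eps)

pred   = np.array([[0, 1, 1, 0],
                   [0, 1, 1, 0],
                   [0, 0, 0, 0]])
target = np.array([[0, 1, 1, 0],
                   [0, 1, 0, 0],
                   [0, 0, 0, 0]])
print(f"Dice: {dice_coefficient(pred, target):.3f}")   # 2*3 / (4+3) = 0.857
```
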
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11116v1-abstract-full').style.display = 'none'; document.getElementById('2411.11116v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09760">arXiv:2411.09760</a> <span> [<a href="https://arxiv.org/pdf/2411.09760">pdf</a>, <a href="https://arxiv.org/format/2411.09760">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> SpecPCM: A Low-power PCM-based In-Memory Computing Accelerator for Full-stack Mass Spectrometry Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Fan%2C+K">Keming Fan</a>, <a href="/search/eess?searchtype=author&query=Moradifirouzabadi%2C+A">Ashkan Moradifirouzabadi</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+X">Xiangjin Wu</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Z">Zheyu Li</a>, <a href="/search/eess?searchtype=author&query=Ponzina%2C+F">Flavio Ponzina</a>, <a href="/search/eess?searchtype=author&query=Persson%2C+A">Anton Persson</a>, <a href="/search/eess?searchtype=author&query=Pop%2C+E">Eric Pop</a>, <a href="/search/eess?searchtype=author&query=Rosing%2C+T">Tajana Rosing</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+M">Mingu Kang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09760v1-abstract-short" style="display: inline;"> Mass spectrometry (MS) is essential for proteomics and metabolomics but faces impending challenges in efficiently processing the vast volumes of data. This paper introduces SpecPCM, an in-memory computing (IMC) accelerator designed to achieve substantial improvements in energy and delay efficiency for both MS spectral clustering and database (DB) search. SpecPCM employs analog processing with low-… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09760v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09760v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09760v1-abstract-full" style="display: none;"> Mass spectrometry (MS) is essential for proteomics and metabolomics but faces impending challenges in efficiently processing the vast volumes of data. This paper introduces SpecPCM, an in-memory computing (IMC) accelerator designed to achieve substantial improvements in energy and delay efficiency for both MS spectral clustering and database (DB) search. 
SpecPCM employs analog processing with low-voltage swing and utilizes recently introduced phase change memory (PCM) devices based on superlattice materials, optimized for low-voltage and low-power programming. Our approach integrates contributions across multiple levels: application, algorithm, circuit, device, and instruction sets. We leverage a robust hyperdimensional computing (HD) algorithm with a novel dimension-packing method and develop specialized hardware for the end-to-end MS pipeline to overcome the non-ideal behavior of PCM devices. We further optimize multi-level PCM devices for different tasks by using different materials. We also perform a comprehensive design exploration to improve energy and delay efficiency while maintaining accuracy, exploring various combinations of hardware and software parameters controlled by the instruction set architecture (ISA). SpecPCM, with up to three bits per cell, achieves speedups of up to 82x and 143x for MS clustering and DB search tasks, respectively, along with a four-orders-of-magnitude improvement in energy efficiency compared with state-of-the-art CPU/GPU tools. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09760v1-abstract-full').style.display = 'none'; document.getElementById('2411.09760v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01826">arXiv:2411.01826</a> <span> [<a href="https://arxiv.org/pdf/2411.01826">pdf</a>, <a href="https://arxiv.org/format/2411.01826">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> An online optimization algorithm for tracking a linearly varying optimal point with zero steady-state error </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wu%2C+A+X">Alex Xinting Wu</a>, <a href="/search/eess?searchtype=author&query=Petersen%2C+I+R">Ian R. Petersen</a>, <a href="/search/eess?searchtype=author&query=Ugrinovskii%2C+V">Valery Ugrinovskii</a>, <a href="/search/eess?searchtype=author&query=Shames%2C+I">Iman Shames</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01826v1-abstract-short" style="display: inline;"> In this paper, we develop an online optimization algorithm for solving a class of nonconvex optimization problems with a linearly varying optimal point. The global convergence of the algorithm is guaranteed using the circle criterion for the class of functions whose gradient is bounded within a sector. 
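The hyperdimensional computing (HD) algorithm mentioned in the SpecPCM abstract represents items as high-dimensional vectors and compares spectra by similarity. The sketch below is a generic bipolar-HD illustration only; the paper's dimension-packing method, PCM mapping, and MS-specific encoding are not represented, and the symbols and dimensionality are assumptions.

```python
import numpy as np

D = 4096                                   # hypervector dimensionality
rng = np.random.default_rng(1)

# Item memory: one fixed random bipolar hypervector per discrete symbol
# (e.g. a quantized m/z bin in a mass spectrum).
item_memory = {sym: rng.choice([-1, 1], size=D) for sym in range(128)}

def encode(symbols):
    """Bundle the item hypervectors of a spectrum's peaks (sum, then sign)."""
    bundle = np.sum([item_memory[s] for s in symbols], axis=0)
    return np.sign(bundle)                 # odd number of items -> no zero entries

def cosine(a, b):
    return float(a @ b) / (np.linalg.norm(a) * np.linalg.norm(b))

# Spectra sharing most peaks yield similar hypervectors, which is what
# HD-based clustering and database search exploit.
spec_a = encode([1, 5, 9, 42, 77])
spec_b = encode([1, 5, 9, 42, 60])
spec_c = encode([2, 8, 33, 50, 91])
print("similar  :", round(cosine(spec_a, spec_b), 3))
print("unrelated:", round(cosine(spec_a, spec_c), 3))
```
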
5. arXiv:2411.01826 (https://arxiv.org/abs/2411.01826) [pdf, other]
   Subjects: math.OC (Optimization and Control); eess.SY (Systems and Control)
   Title: An online optimization algorithm for tracking a linearly varying optimal point with zero steady-state error
   Authors: Alex Xinting Wu, Ian R. Petersen, Valery Ugrinovskii, Iman Shames
   Abstract: In this paper, we develop an online optimization algorithm for solving a class of nonconvex optimization problems with a linearly varying optimal point. The global convergence of the algorithm is guaranteed using the circle criterion for the class of functions whose gradient is bounded within a sector. Also, we show that the corresponding Lur'e-type nonlinear system involves a double integrator, which demonstrates its ability to track a linearly varying optimal point with zero steady-state error. The algorithm is applied to solving a time-of-arrival based localization problem with constant velocity and the results show that the algorithm is able to estimate the source location with zero steady-state error.
   Submitted 4 November, 2024; originally announced November 2024.
   Comments: 8 pages, 7 figures, submitted to 2025 American Control Conference
   MSC Class: 93D09

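To see why a double integrator in the loop removes the steady-state error against a linearly varying optimal point, the toy simulation below compares plain gradient descent with a gradient update augmented by an integrator, on a quadratic cost whose minimizer drifts as a ramp. This only illustrates the ramp-tracking idea; it is not the paper's algorithm, gains, or circle-criterion analysis, and all numbers are assumed.

```python
import numpy as np

a, b = 2.0, 0.05                    # drifting optimum x*(k) = a + b*k (a ramp)
alpha, beta = 0.5, 0.05             # gradient gain and integrator gain (assumed values)
T = 400

def grad(x, k):
    """Gradient of f_k(x) = 0.5 * (x - x*(k))**2."""
    return x - (a + b * k)

x_gd = x_pi = z = 0.0
for k in range(T):
    # Plain gradient descent: settles to a constant lag of about b/alpha behind the ramp.
    x_gd -= alpha * grad(x_gd, k)
    # Gradient step plus an integrator state z (double-integral action on the error):
    g = grad(x_pi, k)
    x_pi -= alpha * g + beta * z
    z += g

x_star = a + b * T
print("tracking error, gradient only  :", round(abs(x_gd - x_star), 4))   # ~ b/alpha = 0.1
print("tracking error, with integrator:", round(abs(x_pi - x_star), 4))   # ~ 0
```
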
id="2410.23084v1-abstract-short" style="display: inline;"> Prostate cancer diagnosis through MR imaging have currently relied on radiologists' interpretation, whilst modern AI-based methods have been developed to detect clinically significant cancers independent of radiologists. In this study, we propose to develop deep learning models that improve the overall cancer diagnostic accuracy, by classifying radiologist-identified patients or lesions (i.e. radi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23084v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23084v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23084v1-abstract-full" style="display: none;"> Prostate cancer diagnosis through MR imaging have currently relied on radiologists' interpretation, whilst modern AI-based methods have been developed to detect clinically significant cancers independent of radiologists. In this study, we propose to develop deep learning models that improve the overall cancer diagnostic accuracy, by classifying radiologist-identified patients or lesions (i.e. radiologist-positives), as opposed to the existing models that are trained to discriminate over all patients. We develop a single voxel-level classification model, with a simple percentage threshold to determine positive cases, at levels of lesions, Barzell-zones and patients. Based on the presented experiments from two clinical data sets, consisting of histopathology-labelled MR images from more than 800 and 500 patients in the respective UCLA and UCL PROMIS studies, we show that the proposed strategy can improve the diagnostic accuracy, by augmenting the radiologist reading of the MR imaging. Among varying definition of clinical significance, the proposed strategy, for example, achieved a specificity of 44.1% (with AI assistance) from 36.3% (by radiologists alone), at a controlled sensitivity of 80.0% on the publicly available UCLA data set. This provides measurable clinical values in a range of applications such as reducing unnecessary biopsies, lowering cost in cancer screening and quantifying risk in therapies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23084v1-abstract-full').style.display = 'none'; document.getElementById('2410.23084v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
7. arXiv:2410.22746 (https://arxiv.org/abs/2410.22746) [pdf, other]
   Subjects: eess.SP (Signal Processing); cs.IT (Information Theory)
   Title: Unauthorized UAV Countermeasure for Low-Altitude Economy: Joint Communications and Jamming based on MIMO Cellular Systems
   Authors: Zhuoran Li, Zhen Gao, Kuiyu Wang, Yikun Mei, Chunli Zhu, Lei Chen, Xiaomei Wu, Dusit Niyato
   Abstract: To ensure the thriving development of the low-altitude economy, countering unauthorized unmanned aerial vehicles (UAVs) is an essential task. The existing widely deployed base stations hold great potential for joint communication and jamming. In light of this, this paper investigates the joint design of beamforming to simultaneously support communication with legitimate users and countermeasures against unauthorized UAVs, based on dual-functional multiple-input multiple-output (MIMO) cellular systems. We first formulate a joint communication and jamming (JCJ) problem, relaxing it through semi-definite relaxation (SDR) to obtain a tractable semi-definite programming (SDP) problem, with SDR providing an essential step toward simplifying the complex JCJ design. Although the solution to the relaxed SDP problem cannot directly solve the original problem, it offers valuable insights for further refinement. Therefore, we design a novel constraint specifically tailored to the structure of the SDP problem, ensuring that the solution adheres to the rank-1 constraint of the original problem. Finally, we validate the effectiveness of the proposed JCJ scheme through extensive simulations. Simulation codes are provided to reproduce the results in this paper: https://github.com/LiZhuoRan0. The results confirm that the proposed JCJ scheme can operate effectively when the total number of legitimate users and unauthorized UAVs exceeds the number of antennas.
   Submitted 30 October, 2024; v1 submitted 30 October, 2024; originally announced October 2024.
   Comments: The paper has been accepted by IEEE IoTJ, and the code is available for result reproduction

8. arXiv:2410.17118 (https://arxiv.org/abs/2410.17118) [pdf, ps, other]
   Subjects: cs.LG (Machine Learning); eess.SY (Systems and Control)
   Title: Learning Load Balancing with GNN in MPTCP-Enabled Heterogeneous Networks
   Authors: Han Ji, Xiping Wu, Zhihong Zeng, Chen Chen
   Abstract: Hybrid light fidelity (LiFi) and wireless fidelity (WiFi) networks are a promising paradigm of heterogeneous network (HetNet), attributed to the complementary physical properties of optical spectra and radio frequency. However, the current development of such HetNets is mostly bottlenecked by the existing transmission control protocol (TCP), which restricts the user equipment (UE) to connecting to one access point (AP) at a time. While the ongoing investigation on multipath TCP (MPTCP) can bring significant benefits, it complicates the network topology of HetNets, making the existing load balancing (LB) learning models less effective. Driven by this, we propose a graph neural network (GNN)-based model to tackle the LB problem for MPTCP-enabled HetNets, which results in a partial mesh topology. Such a topology can be modeled as a graph, with the channel state information and data rate requirement embedded as node features, while the LB solutions are deemed as edge labels. Compared to the conventional deep neural network (DNN), the proposed GNN-based model exhibits two key strengths: i) it can better interpret a complex network topology; and ii) it can handle various numbers of APs and UEs with a single trained model. Simulation results show that against the traditional optimisation method, the proposed learning model can achieve near-optimal throughput within a gap of 11.5%, while reducing the inference time by 4 orders of magnitude. In contrast to the DNN model, the new method can improve the network throughput by up to 21.7%, at a similar inference time level.
   Submitted 22 October, 2024; originally announced October 2024.

9. arXiv:2409.16322 (https://arxiv.org/abs/2409.16322) [pdf, other]
   Subjects: eess.AS (Audio and Speech Processing); cs.AI (Artificial Intelligence); cs.CL (Computation and Language); cs.LG (Machine Learning); cs.SD (Sound); q-bio.NC (Neurons and Cognition)
   Title: Towards Within-Class Variation in Alzheimer's Disease Detection from Spontaneous Speech
   Authors: Jiawen Kang, Dongrui Han, Lingwei Meng, Jingyan Zhou, Jinchao Li, Xixin Wu, Helen Meng
   Abstract: Alzheimer's Disease (AD) detection has emerged as a promising research area that employs machine learning classification models to distinguish between individuals with AD and those without. Unlike conventional classification tasks, we identify within-class variation as a critical challenge in AD detection: individuals with AD exhibit a spectrum of cognitive impairments. Given that many AD detection tasks lack fine-grained labels, simplistic binary classification may overlook two crucial aspects: within-class differences and instance-level imbalance. The former compels the model to map AD samples with varying degrees of impairment to a single diagnostic label, disregarding certain changes in cognitive function, while the latter biases the model towards overrepresented severity levels. This work presents early efforts to address these challenges. We propose two novel methods: Soft Target Distillation (SoTD) and Instance-level Re-balancing (InRe), targeting the two problems respectively. Experiments on the ADReSS and ADReSSo datasets demonstrate that the proposed methods significantly improve detection accuracy. Further analysis reveals that SoTD effectively harnesses the strengths of multiple component models, while InRe substantially alleviates model over-fitting. These findings provide insights for developing more robust and reliable AD detection models.
   Submitted 21 September, 2024; originally announced September 2024.

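As a generic illustration of the instance-level imbalance problem described above (this is not the authors' InRe method), a common remedy is to weight each training sample inversely to the frequency of its severity bucket so that over-represented severity levels do not dominate the loss. The bucket labels below are hypothetical.

```python
from collections import Counter

def inverse_frequency_weights(bucket_labels):
    """Per-sample weights proportional to 1 / bucket frequency, normalized to mean 1."""
    counts = Counter(bucket_labels)
    raw = [1.0 / counts[label] for label in bucket_labels]
    mean = sum(raw) / len(raw)
    return [w / mean for w in raw]

# Hypothetical severity buckets for the AD-positive class (e.g. binned from MMSE scores).
labels = ["mild"] * 6 + ["moderate"] * 3 + ["severe"] * 1
weights = inverse_frequency_weights(labels)
print(dict(zip(labels, (round(w, 2) for w in weights))))
# mild ~0.56, moderate ~1.11, severe ~3.33: rarer severities count more per sample.
```
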
10. arXiv:2409.12560 (https://arxiv.org/abs/2409.12560) [pdf, other]
   Subjects: eess.AS (Audio and Speech Processing); cs.SD (Sound)
   Title: AudioComposer: Towards Fine-grained Audio Generation with Natural Language Descriptions
   Authors: Yuanyuan Wang, Hangting Chen, Dongchao Yang, Zhiyong Wu, Helen Meng, Xixin Wu
   Abstract: Current text-to-audio (TTA) models mainly use coarse text descriptions as inputs to generate audio, which hinders models from generating audio with fine-grained control of content and style. Some studies try to improve the granularity by incorporating additional frame-level conditions or control networks. However, this usually leads to complex system design and difficulties due to the requirement for reference frame-level conditions. To address these challenges, we propose AudioComposer, a novel TTA generation framework that relies solely on natural language descriptions (NLDs) to provide both content specification and style control information. To further enhance audio generative modeling, we employ flow-based diffusion transformers with the cross-attention mechanism to incorporate text descriptions effectively into audio generation processes, which can not only simultaneously consider the content and style information in the text inputs, but also accelerate generation compared to other architectures. Furthermore, we propose a novel and comprehensive automatic data simulation pipeline to construct data with fine-grained text descriptions, which significantly alleviates the problem of data scarcity in the area. Experiments demonstrate the effectiveness of our framework using solely NLDs as inputs for content specification and style control. The generation quality and controllability surpass state-of-the-art TTA models, even with a smaller model size.
   Submitted 19 September, 2024; originally announced September 2024.

11. arXiv:2409.12388 (https://arxiv.org/abs/2409.12388) [pdf, other]
   Subjects: eess.AS (Audio and Speech Processing); cs.AI (Artificial Intelligence); cs.SD (Sound)
   Title: Disentangling Speakers in Multi-Talker Speech Recognition with Speaker-Aware CTC
   Authors: Jiawen Kang, Lingwei Meng, Mingyu Cui, Yuejiao Wang, Xixin Wu, Xunying Liu, Helen Meng
   Abstract: Multi-talker speech recognition (MTASR) faces unique challenges in disentangling and transcribing overlapping speech. To address these challenges, this paper investigates the role of Connectionist Temporal Classification (CTC) in speaker disentanglement when incorporated with Serialized Output Training (SOT) for MTASR. Our visualization reveals that CTC guides the encoder to represent different speakers in distinct temporal regions of acoustic embeddings. Leveraging this insight, we propose a novel Speaker-Aware CTC (SACTC) training objective, based on the Bayes risk CTC framework. SACTC is a CTC variant tailored for multi-talker scenarios; it explicitly models speaker disentanglement by constraining the encoder to represent different speakers' tokens at specific time frames. When integrated with SOT, the SOT-SACTC model consistently outperforms standard SOT-CTC across various degrees of speech overlap. Specifically, we observe relative word error rate reductions of 10% overall and 15% on low-overlap speech. This work represents an initial exploration of CTC-based enhancements for MTASR tasks, offering a new perspective on speaker disentanglement in multi-talker speech recognition.
   Submitted 18 September, 2024; originally announced September 2024.

This work proposes CoFi-Speech, a coarse-to-fine CLM-TTS approach, employing multi-scale speech coding and generation to address this issue. We train a multi-scale neural codec, CoFi-Codec, to encode speech into a multi-scale discrete representation, comprising multiple token sequences with different time resolutions. Then, we propose CoFi-LM that can generate this representation in two modes: the single-LM-based chain-of-scale generation and the multiple-LM-based stack-of-scale generation. In experiments, CoFi-Speech significantly outperforms single-scale baseline systems on naturalness and speaker similarity in zero-shot TTS. The analysis of multi-scale coding demonstrates the effectiveness of CoFi-Codec in learning multi-scale discrete speech representations while keeping high-quality speech reconstruction. The coarse-to-fine multi-scale generation, especially for the stack-of-scale approach, is also validated as a crucial approach in pursuing a high-quality neural codec language model for TTS. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.11630v1-abstract-full').style.display = 'none'; document.getElementById('2409.11630v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.08710">arXiv:2409.08710</a> <span> [<a href="https://arxiv.org/pdf/2409.08710">pdf</a>, <a href="https://arxiv.org/format/2409.08710">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Using Ear-EEG to Decode Auditory Attention in Multiple-speaker Environment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhu%2C+H">Haolin Zhu</a>, <a href="/search/eess?searchtype=author&query=Yan%2C+Y">Yujie Yan</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+X">Xiran Xu</a>, <a href="/search/eess?searchtype=author&query=Ge%2C+Z">Zhongshu Ge</a>, <a href="/search/eess?searchtype=author&query=Tian%2C+P">Pei Tian</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+X">Xihong Wu</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+J">Jing Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.08710v1-abstract-short" style="display: inline;"> Auditory Attention Decoding (AAD) can help to determine the identity of the attended speaker during an auditory selective attention task, by analyzing and processing measurements of electroencephalography (EEG) data. Most studies on AAD are based on scalp-EEG signals in two-speaker scenarios, which are far from real application. 
Ear-EEG has recently gained significant attention due to its motion t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08710v1-abstract-full').style.display = 'inline'; document.getElementById('2409.08710v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.08710v1-abstract-full" style="display: none;"> Auditory Attention Decoding (AAD) can help to determine the identity of the attended speaker during an auditory selective attention task, by analyzing and processing measurements of electroencephalography (EEG) data. Most studies on AAD are based on scalp-EEG signals in two-speaker scenarios, which are far from real application. Ear-EEG has recently gained significant attention due to its motion tolerance and invisibility during data acquisition, making it easy to incorporate with other devices for applications. In this work, participants selectively attended to one of four spatially separated speakers' speech in an anechoic room. The EEG data were concurrently collected from a scalp-EEG system and an ear-EEG system (cEEGrids). Temporal response functions (TRFs) and stimulus reconstruction (SR) were utilized using ear-EEG data. Results showed that the TRFs of the attended speech were stronger than those of each unattended speech, and the decoding accuracy was 41.3% with a 60-second decoding window (chance level of 25%). To further investigate the impact of electrode placement and quantity, SR was utilized in both scalp-EEG and ear-EEG, revealing that while the number of electrodes had a minor effect, their positioning had a significant influence on the decoding accuracy. An auditory spatial attention detection (ASAD) method, STAnet, was also tested with this ear-EEG database, achieving 93.1% accuracy with a 1-second decoding window. The implementation code and database for our work are available on GitHub: https://github.com/zhl486/Ear_EEG_code.git and Zenodo: https://zenodo.org/records/10803261. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08710v1-abstract-full').style.display = 'none'; document.getElementById('2409.08710v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024.
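<p class="is-size-7">A minimal sketch of the stimulus-reconstruction (backward-model) idea mentioned in this abstract, assuming a ridge-regularized linear decoder over time-lagged EEG; the function names, lag count, and regularization strength are illustrative assumptions and not taken from the paper's code:</p> <pre><code>
import numpy as np

def lagged(eeg, max_lag):
    # Stack time-lagged copies of each EEG channel: (samples, channels * max_lag).
    n_samples, _ = eeg.shape
    cols = []
    for lag in range(max_lag):
        shifted = np.zeros_like(eeg)
        shifted[lag:] = eeg[:n_samples - lag]
        cols.append(shifted)
    return np.concatenate(cols, axis=1)

def fit_decoder(eeg, envelope, max_lag=32, alpha=100.0):
    # Ridge-regularized backward model mapping lagged EEG to the attended-speech envelope.
    X = lagged(eeg, max_lag)
    XtX = X.T @ X + alpha * np.eye(X.shape[1])
    return np.linalg.solve(XtX, X.T @ envelope)

def decode_attention(eeg, envelopes, weights, max_lag=32):
    # Reconstruct the envelope and pick the speaker whose envelope correlates best with it.
    recon = lagged(eeg, max_lag) @ weights
    corrs = [np.corrcoef(recon, env)[0, 1] for env in envelopes]
    return int(np.argmax(corrs)), corrs
</code></pre>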
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.08596">arXiv:2409.08596</a> <span> [<a href="https://arxiv.org/pdf/2409.08596">pdf</a>, <a href="https://arxiv.org/format/2409.08596">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Large Language Model Can Transcribe Speech in Multi-Talker Scenarios with Versatile Instructions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Meng%2C+L">Lingwei Meng</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+S">Shujie Hu</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+J">Jiawen Kang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Z">Zhaoqing Li</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yuejiao Wang</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+W">Wenxuan Wu</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+X">Xixin Wu</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+X">Xunying Liu</a>, <a href="/search/eess?searchtype=author&query=Meng%2C+H">Helen Meng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.08596v1-abstract-short" style="display: inline;"> Recent advancements in large language models (LLMs) have revolutionized various domains, bringing significant progress and new opportunities. Despite progress in speech-related tasks, LLMs have not been sufficiently explored in multi-talker scenarios. In this work, we present a pioneering effort to investigate the capability of LLMs in transcribing speech in multi-talker environments, following ve… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08596v1-abstract-full').style.display = 'inline'; document.getElementById('2409.08596v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.08596v1-abstract-full" style="display: none;"> Recent advancements in large language models (LLMs) have revolutionized various domains, bringing significant progress and new opportunities. Despite progress in speech-related tasks, LLMs have not been sufficiently explored in multi-talker scenarios. In this work, we present a pioneering effort to investigate the capability of LLMs in transcribing speech in multi-talker environments, following versatile instructions related to multi-talker automatic speech recognition (ASR), target talker ASR, and ASR based on specific talker attributes such as sex, occurrence order, language, and keyword spoken. Our approach utilizes WavLM and Whisper encoder to extract multi-faceted speech representations that are sensitive to speaker characteristics and semantic context. These representations are then fed into an LLM fine-tuned using LoRA, enabling the capabilities for speech comprehension and transcription. 
Comprehensive experiments reveal the promising performance of our proposed system, MT-LLM, in cocktail party scenarios, highlighting the potential of LLM to handle speech-related tasks based on user instructions in such complex settings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08596v1-abstract-full').style.display = 'none'; document.getElementById('2409.08596v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.06136">arXiv:2409.06136</a> <span> </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> DENSE: Dynamic Embedding Causal Target Speech Extraction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yiwen Wang</a>, <a href="/search/eess?searchtype=author&query=Yuan%2C+Z">Zeyu Yuan</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+X">Xihong Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.06136v2-abstract-short" style="display: inline;"> Target speech extraction (TSE) focuses on extracting the speech of a specific target speaker from a mixture of signals. Existing TSE models typically utilize static embeddings as conditions for extracting the target speaker's voice. However, the static embeddings often fail to capture the contextual information of the extracted speech signal, which may limit the model's performance. We propose a n… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.06136v2-abstract-full').style.display = 'inline'; document.getElementById('2409.06136v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.06136v2-abstract-full" style="display: none;"> Target speech extraction (TSE) focuses on extracting the speech of a specific target speaker from a mixture of signals. Existing TSE models typically utilize static embeddings as conditions for extracting the target speaker's voice. However, the static embeddings often fail to capture the contextual information of the extracted speech signal, which may limit the model's performance. We propose a novel dynamic embedding causal target speech extraction model to address this limitation. Our approach incorporates an autoregressive mechanism to generate context-dependent embeddings based on the extracted speech, enabling real-time, frame-level extraction. Experimental results demonstrate that the proposed model enhances short-time objective intelligibility (STOI) and signal-to-distortion ratio (SDR), offering a promising solution for target speech extraction in challenging scenarios. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.06136v2-abstract-full').style.display = 'none'; document.getElementById('2409.06136v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The experimental design and results contain errors, and I would like to withdraw the paper</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.04843">arXiv:2409.04843</a> <span> [<a href="https://arxiv.org/pdf/2409.04843">pdf</a>, <a href="https://arxiv.org/format/2409.04843">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Leveraging Moving Sound Source Trajectories for Universal Sound Separation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wu%2C+D">Donghang Wu</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+X">Xihong Wu</a>, <a href="/search/eess?searchtype=author&query=Qu%2C+T">Tianshu Qu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.04843v1-abstract-short" style="display: inline;"> Existing methods utilizing spatial information for sound source separation require prior knowledge of the direction of arrival (DOA) of the source or utilize estimated but imprecise localization results, which impairs the separation performance, especially when the sound sources are moving. In fact, sound source localization and separation are interconnected problems, that is, sound source localiz… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.04843v1-abstract-full').style.display = 'inline'; document.getElementById('2409.04843v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.04843v1-abstract-full" style="display: none;"> Existing methods utilizing spatial information for sound source separation require prior knowledge of the direction of arrival (DOA) of the source or utilize estimated but imprecise localization results, which impairs the separation performance, especially when the sound sources are moving. In fact, sound source localization and separation are interconnected problems, that is, sound source localization facilitates sound separation while sound separation contributes to more precise source localization. This paper proposes a method utilizing the mutual facilitation mechanism between sound source localization and separation for moving sources. Initially, sound separation is conducted using rough preliminary sound source tracking results. 
Sound source tracking is then performed on the separated signals, so the tracking results become more precise. The more precise trajectory can further enhance the separation performance. This mutual facilitation process can be performed over several iterations. Simulation experiments conducted under reverberation conditions and with moving sound sources demonstrate that the proposed method can achieve more accurate separation based on more precise tracking results. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.04843v1-abstract-full').style.display = 'none'; document.getElementById('2409.04843v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 7 figures, submitted to IEEE/ACM Transactions on Audio, Speech and Language Processing (TASLP)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.04803">arXiv:2409.04803</a> <span> [<a href="https://arxiv.org/pdf/2409.04803">pdf</a>, <a href="https://arxiv.org/format/2409.04803">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Cross-attention Inspired Selective State Space Models for Target Sound Extraction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wu%2C+D">Donghang Wu</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yiwen Wang</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+X">Xihong Wu</a>, <a href="/search/eess?searchtype=author&query=Qu%2C+T">Tianshu Qu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.04803v3-abstract-short" style="display: inline;"> The Transformer model, particularly its cross-attention module, is widely used for feature fusion in target sound extraction, which extracts the signal of interest based on given clues. Despite its effectiveness, this approach suffers from low computational efficiency. Recent advancements in state space models, notably the latest work Mamba, have shown comparable performance to Transformer-based me… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.04803v3-abstract-full').style.display = 'inline'; document.getElementById('2409.04803v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.04803v3-abstract-full" style="display: none;"> The Transformer model, particularly its cross-attention module, is widely used for feature fusion in target sound extraction, which extracts the signal of interest based on given clues. Despite its effectiveness, this approach suffers from low computational efficiency.
Recent advancements in state space models, notably the latest work Mamba, have shown comparable performance to Transformer-based methods while significantly reducing computational complexity in various tasks. However, Mamba's applicability in target sound extraction is limited due to its inability to capture dependencies between different sequences as the cross-attention does. In this paper, we propose CrossMamba for target sound extraction, which leverages the hidden attention mechanism of Mamba to compute dependencies between the given clues and the audio mixture. The calculation of Mamba can be divided into the query, key, and value. We utilize the clue to generate the query and the audio mixture to derive the key and value, adhering to the principle of the cross-attention mechanism in Transformers. Experimental results from two representative target sound extraction methods validate the efficacy of the proposed CrossMamba. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.04803v3-abstract-full').style.display = 'none'; document.getElementById('2409.04803v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 2 figures, submitted to ICASSP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.02336">arXiv:2409.02336</a> <span> [<a href="https://arxiv.org/pdf/2409.02336">pdf</a>, <a href="https://arxiv.org/format/2409.02336">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Comparative Analysis of Learning-Based Methods for Transient Stability Assessment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wu%2C+X">Xingjian Wu</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+X">Xiaoting Wang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+X">Xiaozhe Wang</a>, <a href="/search/eess?searchtype=author&query=Caines%2C+P+E">Peter E. Caines</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+J">Jingyu Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.02336v1-abstract-short" style="display: inline;"> Transient stability and critical clearing time (CCT) are important concepts in power system protection and control. This paper explores and compares various learning-based methods for predicting CCT under uncertainties arising from renewable generation, loads, and contingencies. Specifically, we introduce new definitions of transient stability (B-stability) and CCT from an engineering perspective. 
Fo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.02336v1-abstract-full').style.display = 'inline'; document.getElementById('2409.02336v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.02336v1-abstract-full" style="display: none;"> Transient stability and critical clearing time (CCT) are important concepts in power system protection and control. This paper explores and compares various learning-based methods for predicting CCT under uncertainties arising from renewable generation, loads, and contingencies. Specifically, we introduce new definitions of transient stability (B-stability) and CCT from an engineering perspective. For training the models, only the initial values of system variables and contingency cases are used as features, enabling the provision of protection information based on these initial values. To enhance efficiency, a hybrid feature selection strategy combining the maximal information coefficient (MIC) and Spearman's Correlation Coefficient (SCC) is employed to reduce the feature dimension. The performance of different learning-based models is evaluated on a WSCC 9-bus system. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.02336v1-abstract-full').style.display = 'none'; document.getElementById('2409.02336v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for presentation at the 56th North American Power Symposium (NAPS)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.00933">arXiv:2409.00933</a> <span> [<a href="https://arxiv.org/pdf/2409.00933">pdf</a>, <a href="https://arxiv.org/format/2409.00933">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> SoCodec: A Semantic-Ordered Multi-Stream Speech Codec for Efficient Language Model Based Text-to-Speech Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Guo%2C+H">Haohan Guo</a>, <a href="/search/eess?searchtype=author&query=Xie%2C+F">Fenglong Xie</a>, <a href="/search/eess?searchtype=author&query=Xie%2C+K">Kun Xie</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+D">Dongchao Yang</a>, <a href="/search/eess?searchtype=author&query=Guo%2C+D">Dake Guo</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+X">Xixin Wu</a>, <a href="/search/eess?searchtype=author&query=Meng%2C+H">Helen Meng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.00933v1-abstract-short" style="display: inline;"> The long speech sequence has been troubling language models (LM) based TTS approaches in terms of modeling complexity and efficiency. 
This work proposes SoCodec, a semantic-ordered multi-stream speech codec, to address this issue. It compresses speech into a shorter, multi-stream discrete semantic sequence with multiple tokens at each frame. Meanwhile, the ordered product quantization is proposed… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.00933v1-abstract-full').style.display = 'inline'; document.getElementById('2409.00933v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.00933v1-abstract-full" style="display: none;"> The long speech sequence has been troubling language models (LM) based TTS approaches in terms of modeling complexity and efficiency. This work proposes SoCodec, a semantic-ordered multi-stream speech codec, to address this issue. It compresses speech into a shorter, multi-stream discrete semantic sequence with multiple tokens at each frame. Meanwhile, the ordered product quantization is proposed to constrain this sequence into an ordered representation. It can be applied with a multi-stream delayed LM to achieve better autoregressive generation along both time and stream axes in TTS. The experimental result strongly demonstrates the effectiveness of the proposed approach, achieving superior performance over baseline systems even if compressing the frameshift of speech from 20ms to 240ms (12x). The ablation studies further validate the importance of learning the proposed ordered multi-stream semantic representation in pursuing shorter speech sequences for efficient LM-based TTS. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.00933v1-abstract-full').style.display = 'none'; document.getElementById('2409.00933v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
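<p class="is-size-7">A minimal, hypothetical sketch of one way a multi-stream delayed LM layout can be arranged, with stream s offset by s steps so that coarser streams of a frame are available when the finer ones are predicted; this is a generic delay-pattern illustration under stated assumptions, not SoCodec's actual formulation:</p> <pre><code>
import numpy as np

PAD = -1  # placeholder for positions that are not yet generated / to be trimmed

def apply_delay(tokens):
    # tokens: (n_streams, n_frames) discrete codes for one utterance.
    # Shift stream s right by s steps, giving shape (n_streams, n_frames + n_streams - 1),
    # so the LM decodes one token per stream per step along the time axis.
    n_streams, n_frames = tokens.shape
    out = np.full((n_streams, n_frames + n_streams - 1), PAD, dtype=tokens.dtype)
    for s in range(n_streams):
        out[s, s:s + n_frames] = tokens[s]
    return out

def remove_delay(delayed, n_frames):
    # Invert apply_delay after autoregressive generation.
    n_streams = delayed.shape[0]
    return np.stack([delayed[s, s:s + n_frames] for s in range(n_streams)])
</code></pre>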
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by SLT 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.00364">arXiv:2409.00364</a> <span> [<a href="https://arxiv.org/pdf/2409.00364">pdf</a>, <a href="https://arxiv.org/format/2409.00364">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Resource Management for IRS-Assisted Full-Duplex Integrated Sensing, Communication and Computing Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Hao%2C+W">Wanming Hao</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+X">Xue Wu</a>, <a href="/search/eess?searchtype=author&query=Li%2C+X">Xingwang Li</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+G">Gangcan Sun</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+Q">Qingqing Wu</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+L">Liang Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.00364v1-abstract-short" style="display: inline;"> In this paper, we investigate an intelligent reflecting surface (IRS) assisted full-duplex (FD) integrated sensing, communication and computing system. Specifically, an FD base station (BS) provides service for uplink and downlink transmission, and a local cache is connected to the BS through a backhaul link to store data. Meanwhile, active sensing elements are deployed on the IRS to receive targe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.00364v1-abstract-full').style.display = 'inline'; document.getElementById('2409.00364v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.00364v1-abstract-full" style="display: none;"> In this paper, we investigate an intelligent reflecting surface (IRS) assisted full-duplex (FD) integrated sensing, communication and computing system. Specifically, an FD base station (BS) provides service for uplink and downlink transmission, and a local cache is connected to the BS through a backhaul link to store data. Meanwhile, active sensing elements are deployed on the IRS to receive target echo signals. On this basis, in order to evaluate the overall performance of the system under consideration, we propose a system utility maximization problem while ensuring the sensing quality, expressed as the difference between the sum of communication throughput, total computation bits (offloading bits and local computation bits) and the total backhaul cost for content delivery. This makes the problem difficult to solve due to the highly non-convex coupling of the optimization variables. To effectively solve this problem, we first design the most effective caching strategy. 
Then, we develop an algorithm based on weighted minimum mean square error, alternative direction method of multipliers, majorization-minimization framework, semi-definite relaxation techniques, and several complex transformations to jointly solve the optimization variables. Finally, simulation results are provided to verify the utility performance of the proposed algorithm and demonstrate the advantages of the proposed scheme compared with the baseline scheme. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.00364v1-abstract-full').style.display = 'none'; document.getElementById('2409.00364v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.13893">arXiv:2408.13893</a> <span> [<a href="https://arxiv.org/pdf/2408.13893">pdf</a>, <a href="https://arxiv.org/format/2408.13893">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> SimpleSpeech 2: Towards Simple and Efficient Text-to-Speech with Flow-based Scalar Latent Transformer Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Yang%2C+D">Dongchao Yang</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+R">Rongjie Huang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yuanyuan Wang</a>, <a href="/search/eess?searchtype=author&query=Guo%2C+H">Haohan Guo</a>, <a href="/search/eess?searchtype=author&query=Chong%2C+D">Dading Chong</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+S">Songxiang Liu</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+X">Xixin Wu</a>, <a href="/search/eess?searchtype=author&query=Meng%2C+H">Helen Meng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.13893v2-abstract-short" style="display: inline;"> Scaling Text-to-speech (TTS) to large-scale datasets has been demonstrated as an effective method for improving the diversity and naturalness of synthesized speech. At the high level, previous large-scale TTS models can be categorized into either Auto-regressive (AR) based (\textit{e.g.}, VALL-E) or Non-auto-regressive (NAR) based models (\textit{e.g.}, NaturalSpeech 2/3). Although these works dem… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13893v2-abstract-full').style.display = 'inline'; document.getElementById('2408.13893v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.13893v2-abstract-full" style="display: none;"> Scaling Text-to-speech (TTS) to large-scale datasets has been demonstrated as an effective method for improving the diversity and naturalness of synthesized speech. 
At the high level, previous large-scale TTS models can be categorized into either Auto-regressive (AR) based (\textit{e.g.}, VALL-E) or Non-auto-regressive (NAR) based models (\textit{e.g.}, NaturalSpeech 2/3). Although these works demonstrate good performance, they still have potential weaknesses. For instance, AR-based models are plagued by unstable generation quality and slow generation speed; meanwhile, some NAR-based models need phoneme-level duration alignment information, thereby increasing the complexity of data pre-processing, model design, and loss design. In this work, we build upon our previous publication by implementing a simple and efficient non-autoregressive (NAR) TTS framework, termed SimpleSpeech 2. SimpleSpeech 2 effectively combines the strengths of both autoregressive (AR) and non-autoregressive (NAR) methods, offering the following key advantages: (1) simplified data preparation; (2) straightforward model and loss design; and (3) stable, high-quality generation performance with fast inference speed. Compared to our previous publication, we present ({\romannumeral1}) a detailed analysis of the influence of speech tokenizer and noisy label for TTS performance; ({\romannumeral2}) four distinct types of sentence duration predictors; ({\romannumeral3}) a novel flow-based scalar latent transformer diffusion model. With these improvement, we show a significant improvement in generation performance and generation speed compared to our previous work and other state-of-the-art (SOTA) large-scale TTS models. Furthermore, we show that SimpleSpeech 2 can be seamlessly extended to multilingual TTS by training it on multilingual speech datasets. Demos are available on: {https://dongchaoyang.top/SimpleSpeech2\_demo/}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13893v2-abstract-full').style.display = 'none'; document.getElementById('2408.13893v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submit to TASLP</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.07770">arXiv:2408.07770</a> <span> [<a href="https://arxiv.org/pdf/2408.07770">pdf</a>, <a href="https://arxiv.org/ps/2408.07770">ps</a>, <a href="https://arxiv.org/format/2408.07770">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> User-Centric Machine Learning for Resource Allocation in MPTCP-Enabled Hybrid LiFi and WiFi Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Ji%2C+H">Han Ji</a>, <a href="/search/eess?searchtype=author&query=Delaney%2C+D+T">Declan T. 
Delaney</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+X">Xiping Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.07770v1-abstract-short" style="display: inline;"> As an emerging paradigm of heterogeneous networks (HetNets) towards 6G, the hybrid light fidelity (LiFi) and wireless fidelity (WiFi) networks (HLWNets) have potential to explore the complementary advantages of the optical and radio spectra. Like other cooperation-native HetNets, HLWNets face a crucial load balancing (LB) problem due to the heterogeneity of access points (APs). The existing litera… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.07770v1-abstract-full').style.display = 'inline'; document.getElementById('2408.07770v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.07770v1-abstract-full" style="display: none;"> As an emerging paradigm of heterogeneous networks (HetNets) towards 6G, the hybrid light fidelity (LiFi) and wireless fidelity (WiFi) networks (HLWNets) have potential to explore the complementary advantages of the optical and radio spectra. Like other cooperation-native HetNets, HLWNets face a crucial load balancing (LB) problem due to the heterogeneity of access points (APs). The existing literature mostly formulates this problem as joint AP selection and resource allocation (RA), presuming that each user equipment (UE) is served by one AP at a time, under the constraint of the traditional transmission control protocol (TCP). In contrast, multipath TCP (MPTCP), which allows for the simultaneous use of multiple APs, can significantly boost the UE's throughput as well as enhancing its network resilience. However, the existing TCP-based LB methods, particularly those aided by machine learning, are not suitable for the MPTCP scenario. In this paper, we discuss the challenges when developing learning-aided LB in MPTCP-enabled HLWNets, and propose a novel user-centric learning model to tackle this tricky problem. Unlike the conventional network-centric learning methods, the proposed method determines the LB solution for a single target UE, rendering low complexity and high flexibility in practical implementations. Results show that the proposed user-centric approach can greatly outperform the network-centric learning method. Against the TCP-based LB method such as game theory, the proposed method can increase the throughput of HLWNets by up to 40\%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.07770v1-abstract-full').style.display = 'none'; document.getElementById('2408.07770v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.06185">arXiv:2408.06185</a> <span> [<a href="https://arxiv.org/pdf/2408.06185">pdf</a>, <a href="https://arxiv.org/format/2408.06185">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> Hi-SAM: A high-scalable authentication model for satellite-ground Zero-Trust system using mean field game </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wu%2C+X">Xuesong Wu</a>, <a href="/search/eess?searchtype=author&query=Zheng%2C+T">Tianshuai Zheng</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+R">Runfang Wu</a>, <a href="/search/eess?searchtype=author&query=Ren%2C+J">Jie Ren</a>, <a href="/search/eess?searchtype=author&query=Guo%2C+J">Junyan Guo</a>, <a href="/search/eess?searchtype=author&query=Du%2C+Y">Ye Du</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.06185v1-abstract-short" style="display: inline;"> As more and more Internet of Things (IoT) devices are connected to satellite networks, the Zero-Trust Architecture brings dynamic security to the satellite-ground system, while frequent authentication creates challenges for system availability. To enable the system to accommodate more IoT devices, this paper proposes a high-scalable authentication model (Hi-SAM). Hi-SAM introduces the Proof-of-Work id… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.06185v1-abstract-full').style.display = 'inline'; document.getElementById('2408.06185v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.06185v1-abstract-full" style="display: none;"> As more and more Internet of Things (IoT) devices are connected to satellite networks, the Zero-Trust Architecture brings dynamic security to the satellite-ground system, while frequent authentication creates challenges for system availability. To enable the system to accommodate more IoT devices, this paper proposes a high-scalable authentication model (Hi-SAM). Hi-SAM introduces the Proof-of-Work idea to authentication, which allows devices to obtain network resources based on frequency. To optimize the frequency, mean field game is used for competition among devices, which can reduce the decision space of large-scale population games. In addition, a dynamic time-range message authentication code is designed for security. In tests at large population scales, Hi-SAM is superior in optimizing the authentication workload and in anomaly detection efficiency. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.06185v1-abstract-full').style.display = 'none'; document.getElementById('2408.06185v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.04300">arXiv:2408.04300</a> <span> [<a href="https://arxiv.org/pdf/2408.04300">pdf</a>, <a href="https://arxiv.org/format/2408.04300">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> An Explainable Non-local Network for COVID-19 Diagnosis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Yang%2C+J">Jingfu Yang</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+P">Peng Huang</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+J">Jing Hu</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+S">Shu Hu</a>, <a href="/search/eess?searchtype=author&query=Lyu%2C+S">Siwei Lyu</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+X">Xin Wang</a>, <a href="/search/eess?searchtype=author&query=Guo%2C+J">Jun Guo</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+X">Xi Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.04300v1-abstract-short" style="display: inline;"> The CNN has achieved excellent results in the automatic classification of medical images. In this study, we propose a novel deep residual 3D attention non-local network (NL-RAN) to classify CT images included COVID-19, common pneumonia, and normal to perform rapid and explainable COVID-19 diagnosis. We built a deep residual 3D attention non-local network that could achieve end-to-end training. The… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.04300v1-abstract-full').style.display = 'inline'; document.getElementById('2408.04300v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.04300v1-abstract-full" style="display: none;"> The CNN has achieved excellent results in the automatic classification of medical images. In this study, we propose a novel deep residual 3D attention non-local network (NL-RAN) to classify CT images included COVID-19, common pneumonia, and normal to perform rapid and explainable COVID-19 diagnosis. We built a deep residual 3D attention non-local network that could achieve end-to-end training. The network is embedded with a nonlocal module to capture global information, while a 3D attention module is embedded to focus on the details of the lesion so that it can directly analyze the 3D lung CT and output the classification results. The output of the attention module can be used as a heat map to increase the interpretability of the model. 4079 3D CT scans were included in this study. 
Each scan had a unique label (novel coronavirus pneumonia, common pneumonia, or normal). The cohort of CT scans was randomly split into a training set of 3263 scans, a validation set of 408 scans, and a testing set of 408 scans. We compared NL-RAN with existing mainstream classification methods, such as CovNet, CBAM, and ResNet, and compared its visualization results with visualization methods such as CAM. Model performance was evaluated using the Area Under the ROC Curve (AUC), precision, and F1-score. NL-RAN achieved an AUC of 0.9903, a precision of 0.9473, and an F1-score of 0.9462, surpassing all compared classification methods. The heat map output by the attention module is also clearer than the heat map output by CAM. Our experimental results indicate that our proposed method performs significantly better than existing methods. In addition, the first attention module outputs a heat map containing detailed outline information to increase the interpretability of the model. Our experiments indicate that the inference of our model is fast. It can provide real-time assistance with diagnosis. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.04300v1-abstract-full').style.display = 'none'; document.getElementById('2408.04300v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.02966">arXiv:2408.02966</a> <span> [<a href="https://arxiv.org/pdf/2408.02966">pdf</a>, <a href="https://arxiv.org/format/2408.02966">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Fast Point Cloud Geometry Compression with Context-based Residual Coding and INR-based Refinement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Xu%2C+H">Hao Xu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+X">Xi Zhang</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+X">Xiaolin Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.02966v1-abstract-short" style="display: inline;"> Compressing a set of unordered points is far more challenging than compressing images/videos of regular sample grids, because of the difficulties in characterizing neighboring relations in an irregular layout of points. Many researchers resort to voxelization to introduce regularity, but this approach suffers from quantization loss. 
In this research, we use the KNN method to determine the neighbor… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.02966v1-abstract-full').style.display = 'inline'; document.getElementById('2408.02966v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.02966v1-abstract-full" style="display: none;"> Compressing a set of unordered points is far more challenging than compressing images/videos of regular sample grids, because of the difficulties in characterizing neighboring relations in an irregular layout of points. Many researchers resort to voxelization to introduce regularity, but this approach suffers from quantization loss. In this research, we use the KNN method to determine the neighborhoods of raw surface points. This gives us a means to determine the spatial context in which the latent features of 3D points are compressed by arithmetic coding. As such, the conditional probability model is adaptive to local geometry, leading to significant rate reduction. Additionally, we propose a dual-layer architecture where a non-learning base layer reconstructs the main structures of the point cloud at low complexity, while a learned refinement layer focuses on preserving fine details. This design leads to reductions in model complexity and coding latency by two orders of magnitude compared to SOTA methods. Moreover, we incorporate an implicit neural representation (INR) into the refinement layer, allowing the decoder to sample points on the underlying surface at arbitrary densities. This work is the first to effectively exploit content-aware local contexts for compressing irregular raw point clouds, achieving high rate-distortion performance, low complexity, and the ability to function as an arbitrary-scale upsampling network simultaneously. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.02966v1-abstract-full').style.display = 'none'; document.getElementById('2408.02966v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. 
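<p class="is-size-7">A minimal sketch of the KNN-neighborhood step described above, assuming a k-d tree over the raw points; the function name and the choice of k are illustrative assumptions, and this is not the authors' implementation:</p> <pre><code>
import numpy as np
from scipy.spatial import cKDTree

def knn_neighborhoods(points, k=16):
    # points: (N, 3) raw surface points.
    # Returns neighbor indices (N, k) and neighbor offsets relative to each query
    # point (N, k, 3), i.e. the local spatial context a conditional entropy model
    # of the latent point features could be conditioned on.
    tree = cKDTree(points)
    _, idx = tree.query(points, k=k + 1)  # k+1 because each point's nearest neighbor is itself
    idx = idx[:, 1:]
    offsets = points[idx] - points[:, None, :]
    return idx, offsets
</code></pre>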
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ECCV 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.20878">arXiv:2407.20878</a> <span> [<a href="https://arxiv.org/pdf/2407.20878">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> S3PET: Semi-supervised Standard-dose PET Image Reconstruction via Dose-aware Token Swap </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Cui%2C+J">Jiaqi Cui</a>, <a href="/search/eess?searchtype=author&query=Zeng%2C+P">Pinxian Zeng</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+Y">Yuanyuan Xu</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+X">Xi Wu</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+J">Jiliu Zhou</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yan Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.20878v1-abstract-short" style="display: inline;"> To acquire high-quality positron emission tomography (PET) images while reducing the radiation tracer dose, numerous efforts have been devoted to reconstructing standard-dose PET (SPET) images from low-dose PET (LPET). However, the success of current fully-supervised approaches relies on abundant paired LPET and SPET images, which are often unavailable in clinic. Moreover, these methods often mix… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.20878v1-abstract-full').style.display = 'inline'; document.getElementById('2407.20878v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.20878v1-abstract-full" style="display: none;"> To acquire high-quality positron emission tomography (PET) images while reducing the radiation tracer dose, numerous efforts have been devoted to reconstructing standard-dose PET (SPET) images from low-dose PET (LPET). However, the success of current fully-supervised approaches relies on abundant paired LPET and SPET images, which are often unavailable in clinic. Moreover, these methods often mix the dose-invariant content with dose level-related dose-specific details during reconstruction, resulting in distorted images. To alleviate these problems, in this paper, we propose a two-stage Semi-Supervised SPET reconstruction framework, namely S3PET, to accommodate the training of abundant unpaired and limited paired SPET and LPET images. Our S3PET involves an un-supervised pre-training stage (Stage I) to extract representations from unpaired images, and a supervised dose-aware reconstruction stage (Stage II) to achieve LPET-to-SPET reconstruction by transferring the dose-specific knowledge between paired images. Specifically, in stage I, two independent dose-specific masked autoencoders (DsMAEs) are adopted to comprehensively understand the unpaired SPET and LPET images. 
Then, in Stage II, the pre-trained DsMAEs are further finetuned using paired images. To prevent distortions in both content and details, we introduce two elaborate modules, i.e., a dose knowledge decouple module to disentangle the respective dose-specific and dose-invariant knowledge of LPET and SPET, and a dose-specific knowledge learning module to transfer the dose-specific information from SPET to LPET, thereby achieving high-quality SPET reconstruction from LPET images. Experiments on two datasets demonstrate that our S3PET achieves state-of-the-art performance quantitatively and qualitatively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.20878v1-abstract-full').style.display = 'none'; document.getElementById('2407.20878v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.13509">arXiv:2407.13509</a> <span> [<a href="https://arxiv.org/pdf/2407.13509">pdf</a>, <a href="https://arxiv.org/format/2407.13509">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Spontaneous Style Text-to-Speech Synthesis with Controllable Spontaneous Behaviors Based on Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+W">Weiqin Li</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+P">Peiji Yang</a>, <a href="/search/eess?searchtype=author&query=Zhong%2C+Y">Yicheng Zhong</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+Y">Yixuan Zhou</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Z">Zhisheng Wang</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+Z">Zhiyong Wu</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+X">Xixin Wu</a>, <a href="/search/eess?searchtype=author&query=Meng%2C+H">Helen Meng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.13509v1-abstract-short" style="display: inline;"> Spontaneous style speech synthesis, which aims to generate human-like speech, often encounters challenges due to the scarcity of high-quality data and limitations in model capabilities. Recent language model-based TTS systems can be trained on large, diverse, and low-quality speech datasets, resulting in highly natural synthesized speech. 
However, they are limited by the difficulty of simulating v… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.13509v1-abstract-full').style.display = 'inline'; document.getElementById('2407.13509v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.13509v1-abstract-full" style="display: none;"> Spontaneous style speech synthesis, which aims to generate human-like speech, often encounters challenges due to the scarcity of high-quality data and limitations in model capabilities. Recent language model-based TTS systems can be trained on large, diverse, and low-quality speech datasets, resulting in highly natural synthesized speech. However, they are limited by the difficulty of simulating various spontaneous behaviors and capturing prosody variations in spontaneous speech. In this paper, we propose a novel spontaneous speech synthesis system based on language models. We systematically categorize and uniformly model diverse spontaneous behaviors. Moreover, fine-grained prosody modeling is introduced to enhance the model's ability to capture subtle prosody variations in spontaneous speech. Experimental results show that our proposed method significantly outperforms the baseline methods in terms of prosody naturalness and spontaneous behavior naturalness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.13509v1-abstract-full').style.display = 'none'; document.getElementById('2407.13509v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
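<p class="is-size-7">One way to picture the idea of systematically categorizing and uniformly modeling spontaneous behaviors is as control tokens interleaved with the text fed to an LM-based TTS front end. The behaviour inventory and token format below are purely hypothetical illustrations, not the paper's scheme.</p>
<pre><code class="language-python">
# Hypothetical illustration: spontaneous behaviours modelled as explicit
# control tokens interleaved with the input text of an LM-based TTS system.
BEHAVIOUR_TOKENS = {
    "filled_pause": "[FP]",   # e.g. "uh", "um"
    "repetition": "[REP]",
    "prolongation": "[PRL]",
    "laughter": "[LAU]",
}

def annotate(words, behaviours):
    """Insert behaviour tokens before the word positions they attach to.

    words:      list of words, e.g. ["i", "think", "so"]
    behaviours: list of (position, behaviour_name) pairs
    """
    marks = {pos: BEHAVIOUR_TOKENS[name] for pos, name in behaviours}
    out = []
    for i, w in enumerate(words):
        if i in marks:
            out.append(marks[i])
        out.append(w)
    return " ".join(out)

print(annotate(["i", "think", "so"], [(1, "filled_pause")]))
# prints: i [FP] think so
</code></pre>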
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by INTERSPEECH 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.09817">arXiv:2407.09817</a> <span> [<a href="https://arxiv.org/pdf/2407.09817">pdf</a>, <a href="https://arxiv.org/format/2407.09817">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Empowering Whisper as a Joint Multi-Talker and Target-Talker Speech Recognition System </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Meng%2C+L">Lingwei Meng</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+J">Jiawen Kang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yuejiao Wang</a>, <a href="/search/eess?searchtype=author&query=Jin%2C+Z">Zengrui Jin</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+X">Xixin Wu</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+X">Xunying Liu</a>, <a href="/search/eess?searchtype=author&query=Meng%2C+H">Helen Meng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.09817v2-abstract-short" style="display: inline;"> Multi-talker speech recognition and target-talker speech recognition, both involve transcription in multi-talker contexts, remain significant challenges. However, existing methods rarely attempt to simultaneously address both tasks. In this study, we propose a pioneering approach to empower Whisper, which is a speech foundation model, to tackle joint multi-talker and target-talker speech recogniti… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.09817v2-abstract-full').style.display = 'inline'; document.getElementById('2407.09817v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.09817v2-abstract-full" style="display: none;"> Multi-talker speech recognition and target-talker speech recognition, both involve transcription in multi-talker contexts, remain significant challenges. However, existing methods rarely attempt to simultaneously address both tasks. In this study, we propose a pioneering approach to empower Whisper, which is a speech foundation model, to tackle joint multi-talker and target-talker speech recognition tasks. Specifically, (i) we freeze Whisper and plug a Sidecar separator into its encoder to separate mixed embedding for multiple talkers; (ii) a Target Talker Identifier is introduced to identify the embedding flow of the target talker on the fly, requiring only three-second enrollment speech as a cue; (iii) soft prompt tuning for decoder is explored for better task adaptation. Our method outperforms previous methods on two- and three-talker LibriMix and LibriSpeechMix datasets for both tasks, and delivers acceptable zero-shot performance on multi-talker ASR on AishellMix Mandarin dataset. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.09817v2-abstract-full').style.display = 'none'; document.getElementById('2407.09817v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to INTERSPEECH 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.08551">arXiv:2407.08551</a> <span> [<a href="https://arxiv.org/pdf/2407.08551">pdf</a>, <a href="https://arxiv.org/format/2407.08551">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Autoregressive Speech Synthesis without Vector Quantization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Meng%2C+L">Lingwei Meng</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+L">Long Zhou</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+S">Shujie Liu</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+S">Sanyuan Chen</a>, <a href="/search/eess?searchtype=author&query=Han%2C+B">Bing Han</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+S">Shujie Hu</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+Y">Yanqing Liu</a>, <a href="/search/eess?searchtype=author&query=Li%2C+J">Jinyu Li</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+S">Sheng Zhao</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+X">Xixin Wu</a>, <a href="/search/eess?searchtype=author&query=Meng%2C+H">Helen Meng</a>, <a href="/search/eess?searchtype=author&query=Wei%2C+F">Furu Wei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.08551v1-abstract-short" style="display: inline;"> We present MELLE, a novel continuous-valued tokens based language modeling approach for text to speech synthesis (TTS). MELLE autoregressively generates continuous mel-spectrogram frames directly from text condition, bypassing the need for vector quantization, which are originally designed for audio compression and sacrifice fidelity compared to mel-spectrograms. Specifically, (i) instead of cross… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.08551v1-abstract-full').style.display = 'inline'; document.getElementById('2407.08551v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.08551v1-abstract-full" style="display: none;"> We present MELLE, a novel continuous-valued tokens based language modeling approach for text to speech synthesis (TTS). 
MELLE autoregressively generates continuous mel-spectrogram frames directly from text condition, bypassing the need for vector quantization, which are originally designed for audio compression and sacrifice fidelity compared to mel-spectrograms. Specifically, (i) instead of cross-entropy loss, we apply regression loss with a proposed spectrogram flux loss function to model the probability distribution of the continuous-valued tokens. (ii) we have incorporated variational inference into MELLE to facilitate sampling mechanisms, thereby enhancing the output diversity and model robustness. Experiments demonstrate that, compared to the two-stage codec language models VALL-E and its variants, the single-stage MELLE mitigates robustness issues by avoiding the inherent flaws of sampling discrete codes, achieves superior performance across multiple metrics, and, most importantly, offers a more streamlined paradigm. See https://aka.ms/melle for demos of our work. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.08551v1-abstract-full').style.display = 'none'; document.getElementById('2407.08551v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.04726">arXiv:2407.04726</a> <span> [<a href="https://arxiv.org/pdf/2407.04726">pdf</a>, <a href="https://arxiv.org/format/2407.04726">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Data-Driven Prediction and Uncertainty Quantification of PWR Crud-Induced Power Shift Using Convolutional Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Furlong%2C+A">Aidan Furlong</a>, <a href="/search/eess?searchtype=author&query=Alsafadi%2C+F">Farah Alsafadi</a>, <a href="/search/eess?searchtype=author&query=Palmtag%2C+S">Scott Palmtag</a>, <a href="/search/eess?searchtype=author&query=Godfrey%2C+A">Andrew Godfrey</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+X">Xu Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.04726v1-abstract-short" style="display: inline;"> The development of Crud-Induced Power Shift (CIPS) is an operational challenge in Pressurized Water Reactors that is due to the development of crud on the fuel rod cladding. The available predictive tools developed previously, usually based on fundamental physics, are computationally expensive and have shown differing degrees of accuracy. 
This work proposes a completely top-down approach to predic… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.04726v1-abstract-full').style.display = 'inline'; document.getElementById('2407.04726v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.04726v1-abstract-full" style="display: none;"> The development of Crud-Induced Power Shift (CIPS) is an operational challenge in Pressurized Water Reactors that is due to the development of crud on the fuel rod cladding. The available predictive tools developed previously, usually based on fundamental physics, are computationally expensive and have shown differing degrees of accuracy. This work proposes a completely top-down approach to predict CIPS instances on an assembly level with reactor-specific calibration built-in. Built using artificial neural networks, this work uses a three-dimensional convolutional approach to leverage the image-like layout of the input data. As a classifier, the convolutional neural network model predicts whether a given assembly will experience CIPS as well as the time of occurrence during a given cycle. This surrogate model is both trained and tested using a combination of calculated core model parameters and measured plant data from Unit 1 of the Catawba Nuclear Station. After the evaluation of its performance using various metrics, Monte Carlo dropout is employed for extensive uncertainty quantification of the model predictions. The results indicate that this methodology could be a viable approach in predicting CIPS with an assembly-level resolution across both clean and afflicted cycles, while using limited computational resources. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.04726v1-abstract-full').style.display = 'none'; document.getElementById('2407.04726v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
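<p class="is-size-7">Monte Carlo dropout, which the abstract employs for uncertainty quantification, can be sketched as below. The tiny 3D network is a placeholder rather than the paper's model, and the number of stochastic forward passes is arbitrary.</p>
<pre><code class="language-python">
import torch
import torch.nn as nn

# Toy stand-in network with dropout; not the paper's 3D CNN.
model = nn.Sequential(
    nn.Conv3d(1, 8, kernel_size=3, padding=1), nn.ReLU(), nn.Dropout3d(0.2),
    nn.AdaptiveAvgPool3d(1), nn.Flatten(), nn.Linear(8, 2),
)

def mc_dropout_predict(model, x, passes=50):
    model.train()          # keep dropout active at inference time
    with torch.no_grad():
        probs = torch.stack(
            [torch.softmax(model(x), dim=-1) for _ in range(passes)]
        )                  # (passes, batch, classes)
    return probs.mean(0), probs.std(0)   # predictive mean and spread

x = torch.randn(4, 1, 8, 8, 8)           # toy assembly-level input volume
mean, std = mc_dropout_predict(model, x)
print(mean.shape, std.shape)             # torch.Size([4, 2]) twice
</code></pre>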
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.02318">arXiv:2407.02318</a> <span> [<a href="https://arxiv.org/pdf/2407.02318">pdf</a>, <a href="https://arxiv.org/format/2407.02318">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> The Solution for Temporal Sound Localisation Task of ICCV 1st Perception Test Challenge 2023 </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Huang%2C+Y">Yurui Huang</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+Y">Yang Yang</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+S">Shou Chen</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+X">Xiangyu Wu</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Q">Qingguo Chen</a>, <a href="/search/eess?searchtype=author&query=Lu%2C+J">Jianfeng Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.02318v1-abstract-short" style="display: inline;"> In this paper, we propose a solution for improving the quality of temporal sound localization. We employ a multimodal fusion approach to combine visual and audio features. High-quality visual features are extracted using a state-of-the-art self-supervised pre-training network, resulting in efficient video feature representations. At the same time, audio features serve as complementary information… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.02318v1-abstract-full').style.display = 'inline'; document.getElementById('2407.02318v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.02318v1-abstract-full" style="display: none;"> In this paper, we propose a solution for improving the quality of temporal sound localization. We employ a multimodal fusion approach to combine visual and audio features. High-quality visual features are extracted using a state-of-the-art self-supervised pre-training network, resulting in efficient video feature representations. At the same time, audio features serve as complementary information to help the model better localize the start and end of sounds. The fused features are trained in a multi-scale Transformer for training. In the final test dataset, we achieved a mean average precision (mAP) of 0.33, obtaining the second-best performance in this track. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.02318v1-abstract-full').style.display = 'none'; document.getElementById('2407.02318v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.01700">arXiv:2407.01700</a> <span> [<a href="https://arxiv.org/pdf/2407.01700">pdf</a>, <a href="https://arxiv.org/format/2407.01700">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Joint Design of Conventional Public Transport Network and Mobility on Demand </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wu%2C+X">Xiaoyi Wu</a>, <a href="/search/eess?searchtype=author&query=Mouhrim%2C+N">Nisrine Mouhrim</a>, <a href="/search/eess?searchtype=author&query=Araldo%2C+A">Andrea Araldo</a>, <a href="/search/eess?searchtype=author&query=Molenbruch%2C+Y">Yves Molenbruch</a>, <a href="/search/eess?searchtype=author&query=Feillet%2C+D">Dominique Feillet</a>, <a href="/search/eess?searchtype=author&query=Braekers%2C+K">Kris Braekers</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.01700v1-abstract-short" style="display: inline;"> Conventional Public Transport (PT) is based on fixed lines, running with routes and schedules determined a-priori. In low-demand areas, conventional PT is inefficient. Therein, Mobility on Demand (MoD) could serve users more efficiently and with an improved quality of service (QoS). The idea of integrating MoD into PT is therefore abundantly discussed by researchers and practitioners, mainly in th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.01700v1-abstract-full').style.display = 'inline'; document.getElementById('2407.01700v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.01700v1-abstract-full" style="display: none;"> Conventional Public Transport (PT) is based on fixed lines, running with routes and schedules determined a-priori. In low-demand areas, conventional PT is inefficient. Therein, Mobility on Demand (MoD) could serve users more efficiently and with an improved quality of service (QoS). The idea of integrating MoD into PT is therefore abundantly discussed by researchers and practitioners, mainly in the form of adding MoD on top of PT. Efficiency can be instead gained if also conventional PT lines are redesigned after integrating MoD in the first or last mile. In this paper we focus on this re-design problem. We devise a bilevel optimization problem where, given a certain initial design, the upper level determines stop selection and frequency settings, while the lower level routes a fleet of MoD vehicles. We propose a solution method based on Particle Swarm Optimization (PSO) for the upper level, while we adopt Large Neighborhood Search (LNS) in the lower level. Our solution method is computationally efficient and we test it in simulations with up to 10k travel requests. Results show important operational cost savings obtained via appropriately reducing the conventional PT coverage after integrating MoD, while preserving QoS. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.01700v1-abstract-full').style.display = 'none'; document.getElementById('2407.01700v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">26th Euro Working Group on Transportation Meeting</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.17338">arXiv:2406.17338</a> <span> [<a href="https://arxiv.org/pdf/2406.17338">pdf</a>, <a href="https://arxiv.org/format/2406.17338">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Robustly Optimized Deep Feature Decoupling Network for Fatty Liver Diseases Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Huang%2C+P">Peng Huang</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+S">Shu Hu</a>, <a href="/search/eess?searchtype=author&query=Peng%2C+B">Bo Peng</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+J">Jiashu Zhang</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+X">Xi Wu</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+X">Xin Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.17338v1-abstract-short" style="display: inline;"> Current medical image classification efforts mainly aim for higher average performance, often neglecting the balance between different classes. This can lead to significant differences in recognition accuracy between classes and obvious recognition weaknesses. Without the support of massive data, deep learning faces challenges in fine-grained classification of fatty liver. In this paper, we propos… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.17338v1-abstract-full').style.display = 'inline'; document.getElementById('2406.17338v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.17338v1-abstract-full" style="display: none;"> Current medical image classification efforts mainly aim for higher average performance, often neglecting the balance between different classes. This can lead to significant differences in recognition accuracy between classes and obvious recognition weaknesses. Without the support of massive data, deep learning faces challenges in fine-grained classification of fatty liver. In this paper, we propose an innovative deep learning framework that combines feature decoupling and adaptive adversarial training. 
Firstly, we employ two iteratively compressed decouplers to decouple, in a supervised manner, common features and specific features related to fatty liver in abdominal ultrasound images. Subsequently, the decoupled features are concatenated with the original image after transforming the color space and are fed into the classifier. During adversarial training, we adaptively adjust the perturbation and balance the adversarial strength by the accuracy of each class. The model will eliminate recognition weaknesses by correctly classifying adversarial samples, thus improving recognition robustness. Finally, the accuracy of our method improved by 4.16%, achieving 82.95%. As demonstrated by extensive experiments, our method is a generalized learning framework that can be directly used to eliminate the recognition weaknesses of any classifier while improving its average performance. Code is available at https://github.com/HP-ML/MICCAI2024. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.17338v1-abstract-full').style.display = 'none'; document.getElementById('2406.17338v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">MICCAI 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.16200">arXiv:2406.16200</a> <span> [<a href="https://arxiv.org/pdf/2406.16200">pdf</a>, <a href="https://arxiv.org/format/2406.16200">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Towards unlocking the mystery of adversarial fragility of neural networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Gao%2C+J">Jingchao Gao</a>, <a href="/search/eess?searchtype=author&query=Mudumbai%2C+R">Raghu Mudumbai</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+X">Xiaodong Wu</a>, <a href="/search/eess?searchtype=author&query=Yi%2C+J">Jirong Yi</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+C">Catherine Xu</a>, <a href="/search/eess?searchtype=author&query=Xie%2C+H">Hui Xie</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+W">Weiyu Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.16200v1-abstract-short" style="display: inline;"> In this paper, we study the adversarial robustness of deep neural networks for classification tasks. We look at the smallest magnitude of possible additive perturbations that can change the output of a classification algorithm. 
We provide a matrix-theoretic explanation of the adversarial fragility of deep neural network for classification. In particular, our theoretical results show that neural ne… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.16200v1-abstract-full').style.display = 'inline'; document.getElementById('2406.16200v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.16200v1-abstract-full" style="display: none;"> In this paper, we study the adversarial robustness of deep neural networks for classification tasks. We look at the smallest magnitude of possible additive perturbations that can change the output of a classification algorithm. We provide a matrix-theoretic explanation of the adversarial fragility of deep neural network for classification. In particular, our theoretical results show that neural network's adversarial robustness can degrade as the input dimension $d$ increases. Analytically we show that neural networks' adversarial robustness can be only $1/\sqrt{d}$ of the best possible adversarial robustness. Our matrix-theoretic explanation is consistent with an earlier information-theoretic feature-compression-based explanation for the adversarial fragility of neural networks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.16200v1-abstract-full').style.display = 'none'; document.getElementById('2406.16200v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">21 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.15222">arXiv:2406.15222</a> <span> [<a href="https://arxiv.org/pdf/2406.15222">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Rapid and Accurate Diagnosis of Acute Aortic Syndrome using Non-contrast CT: A Large-scale, Retrospective, Multi-center and AI-based Study </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Hu%2C+Y">Yujian Hu</a>, <a href="/search/eess?searchtype=author&query=Xiang%2C+Y">Yilang Xiang</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+Y">Yan-Jie Zhou</a>, <a href="/search/eess?searchtype=author&query=He%2C+Y">Yangyan He</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+S">Shifeng Yang</a>, <a href="/search/eess?searchtype=author&query=Du%2C+X">Xiaolong Du</a>, <a href="/search/eess?searchtype=author&query=Den%2C+C">Chunlan Den</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+Y">Youyao Xu</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+G">Gaofeng Wang</a>, <a href="/search/eess?searchtype=author&query=Ding%2C+Z">Zhengyao Ding</a>, <a 
href="/search/eess?searchtype=author&query=Huang%2C+J">Jingyong Huang</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+W">Wenjun Zhao</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+X">Xuejun Wu</a>, <a href="/search/eess?searchtype=author&query=Li%2C+D">Donglin Li</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+Q">Qianqian Zhu</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Z">Zhenjiang Li</a>, <a href="/search/eess?searchtype=author&query=Qiu%2C+C">Chenyang Qiu</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+Z">Ziheng Wu</a>, <a href="/search/eess?searchtype=author&query=He%2C+Y">Yunjun He</a>, <a href="/search/eess?searchtype=author&query=Tian%2C+C">Chen Tian</a>, <a href="/search/eess?searchtype=author&query=Qiu%2C+Y">Yihui Qiu</a>, <a href="/search/eess?searchtype=author&query=Lin%2C+Z">Zuodong Lin</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+X">Xiaolong Zhang</a>, <a href="/search/eess?searchtype=author&query=He%2C+Y">Yuan He</a>, <a href="/search/eess?searchtype=author&query=Yuan%2C+Z">Zhenpeng Yuan</a> , et al. (15 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.15222v3-abstract-short" style="display: inline;"> Chest pain symptoms are highly prevalent in emergency departments (EDs), where acute aortic syndrome (AAS) is a catastrophic cardiovascular emergency with a high fatality rate, especially when timely and accurate treatment is not administered. However, current triage practices in the ED can cause up to approximately half of patients with AAS to have an initially missed diagnosis or be misdiagnosed… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.15222v3-abstract-full').style.display = 'inline'; document.getElementById('2406.15222v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.15222v3-abstract-full" style="display: none;"> Chest pain symptoms are highly prevalent in emergency departments (EDs), where acute aortic syndrome (AAS) is a catastrophic cardiovascular emergency with a high fatality rate, especially when timely and accurate treatment is not administered. However, current triage practices in the ED can cause up to approximately half of patients with AAS to have an initially missed diagnosis or be misdiagnosed as having other acute chest pain conditions. Subsequently, these AAS patients will undergo clinically inaccurate or suboptimal differential diagnosis. Fortunately, even under these suboptimal protocols, nearly all these patients underwent non-contrast CT covering the aorta anatomy at the early stage of differential diagnosis. In this study, we developed an artificial intelligence model (DeepAAS) using non-contrast CT, which is highly accurate for identifying AAS and provides interpretable results to assist in clinical decision-making. Performance was assessed in two major phases: a multi-center retrospective study (n = 20,750) and an exploration in real-world emergency scenarios (n = 137,525). In the multi-center cohort, DeepAAS achieved a mean area under the receiver operating characteristic curve of 0.958 (95% CI 0.950-0.967). In the real-world cohort, DeepAAS detected 109 AAS patients with misguided initial suspicion, achieving 92.6% (95% CI 76.2%-97.5%) in mean sensitivity and 99.2% (95% CI 99.1%-99.3%) in mean specificity. 
Our AI model performed well on non-contrast CT at all applicable early stages of differential diagnosis workflows, effectively reducing the overall missed diagnosis and misdiagnosis rate from 48.8% to 4.8% and shortening the diagnosis time for patients with misguided initial suspicion from an average of 681.8 (74-11,820) mins to 68.5 (23-195) mins. DeepAAS could effectively fill the gap in the current clinical workflow without requiring additional tests. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.15222v3-abstract-full').style.display = 'none'; document.getElementById('2406.15222v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.14092">arXiv:2406.14092</a> <span> [<a href="https://arxiv.org/pdf/2406.14092">pdf</a>, <a href="https://arxiv.org/format/2406.14092">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Seamless Language Expansion: Enhancing Multilingual Mastery in Self-Supervised Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Xu%2C+J">Jing Xu</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+M">Minglin Wu</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+X">Xixin Wu</a>, <a href="/search/eess?searchtype=author&query=Meng%2C+H">Helen Meng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.14092v1-abstract-short" style="display: inline;"> Self-supervised (SSL) models have shown great performance in various downstream tasks. However, they are typically developed for limited languages, and may encounter new languages in the real world. Developing an SSL model for each new language is costly. Thus, it is vital to figure out how to efficiently adapt existing SSL models to a new language without impairing their original abilities. We propose ad… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.14092v1-abstract-full').style.display = 'inline'; document.getElementById('2406.14092v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.14092v1-abstract-full" style="display: none;"> Self-supervised (SSL) models have shown great performance in various downstream tasks. However, they are typically developed for limited languages, and may encounter new languages in the real world. Developing an SSL model for each new language is costly. Thus, it is vital to figure out how to efficiently adapt existing SSL models to a new language without impairing their original abilities. We propose adaptation methods which integrate LoRA into existing SSL models to extend them to a new language. 
We also develop preservation strategies which include data combination and re-clustering to retain abilities on existing languages. Applied to mHuBERT, we investigate their effectiveness on the speech re-synthesis task. Experiments show that our adaptation methods enable mHuBERT to be applied to a new language (Mandarin) with the MOS increased by about 1.6 and the WER relatively reduced by up to 61.72%. Also, our preservation strategies ensure that the performance on both existing and new languages remains intact. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.14092v1-abstract-full').style.display = 'none'; document.getElementById('2406.14092v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by Interspeech 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.13150">arXiv:2406.13150</a> <span> [<a href="https://arxiv.org/pdf/2406.13150">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MCAD: Multi-modal Conditioned Adversarial Diffusion Model for High-Quality PET Image Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Cui%2C+J">Jiaqi Cui</a>, <a href="/search/eess?searchtype=author&query=Zeng%2C+X">Xinyi Zeng</a>, <a href="/search/eess?searchtype=author&query=Zeng%2C+P">Pinxian Zeng</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+B">Bo Liu</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+X">Xi Wu</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+J">Jiliu Zhou</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yan Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.13150v1-abstract-short" style="display: inline;"> Radiation hazards associated with standard-dose positron emission tomography (SPET) images remain a concern, whereas the quality of low-dose PET (LPET) images fails to meet clinical requirements. Therefore, there is great interest in reconstructing SPET images from LPET images. However, prior studies focus solely on image data, neglecting vital complementary information from other modalities, e.g.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.13150v1-abstract-full').style.display = 'inline'; document.getElementById('2406.13150v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.13150v1-abstract-full" style="display: none;"> Radiation hazards associated with standard-dose positron emission tomography (SPET) images remain a concern, whereas the quality of low-dose PET (LPET) images fails to meet clinical requirements. 
Therefore, there is great interest in reconstructing SPET images from LPET images. However, prior studies focus solely on image data, neglecting vital complementary information from other modalities, e.g., patients' clinical tabular, resulting in compromised reconstruction with limited diagnostic utility. Moreover, they often overlook the semantic consistency between real SPET and reconstructed images, leading to distorted semantic contexts. To tackle these problems, we propose a novel Multi-modal Conditioned Adversarial Diffusion model (MCAD) to reconstruct SPET images from multi-modal inputs, including LPET images and clinical tabular. Specifically, our MCAD incorporates a Multi-modal conditional Encoder (Mc-Encoder) to extract multi-modal features, followed by a conditional diffusion process to blend noise with multi-modal features and gradually map blended features to the target SPET images. To balance multi-modal inputs, the Mc-Encoder embeds Optimal Multi-modal Transport co-Attention (OMTA) to narrow the heterogeneity gap between image and tabular while capturing their interactions, providing sufficient guidance for reconstruction. In addition, to mitigate semantic distortions, we introduce the Multi-Modal Masked Text Reconstruction (M3TRec), which leverages semantic knowledge extracted from denoised PET images to restore the masked clinical tabular, thereby compelling the network to maintain accurate semantics during reconstruction. To expedite the diffusion process, we further introduce an adversarial diffusive network with a reduced number of diffusion steps. Experiments show that our method achieves the state-of-the-art performance both qualitatively and quantitatively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.13150v1-abstract-full').style.display = 'none'; document.getElementById('2406.13150v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. 
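<p class="is-size-7">A sketch of conditioning on both image and tabular inputs of the kind this abstract describes: clinical fields are embedded as tokens and fused with image tokens. Plain cross-attention is used here as a stand-in for the paper's optimal-transport co-attention (OMTA); class names and shapes are illustrative only.</p>
<pre><code class="language-python">
import torch
import torch.nn as nn

class McEncoderSketch(nn.Module):
    """Toy multi-modal conditional encoder: image tokens attend to tabular tokens."""
    def __init__(self, d=128, n_tabular=12):
        super().__init__()
        self.tab_embed = nn.Linear(1, d)          # one token per tabular field
        self.attn = nn.MultiheadAttention(d, num_heads=4, batch_first=True)

    def forward(self, img_tokens, tabular):       # (B, N, d), (B, n_tabular)
        tab_tokens = self.tab_embed(tabular.unsqueeze(-1))    # (B, n_tabular, d)
        fused, _ = self.attn(img_tokens, tab_tokens, tab_tokens)
        return torch.cat([img_tokens, fused], dim=-1)  # condition for a denoiser

enc = McEncoderSketch()
cond = enc(torch.randn(2, 64, 128), torch.randn(2, 12))
print(cond.shape)                                  # torch.Size([2, 64, 256])
</code></pre>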
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Early accepted by MICCAI2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.12646">arXiv:2406.12646</a> <span> [<a href="https://arxiv.org/pdf/2406.12646">pdf</a>, <a href="https://arxiv.org/format/2406.12646">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> An Empirical Study on the Fairness of Foundation Models for Multi-Organ Image Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+Q">Qin Li</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Y">Yizhe Zhang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yan Li</a>, <a href="/search/eess?searchtype=author&query=Lyu%2C+J">Jun Lyu</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+M">Meng Liu</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+L">Longyu Sun</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+M">Mengting Sun</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Q">Qirong Li</a>, <a href="/search/eess?searchtype=author&query=Mao%2C+W">Wenyue Mao</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+X">Xinran Wu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Y">Yajing Zhang</a>, <a href="/search/eess?searchtype=author&query=Chu%2C+Y">Yinghua Chu</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+S">Shuo Wang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+C">Chengyan Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.12646v1-abstract-short" style="display: inline;"> The segmentation foundation model, e.g., Segment Anything Model (SAM), has attracted increasing interest in the medical image community. Early pioneering studies primarily concentrated on assessing and improving SAM's performance from the perspectives of overall accuracy and efficiency, yet little attention was given to the fairness considerations. This oversight raises questions about the potenti… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.12646v1-abstract-full').style.display = 'inline'; document.getElementById('2406.12646v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.12646v1-abstract-full" style="display: none;"> The segmentation foundation model, e.g., Segment Anything Model (SAM), has attracted increasing interest in the medical image community. Early pioneering studies primarily concentrated on assessing and improving SAM's performance from the perspectives of overall accuracy and efficiency, yet little attention was given to the fairness considerations. 
This oversight raises questions about the potential for performance biases that could mirror those found in task-specific deep learning models like nnU-Net. In this paper, we explored the fairness dilemma concerning large segmentation foundation models. We prospectively curate a benchmark dataset of 3D MRI and CT scans of the organs including liver, kidney, spleen, lung and aorta from a total of 1056 healthy subjects with expert segmentations. Crucially, we document demographic details such as gender, age, and body mass index (BMI) for each subject to facilitate a nuanced fairness analysis. We test state-of-the-art foundation models for medical image segmentation, including the original SAM, medical SAM and SAT models, to evaluate segmentation efficacy across different demographic groups and identify disparities. Our comprehensive analysis, which accounts for various confounding factors, reveals significant fairness concerns within these foundational models. Moreover, our findings highlight not only disparities in overall segmentation metrics, such as the Dice Similarity Coefficient but also significant variations in the spatial distribution of segmentation errors, offering empirical evidence of the nuanced challenges in ensuring fairness in medical image segmentation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.12646v1-abstract-full').style.display = 'none'; document.getElementById('2406.12646v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to MICCAI-2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.10056">arXiv:2406.10056</a> <span> [<a href="https://arxiv.org/pdf/2406.10056">pdf</a>, <a href="https://arxiv.org/format/2406.10056">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> UniAudio 1.5: Large Language Model-driven Audio Codec is A Few-shot Audio Task Learner </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Yang%2C+D">Dongchao Yang</a>, <a href="/search/eess?searchtype=author&query=Guo%2C+H">Haohan Guo</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yuanyuan Wang</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+R">Rongjie Huang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+X">Xiang Li</a>, <a href="/search/eess?searchtype=author&query=Tan%2C+X">Xu Tan</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+X">Xixin Wu</a>, <a href="/search/eess?searchtype=author&query=Meng%2C+H">Helen Meng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.10056v1-abstract-short" style="display: inline;"> The Large Language models (LLMs) have demonstrated supreme capabilities in text 
understanding and generation, but cannot be directly applied to cross-modal tasks without fine-tuning. This paper proposes a cross-modal in-context learning approach, empowering the frozen LLMs to achieve multiple audio tasks in a few-shot style without any parameter update. Specifically, we propose a novel and LLMs-dr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.10056v1-abstract-full').style.display = 'inline'; document.getElementById('2406.10056v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.10056v1-abstract-full" style="display: none;"> The Large Language models (LLMs) have demonstrated supreme capabilities in text understanding and generation, but cannot be directly applied to cross-modal tasks without fine-tuning. This paper proposes a cross-modal in-context learning approach, empowering the frozen LLMs to achieve multiple audio tasks in a few-shot style without any parameter update. Specifically, we propose a novel and LLMs-driven audio codec model, LLM-Codec, to transfer the audio modality into the textual space, \textit{i.e.} representing audio tokens with words or sub-words in the vocabulary of LLMs, while keeping high audio reconstruction quality. The key idea is to reduce the modality heterogeneity between text and audio by compressing the audio modality into a well-trained LLMs token space. Thus, the audio representation can be viewed as a new \textit{foreign language}, and LLMs can learn the new \textit{foreign language} with several demonstrations. In experiments, we investigate the performance of the proposed approach across multiple audio understanding and generation tasks, \textit{e.g.} speech emotion classification, audio classification, text-to-speech generation, speech enhancement, etc. The experimental results demonstrate that the LLMs equipped with the proposed LLM-Codec, named as UniAudio 1.5, prompted by only a few examples, can achieve the expected functions in simple scenarios. It validates the feasibility and effectiveness of the proposed cross-modal in-context learning approach. To facilitate research on few-shot audio task learning and multi-modal LLMs, we have open-sourced the LLM-Codec model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.10056v1-abstract-full').style.display = 'none'; document.getElementById('2406.10056v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. 
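<p class="is-size-7">The core idea of writing audio in an LLM's vocabulary can be caricatured as nearest-neighbour assignment of audio frame embeddings to a frozen token-embedding table. The real LLM-Codec learns this mapping end-to-end; everything below, including sizes and tensor names, is a stand-in.</p>
<pre><code class="language-python">
import torch

vocab_size, d = 32000, 64
token_table = torch.randn(vocab_size, d)      # stand-in for frozen LLM token embeddings
audio_frames = torch.randn(150, d)            # stand-in for encoded audio frames

# Snap each audio frame to its nearest vocabulary entry, so the frame can be
# written as a word id that the LLM already knows.
dists = torch.cdist(audio_frames, token_table)    # (150, vocab_size)
ids = dists.argmin(dim=-1)                        # one "word" id per frame
decoded = token_table[ids]                        # embeddings used for reconstruction
print(ids.shape, decoded.shape)                   # torch.Size([150]) torch.Size([150, 64])
</code></pre>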
<li class="arxiv-result">
<p class="list-title"><a href="https://arxiv.org/abs/2406.09356">arXiv:2406.09356</a> [<a href="https://arxiv.org/pdf/2406.09356">pdf</a>, <a href="https://arxiv.org/format/2406.09356">other</a>]</p>
<div class="tags">cs.CV (Computer Vision and Pattern Recognition); eess.IV (Image and Video Processing)</div>
<p class="title is-5">CMC-Bench: Towards a New Paradigm of Visual Signal Compression</p>
<p class="authors">Authors: Chunyi Li, Xiele Wu, Haoning Wu, Donghui Feng, Zicheng Zhang, Guo Lu, Xiongkuo Min, Xiaohong Liu, Guangtao Zhai, Weisi Lin</p>
<p class="abstract">Abstract: Ultra-low-bitrate image compression is a challenging and demanding topic. With the development of Large Multimodal Models (LMMs), a Cross Modality Compression (CMC) paradigm of Image-Text-Image has emerged. Compared with traditional codecs, this semantic-level compression can reduce image data size to 0.1% or even lower, which has strong potential applications. However, CMC has certain defects in consistency with the original image and perceptual quality. To address this problem, we introduce CMC-Bench, a benchmark of the cooperative performance of Image-to-Text (I2T) and Text-to-Image (T2I) models for image compression. This benchmark covers 18,000 and 40,000 images respectively to verify 6 mainstream I2T and 12 T2I models, including 160,000 subjective preference scores annotated by human experts. At ultra-low bitrates, this paper proves that the combination of some I2T and T2I models has surpassed the most advanced visual signal codecs; meanwhile, it highlights where LMMs can be further optimized toward the compression task. We encourage LMM developers to participate in this test to promote the evolution of visual signal codec protocols.</p>
<p class="is-size-7">Submitted 13 June, 2024; originally announced June 2024.</p>
</li>
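<p class="is-size-7">The Image-Text-Image loop described above can be pictured as a tiny round-trip helper; i2t and t2i stand in for arbitrary Image-to-Text and Text-to-Image models and are assumptions, not the benchmark's API.</p>
<pre><code>def cmc_roundtrip(image, i2t, t2i):
    # Image-Text-Image loop: the transmitted "bitstream" is just the caption text.
    caption = i2t(image)                      # Image-to-Text model (placeholder callable)
    payload_bits = len(caption.encode("utf-8")) * 8
    recon = t2i(caption)                      # Text-to-Image model (placeholder callable)
    return recon, caption, payload_bits

# For a 512x512 image, a 60-byte caption corresponds to 480 / (512 * 512),
# roughly 0.0018 bits per pixel, far below conventional codec operating points.
</code></pre>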
<li class="arxiv-result">
<p class="list-title"><a href="https://arxiv.org/abs/2406.08716">arXiv:2406.08716</a> [<a href="https://arxiv.org/pdf/2406.08716">pdf</a>, <a href="https://arxiv.org/format/2406.08716">other</a>]</p>
<div class="tags">cs.SD (Sound); eess.AS (Audio and Speech Processing)</div>
<p class="title is-5">TSE-PI: Target Sound Extraction under Reverberant Environments with Pitch Information</p>
<p class="authors">Authors: Yiwen Wang, Xihong Wu</p>
<p class="abstract">Abstract: Target sound extraction (TSE) separates the target sound from mixture signals based on provided clues. However, the performance of existing models degrades significantly under reverberant conditions. Inspired by auditory scene analysis (ASA), this work proposes a TSE model provided with pitch information, named TSE-PI. Conditional pitch extraction is achieved through a Feature-wise Linearly Modulated (FiLM) layer conditioned on the sound-class label. A modified Waveformer model combined with pitch information, employing a learnable Gammatone filterbank in place of the convolutional encoder, is used for target sound extraction. The inclusion of pitch information is aimed at improving the model's performance. The experimental results on the FSD50K dataset show improvements of 2.4 dB in target sound extraction under reverberant environments when incorporating pitch information and the Gammatone filterbank.</p>
<p class="is-size-7">Submitted 12 June, 2024; originally announced June 2024.</p>
<p class="comments is-size-7">Comments: Accepted by Interspeech 2024</p>
</li>
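<p class="is-size-7">Feature-wise Linear Modulation (FiLM), used above for conditioning on the sound-class label, applies a label-dependent per-channel affine transform. The sketch below shows the generic mechanism with arbitrary sizes, not the paper's configuration.</p>
<pre><code>import numpy as np

rng = np.random.default_rng(0)

class FiLM:
    # Per-channel affine modulation: y = gamma(cond) * x + beta(cond).
    def __init__(self, num_classes, channels):
        self.Wg = rng.normal(scale=0.1, size=(num_classes, channels))
        self.Wb = rng.normal(scale=0.1, size=(num_classes, channels))

    def __call__(self, x, onehot):
        # x: (channels, time) feature map; onehot: (num_classes,) class label.
        gamma = onehot @ self.Wg   # (channels,)
        beta = onehot @ self.Wb
        return gamma[:, None] * x + beta[:, None]

film = FiLM(num_classes=4, channels=8)
y = film(rng.normal(size=(8, 100)), np.eye(4)[2])  # modulate features by class 2
</code></pre>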
<li class="arxiv-result">
<p class="list-title"><a href="https://arxiv.org/abs/2406.08336">arXiv:2406.08336</a> [<a href="https://arxiv.org/pdf/2406.08336">pdf</a>, <a href="https://arxiv.org/format/2406.08336">other</a>]</p>
<div class="tags">cs.SD (Sound); cs.CV (Computer Vision and Pattern Recognition); eess.AS (Audio and Speech Processing)</div>
<p class="title is-5">CoLM-DSR: Leveraging Neural Codec Language Modeling for Multi-Modal Dysarthric Speech Reconstruction</p>
<p class="authors">Authors: Xueyuan Chen, Dongchao Yang, Dingdong Wang, Xixin Wu, Zhiyong Wu, Helen Meng</p>
<p class="abstract">Abstract: Dysarthric speech reconstruction (DSR) aims to transform dysarthric speech into normal speech. It still suffers from low speaker similarity and poor prosody naturalness. In this paper, we propose a multi-modal DSR model that leverages neural codec language modeling to improve the reconstruction results, especially speaker similarity and prosody naturalness. Our proposed model consists of: (i) a multi-modal content encoder to extract robust phoneme embeddings from dysarthric speech with auxiliary visual inputs; (ii) a speaker codec encoder to extract and normalize the speaker-aware codecs from the dysarthric speech, in order to provide original timbre and normal prosody; and (iii) a codec-language-model-based speech decoder to reconstruct the speech from the extracted phoneme embeddings and normalized codecs. Evaluations on the commonly used UASpeech corpus show that our proposed model achieves significant improvements in terms of speaker similarity and prosody naturalness.</p>
<p class="is-size-7">Submitted 24 June, 2024; v1 submitted 12 June, 2024; originally announced June 2024.</p>
<p class="comments is-size-7">Comments: Accepted by Interspeech 2024</p>
</li>
<li class="arxiv-result">
<p class="list-title"><a href="https://arxiv.org/abs/2406.02940">arXiv:2406.02940</a> [<a href="https://arxiv.org/pdf/2406.02940">pdf</a>, <a href="https://arxiv.org/format/2406.02940">other</a>]</p>
<div class="tags">cs.SD (Sound); eess.AS (Audio and Speech Processing)</div>
<p class="title is-5">Addressing Index Collapse of Large-Codebook Speech Tokenizer with Dual-Decoding Product-Quantized Variational Auto-Encoder</p>
<p class="authors">Authors: Haohan Guo, Fenglong Xie, Dongchao Yang, Hui Lu, Xixin Wu, Helen Meng</p>
<p class="abstract">Abstract: VQ-VAE, a mainstream approach to speech tokenization, has been troubled by "index collapse", where only a small number of codewords are activated in large codebooks. This work proposes a product-quantized (PQ) VAE with more codebooks but fewer codewords to address this problem and build large-codebook speech tokenizers. It encodes speech features into multiple VQ subspaces and composes them into codewords in a larger codebook. Besides, to utilize each VQ subspace well, we also enhance PQ-VAE via a dual-decoding training strategy with the encoding and quantized sequences. The experimental results demonstrate that PQ-VAE addresses "index collapse" effectively, especially for larger codebooks. The model with the proposed training strategy further improves codebook perplexity and reconstruction quality, outperforming other multi-codebook VQ approaches. Finally, PQ-VAE demonstrates its effectiveness in language-model-based TTS, supporting higher-quality speech generation with larger codebooks.</p>
<p class="is-size-7">Submitted 5 June, 2024; originally announced June 2024.</p>
</li>
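<p class="is-size-7">Product quantization, the core idea above, splits a feature vector into subspaces, quantizes each against a small codebook, and composes the sub-indices into one index in a much larger effective codebook. The sketch below is a generic illustration with made-up sizes, not the PQ-VAE model.</p>
<pre><code>import numpy as np

rng = np.random.default_rng(0)

def pq_encode(x, codebooks):
    # x: (dim,) feature; codebooks: list of (K, dim_sub) arrays, one per subspace.
    dim_sub = codebooks[0].shape[1]
    idxs = []
    for i, cb in enumerate(codebooks):
        sub = x[i * dim_sub:(i + 1) * dim_sub]
        idxs.append(int(np.argmin(((cb - sub) ** 2).sum(axis=1))))
    return idxs

def compose_index(idxs, K):
    # Combine per-subspace indices into a single index of a K**len(idxs) codebook.
    out = 0
    for i in idxs:
        out = out * K + i
    return out

K, n_sub, d_sub = 16, 4, 8
codebooks = [rng.normal(size=(K, d_sub)) for _ in range(n_sub)]
x = rng.normal(size=(n_sub * d_sub,))
print(compose_index(pq_encode(x, codebooks), K))  # effective codebook size 16**4 = 65536
</code></pre>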
<li class="arxiv-result">
<p class="list-title"><a href="https://arxiv.org/abs/2406.02328">arXiv:2406.02328</a> [<a href="https://arxiv.org/pdf/2406.02328">pdf</a>, <a href="https://arxiv.org/format/2406.02328">other</a>]</p>
<div class="tags">cs.SD (Sound); eess.AS (Audio and Speech Processing)</div>
<p class="title is-5">SimpleSpeech: Towards Simple and Efficient Text-to-Speech with Scalar Latent Transformer Diffusion Models</p>
<p class="authors">Authors: Dongchao Yang, Dingdong Wang, Haohan Guo, Xueyuan Chen, Xixin Wu, Helen Meng</p>
<p class="abstract">Abstract: In this study, we propose a simple and efficient non-autoregressive (NAR) text-to-speech (TTS) system based on diffusion, named SimpleSpeech. Its simplicity shows in three aspects: (1) it can be trained on a speech-only dataset, without any alignment information; (2) it directly takes plain text as input and generates speech in a NAR way; (3) it models speech in a finite and compact latent space, which alleviates the modeling difficulty of diffusion. More specifically, we propose a novel speech codec model (SQ-Codec) with scalar quantization; SQ-Codec effectively maps the complex speech signal into a finite and compact latent space, named the scalar latent space. Benefiting from SQ-Codec, we apply a novel transformer diffusion model in the scalar latent space of SQ-Codec. We train SimpleSpeech on 4k hours of a speech-only dataset; it shows natural prosody and voice cloning ability. Compared with previous large-scale TTS models, it presents significant improvements in speech quality and generation speed. Demos are released.</p>
<p class="is-size-7">Submitted 14 June, 2024; v1 submitted 4 June, 2024; originally announced June 2024.</p>
<p class="comments is-size-7">Comments: Accepted by Interspeech 2024</p>
</li>
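<p class="is-size-7">A scalar-quantized latent space of the kind described above can be illustrated with a generic finite scalar quantizer: squash each latent dimension, then snap it to a small uniform grid. This is only a sketch of the general idea, not SQ-Codec itself.</p>
<pre><code>import numpy as np

def scalar_quantize(z, levels=9):
    # Squash each latent dimension into (-1, 1), then snap it to a uniform grid
    # of `levels` values, giving a finite, compact latent space.
    z = np.tanh(z)
    step = 2.0 / (levels - 1)
    return np.round((z + 1.0) / step) * step - 1.0

z = np.random.default_rng(0).normal(size=(4, 6))
zq = scalar_quantize(z)  # each entry now takes one of 9 possible values
</code></pre>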
<li class="arxiv-result">
<p class="list-title"><a href="https://arxiv.org/abs/2406.01313">arXiv:2406.01313</a> [<a href="https://arxiv.org/pdf/2406.01313">pdf</a>, <a href="https://arxiv.org/ps/2406.01313">ps</a>, <a href="https://arxiv.org/format/2406.01313">other</a>]</p>
<div class="tags">cs.IT (Information Theory); eess.SP (Signal Processing)</div>
<p class="title is-5">3D Trajectory Design for Energy-constrained Aerial CRNs Under Probabilistic LoS Channel</p>
<p class="authors">Authors: Hongjiang Lei, Xiaqiu Wu, Ki-Hong Park, Gaofeng Pan</p>
<p class="abstract">Abstract: Unmanned aerial vehicles (UAVs) have been attracting significant attention because there is a high probability of line-of-sight links being obtained between them and terrestrial nodes in high-rise urban areas. In this work, we investigate cognitive radio networks (CRNs) by jointly designing the three-dimensional (3D) trajectory, the transmit power of the UAV, and user scheduling. Considering the UAV's onboard energy consumption, an optimization problem is formulated in which the average achievable rate of the considered system is maximized by jointly optimizing the UAV's 3D trajectory, transmission power, and user scheduling. Because the optimization problem is non-convex, a lower bound on the average achievable rate is utilized to reduce the complexity of the solution. Subsequently, the original optimization problem is decoupled into four subproblems using block coordinate descent, and each subproblem is transformed into a manageable convex optimization problem by introducing slack variables and successive convex approximation. Numerical results validate the effectiveness of our proposed algorithm and demonstrate that the 3D trajectories of UAVs can enhance the average achievable rate of aerial CRNs.</p>
<p class="is-size-7">Submitted 3 June, 2024; originally announced June 2024.</p>
<p class="comments is-size-7">Comments: 13 pages, 6 figures, submitted to an IEEE journal for review</p>
</li>
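<p class="is-size-7">The block-coordinate-descent decomposition described above (trajectory, transmit power, and scheduling updated in turn) follows a standard skeleton. The sketch below shows that skeleton with user-supplied block solvers plus a toy quadratic example; it is not the paper's algorithm.</p>
<pre><code>def block_coordinate_descent(blocks, solve_block, objective, max_iter=50, tol=1e-6):
    # blocks: dict of variable blocks, e.g. {"trajectory": ..., "power": ..., "schedule": ...}.
    # solve_block(name, blocks) returns an updated value for one block with the others
    # held fixed (e.g. a convexified subproblem); both callables are user-supplied.
    prev = objective(blocks)
    for _ in range(max_iter):
        for name in list(blocks):
            blocks[name] = solve_block(name, blocks)
        cur = objective(blocks)
        if abs(prev - cur) > tol:
            prev = cur
        else:
            break
    return blocks

# Toy usage: minimize (x - 1)^2 + (y + 2)^2 with closed-form per-block updates.
best = block_coordinate_descent(
    {"x": 0.0, "y": 0.0},
    solve_block=lambda name, b: 1.0 if name == "x" else -2.0,
    objective=lambda b: (b["x"] - 1.0) ** 2 + (b["y"] + 2.0) ** 2)
print(best)  # {'x': 1.0, 'y': -2.0}
</code></pre>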
<li class="arxiv-result">
<p class="list-title"><a href="https://arxiv.org/abs/2405.19796">arXiv:2405.19796</a> [<a href="https://arxiv.org/pdf/2405.19796">pdf</a>, <a href="https://arxiv.org/format/2405.19796">other</a>]</p>
<div class="tags">cs.SD (Sound); cs.AI (Artificial Intelligence); eess.AS (Audio and Speech Processing)</div>
<p class="title is-5">Explainable Attribute-Based Speaker Verification</p>
<p class="authors">Authors: Xiaoliang Wu, Chau Luu, Peter Bell, Ajitha Rajan</p>
<p class="abstract">Abstract: This paper proposes a fully explainable approach to speaker verification (SV), a task that fundamentally relies on individual speaker characteristics. The opaque use of speaker attributes in current SV systems raises concerns of trust. Addressing this, we propose an attribute-based explainable SV system that identifies speakers by comparing personal attributes such as gender, nationality, and age extracted automatically from voice recordings. We believe this approach better aligns with human reasoning, making it more understandable than traditional methods. Evaluated on the Voxceleb1 test set, the best performance of our system is comparable with the ground truth established when using all correct attributes, proving its efficacy. Whilst our approach sacrifices some performance compared to non-explainable methods, we believe it moves us closer to the goal of transparent, interpretable AI and lays the groundwork for future enhancements through attribute expansion.</p>
<p class="is-size-7">Submitted 30 May, 2024; originally announced May 2024.</p>
</li>
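<p class="is-size-7">The attribute-comparison idea above can be reduced to a toy scoring rule over automatically extracted attributes; the attribute set, tolerance, and threshold below are illustrative assumptions, not the system described in the paper.</p>
<pre><code>def attribute_match_score(attrs_a, attrs_b, age_tol=10):
    # attrs: dicts such as {"gender": "F", "nationality": "UK", "age": 34}.
    # Count agreeing attributes; ages agree when within age_tol years.
    score = int(attrs_a["gender"] == attrs_b["gender"])
    score += int(attrs_a["nationality"] == attrs_b["nationality"])
    score += int(abs(attrs_a["age"] - attrs_b["age"]) <= age_tol)
    return score / 3.0

# Declare "same speaker" when the score clears a chosen threshold (e.g. all match).
s = attribute_match_score({"gender": "F", "nationality": "UK", "age": 34},
                          {"gender": "F", "nationality": "UK", "age": 38})
print(s)  # 1.0
</code></pre>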
<li class="arxiv-result">
<p class="list-title"><a href="https://arxiv.org/abs/2405.17024">arXiv:2405.17024</a> [<a href="https://arxiv.org/pdf/2405.17024">pdf</a>]</p>
<div class="tags">eess.SP (Signal Processing)</div>
<p class="title is-5">Beware of Overestimated Decoding Performance Arising from Temporal Autocorrelations in Electroencephalogram Signals</p>
<p class="authors">Authors: Xiran Xu, Bo Wang, Boda Xiao, Yadong Niu, Yiwen Wang, Xihong Wu, Jing Chen</p>
<p class="abstract">Abstract: Researchers have reported high decoding accuracy (>95%) using non-invasive electroencephalogram (EEG) signals for brain-computer interface (BCI) decoding tasks such as image decoding, emotion recognition, and auditory spatial attention detection. Since these EEG data were usually collected with well-designed paradigms in labs, the reliability and robustness of the corresponding decoding methods have been doubted by some researchers, who argued that such decoding accuracy was overestimated due to the inherent temporal autocorrelation of EEG signals. However, the coupling between stimulus-driven neural responses and EEG temporal autocorrelations makes it difficult to confirm whether this overestimation truly exists. Furthermore, the underlying pitfalls behind overestimated decoding accuracy have not been fully explained, due to a lack of appropriate formulation. In this work, we formulate the pitfall in various EEG decoding tasks within a unified framework. EEG data were recorded from watermelons to remove stimulus-driven neural responses. Labels were assigned to continuous EEG according to the experimental designs used for EEG recording in several typical datasets, and the decoding methods were then applied. The results showed that the labels could be successfully decoded as long as continuous EEG data with the same label were split between the training and test sets. Further analysis indicated that high accuracy on various BCI decoding tasks could be achieved by associating labels with intrinsic temporal autocorrelation features of the EEG. These results underscore the importance of choosing appropriate experimental designs and data splits in BCI decoding tasks to prevent inflated accuracies due to EEG temporal autocorrelation.</p>
<p class="is-size-7">Submitted 27 May, 2024; originally announced May 2024.</p>
</li>
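<p class="is-size-7">The data-splitting pitfall described above is easy to reproduce in simulation: when temporally autocorrelated samples sharing a label are split at random, even a trivial 1-nearest-neighbour decoder typically looks far better than it does under a block-wise split. The toy signal below is synthetic and stands in for the watermelon recordings only conceptually.</p>
<pre><code>import numpy as np
rng = np.random.default_rng(0)

# Toy illustration (not the paper's data): strongly autocorrelated signals with
# labels assigned per recording block, decoded with a 1-nearest-neighbour rule.
n_blocks, block_len, n_ch = 6, 200, 8
X, y = [], []
for b in range(n_blocks):
    drift = np.cumsum(rng.normal(scale=0.1, size=(block_len, n_ch)), axis=0)
    X.append(drift + rng.normal(size=n_ch))    # each block wanders around its own offset
    y.append(np.full(block_len, b % 2))        # the label depends only on the block
X, y = np.vstack(X), np.concatenate(y)

def one_nn_accuracy(train_idx, test_idx):
    d = np.linalg.norm(X[test_idx][:, None, :] - X[train_idx][None, :, :], axis=-1)
    pred = y[train_idx][d.argmin(axis=1)]
    return float((pred == y[test_idx]).mean())

perm = rng.permutation(len(y))                 # random split: neighbours share blocks
print("random split    :", one_nn_accuracy(perm[len(y) // 2:], perm[:len(y) // 2]))
order = np.arange(len(y))                      # block-wise split: whole blocks held out
print("block-wise split:", one_nn_accuracy(order[:3 * block_len], order[3 * block_len:]))
</code></pre>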
<li class="arxiv-result">
<p class="list-title"><a href="https://arxiv.org/abs/2405.03126">arXiv:2405.03126</a> [<a href="https://arxiv.org/pdf/2405.03126">pdf</a>]</p>
<div class="tags">eess.IV (Image and Video Processing); eess.SP (Signal Processing)</div>
<p class="title is-5">Infrared Polarization Imaging-based Non-destructive Thermography Inspection</p>
<p class="authors">Authors: Xianyu Wu, Bin Zhou, Peng Lin, Rongjin Cao, Feng Huang</p>
<p class="abstract">Abstract: The infrared pulse thermography non-destructive testing (NDT) method is based on the difference in the infrared radiation intensity emitted by defective and non-defective areas of an object. However, when the radiation intensity of the defective target is similar to that of the non-defective area, the detection results are poor. To address this issue, this study investigated the polarization characteristics of the infrared radiation of different materials. Simulation results showed that the degree of infrared polarization of the object surface changed regularly with changes in thermal environment radiation. An infrared polarization imaging-based NDT method was proposed and demonstrated using specimens with four different simulated defective areas, which were designed and fabricated using four different materials. The experimental results were consistent with the simulation results, thereby proving the effectiveness of the proposed method. Compared with the infrared-radiation-intensity-based NDT method, the proposed method improved image detail presentation and detection accuracy.</p>
<p class="is-size-7">Submitted 5 May, 2024; originally announced May 2024.</p>
</li>
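<p class="is-size-7">The degree of polarization mentioned above is commonly computed from intensity images taken at four polarizer angles via the Stokes parameters; the sketch below shows that standard computation, which may differ from the paper's exact processing.</p>
<pre><code>import numpy as np

def degree_of_linear_polarization(i0, i45, i90, i135):
    # Stokes parameters from four polarizer-angle intensity images.
    s0 = 0.5 * (i0 + i45 + i90 + i135)
    s1 = i0 - i90
    s2 = i45 - i135
    dolp = np.sqrt(s1 ** 2 + s2 ** 2) / (s0 + 1e-8)   # degree of linear polarization
    aop = 0.5 * np.arctan2(s2, s1)                     # angle of polarization (radians)
    return dolp, aop

rng = np.random.default_rng(0)
frames = [rng.random((4, 4)) for _ in range(4)]        # toy stand-ins for real images
dolp, aop = degree_of_linear_polarization(*frames)
</code></pre>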
<li class="arxiv-result">
<p class="list-title"><a href="https://arxiv.org/abs/2405.01725">arXiv:2405.01725</a> [<a href="https://arxiv.org/pdf/2405.01725">pdf</a>, <a href="https://arxiv.org/format/2405.01725">other</a>]</p>
<div class="tags">eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition); cs.LG (Machine Learning)</div>
<p class="title is-5">Development of Skip Connection in Deep Neural Networks for Computer Vision and Medical Image Analysis: A Survey</p>
<p class="authors">Authors: Guoping Xu, Xiaxia Wang, Xinglong Wu, Xuesong Leng, Yongchao Xu</p>
<p class="abstract">Abstract: Deep learning has made significant progress in computer vision, specifically in image classification, object detection, and semantic segmentation. The skip connection has played an essential role in the architecture of deep neural networks, enabling easier optimization through residual learning during the training stage and improving accuracy during testing. Many neural networks have inherited the idea of residual learning with skip connections for various tasks, and it has become the standard choice for designing neural networks. This survey provides a comprehensive summary of, and outlook on, the development of skip connections in deep neural networks. The short history of skip connections is outlined, and the development of residual learning in deep neural networks is surveyed. The effectiveness of skip connections in the training and testing stages is summarized, and future directions for using skip connections in residual learning are discussed. Finally, we summarize seminal papers, source code, models, and datasets that utilize skip connections in computer vision, including image classification, object detection, semantic segmentation, and image reconstruction. We hope this survey will inspire researchers in the community to further develop skip connections in various forms and tasks, as well as the theory of residual learning in deep neural networks. The project page can be found at https://github.com/apple1986/Residual_Learning_For_Images</p>
<p class="is-size-7">Submitted 2 May, 2024; originally announced May 2024.</p>
</li>
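<p class="is-size-7">The residual-learning identity at the heart of the survey above, y = F(x) + x, fits in a few lines; the sketch below is a generic two-layer residual block in NumPy, not any specific architecture from the surveyed papers.</p>
<pre><code>import numpy as np
rng = np.random.default_rng(0)

def relu(x):
    return np.maximum(x, 0.0)

class ResidualBlock:
    # y = F(x) + x: the block only has to learn the residual F(x).
    def __init__(self, dim):
        self.w1 = rng.normal(scale=0.1, size=(dim, dim))
        self.w2 = rng.normal(scale=0.1, size=(dim, dim))

    def __call__(self, x):
        return x + relu(x @ self.w1) @ self.w2   # skip connection adds the input back

x = rng.normal(size=(5, 16))
y = ResidualBlock(16)(x)
</code></pre>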
<li class="arxiv-result">
<p class="list-title"><a href="https://arxiv.org/abs/2404.17867">arXiv:2404.17867</a> [<a href="https://arxiv.org/pdf/2404.17867">pdf</a>, <a href="https://arxiv.org/format/2404.17867">other</a>]</p>
<div class="tags">cs.CV (Computer Vision and Pattern Recognition); eess.IV (Image and Video Processing)</div>
<p class="title is-5">Are Watermarks Bugs for Deepfake Detectors? Rethinking Proactive Forensics</p>
<p class="authors">Authors: Xiaoshuai Wu, Xin Liao, Bo Ou, Yuling Liu, Zheng Qin</p>
<p class="abstract">Abstract: AI-generated content has accelerated the topic of media synthesis, particularly Deepfake, which can manipulate our portraits for positive or malicious purposes. Before releasing these threatening face images, one promising forensics solution is the injection of robust watermarks to track their provenance. However, we argue that current watermarking models, originally devised for genuine images, may harm the deployed Deepfake detectors when directly applied to forged images, since the watermarks are prone to overlap with the forgery signals used for detection. To bridge this gap, we propose AdvMark, on behalf of proactive forensics, to exploit the adversarial vulnerability of passive detectors for good. Specifically, AdvMark serves as a plug-and-play procedure for fine-tuning any robust watermarking into adversarial watermarking, to enhance the forensic detectability of watermarked images; meanwhile, the watermarks can still be extracted for provenance tracking. Extensive experiments demonstrate the effectiveness of the proposed AdvMark, leveraging robust watermarking to fool Deepfake detectors, which can help improve the accuracy of downstream Deepfake detection without tuning in-the-wild detectors. We believe this work will shed some light on harmless proactive forensics against Deepfake.</p>
<p class="is-size-7">Submitted 27 April, 2024; originally announced April 2024.</p>
<p class="comments is-size-7">Comments: Accepted by IJCAI 2024</p>
</li>
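<p class="is-size-7">Reading the abstract above, the fine-tuning signal can be imagined as a composite objective that keeps the watermark decodable while nudging a frozen detector toward the "fake" decision; the sketch below is a speculative illustration with placeholder callables, not AdvMark's actual training code.</p>
<pre><code>def bit_error(pred_bits, true_bits):
    # Fraction of payload bits decoded incorrectly.
    return sum(int(p != t) for p, t in zip(pred_bits, true_bits)) / len(true_bits)

def adversarial_watermark_loss(embedder, decoder, detector, image, bits,
                               alpha=1.0, beta=1.0):
    # Illustrative composite objective (embedder, decoder, detector are placeholders):
    # keep the payload recoverable while pushing a frozen detector toward "fake".
    wm_image = embedder(image, bits)
    payload_term = bit_error(decoder(wm_image), bits)
    detect_term = 1.0 - detector(wm_image)     # assume detector outputs P(fake)
    return alpha * payload_term + beta * detect_term
</code></pre>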
</ol> </div> </main> </body> </html>