Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–21 of 21 results for author: <span class="mathjax">Fang, M</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/eess" aria-role="search"> Searching in archive <strong>eess</strong>. <a href="/search/?searchtype=author&query=Fang%2C+M">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Fang, M"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Fang%2C+M&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Fang, M"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05471">arXiv:2502.05471</a> <span> [<a href="https://arxiv.org/pdf/2502.05471">pdf</a>, <a href="https://arxiv.org/format/2502.05471">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Expressive Voice Conversion with Discrete Pitch-Conditioned Flow Matching Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zuo%2C+J">Jialong Zuo</a>, <a href="/search/eess?searchtype=author&query=Ji%2C+S">Shengpeng Ji</a>, <a href="/search/eess?searchtype=author&query=Fang%2C+M">Minghui Fang</a>, <a href="/search/eess?searchtype=author&query=Jiang%2C+Z">Ziyue Jiang</a>, <a href="/search/eess?searchtype=author&query=Cheng%2C+X">Xize Cheng</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+Q">Qian Yang</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+W">Wenrui Liu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+G">Guangyan Zhang</a>, <a href="/search/eess?searchtype=author&query=Tu%2C+Z">Zehai Tu</a>, <a href="/search/eess?searchtype=author&query=Guo%2C+Y">Yiwen Guo</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+Z">Zhou Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05471v1-abstract-short" style="display: inline;"> This paper introduces PFlow-VC, a conditional flow matching voice conversion model that leverages fine-grained discrete pitch tokens and target speaker prompt information for expressive voice conversion (VC). Previous VC works primarily focus on speaker conversion, with further exploration needed in enhancing expressiveness (such as prosody and emotion) for timbre conversion. Unlike previous metho… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05471v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05471v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05471v1-abstract-full" style="display: none;"> This paper introduces PFlow-VC, a conditional flow matching voice conversion model that leverages fine-grained discrete pitch tokens and target speaker prompt information for expressive voice conversion (VC). Previous VC works primarily focus on speaker conversion, with further exploration needed in enhancing expressiveness (such as prosody and emotion) for timbre conversion. 
Unlike previous methods, we adopt a simple and efficient approach to enhance the style expressiveness of voice conversion models. Specifically, we pretrain a self-supervised pitch VQVAE model to discretize speaker-irrelevant pitch information and leverage a masked pitch-conditioned flow matching model for Mel-spectrogram synthesis, which provides in-context pitch modeling capabilities for the speaker conversion model, effectively improving the voice style transfer capacity. Additionally, we improve timbre similarity by combining global timbre embeddings with time-varying timbre tokens. Experiments on unseen LibriTTS test-clean and emotional speech dataset ESD show the superiority of the PFlow-VC model in both timbre conversion and style transfer. Audio samples are available on the demo page https://speechai-demo.github.io/PFlow-VC/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05471v1-abstract-full').style.display = 'none'; document.getElementById('2502.05471v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICASSP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.01384">arXiv:2501.01384</a> <span> [<a href="https://arxiv.org/pdf/2501.01384">pdf</a>, <a href="https://arxiv.org/format/2501.01384">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> OmniChat: Enhancing Spoken Dialogue Systems with Scalable Synthetic Data for Diverse Scenarios </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Cheng%2C+X">Xize Cheng</a>, <a href="/search/eess?searchtype=author&query=Fu%2C+D">Dongjie Fu</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+X">Xiaoda Yang</a>, <a href="/search/eess?searchtype=author&query=Fang%2C+M">Minghui Fang</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+R">Ruofan Hu</a>, <a href="/search/eess?searchtype=author&query=Lu%2C+J">Jingyu Lu</a>, <a href="/search/eess?searchtype=author&query=Jionghao%2C+B">Bai Jionghao</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Z">Zehan Wang</a>, <a href="/search/eess?searchtype=author&query=Ji%2C+S">Shengpeng Ji</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+R">Rongjie Huang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+L">Linjun Li</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yu Chen</a>, <a href="/search/eess?searchtype=author&query=Jin%2C+T">Tao Jin</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+Z">Zhou Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis 
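To make the discretization step concrete, here is a minimal sketch of vector-quantizing a frame-level pitch contour against a learned codebook, in the spirit of the pitch VQVAE mentioned above. The codebook values and shapes are illustrative assumptions, not taken from the paper, which learns its codebook end to end.

```python
import numpy as np

def quantize_pitch(f0, codebook):
    """Map each (log-)F0 frame to the index of its nearest codebook vector.

    f0:       array of shape (T, d) with frame-level pitch features
    codebook: array of shape (K, d) with learned code vectors
    Returns discrete pitch token IDs of shape (T,).
    """
    # Pairwise squared distances between frames and codes: (T, K)
    dists = ((f0[:, None, :] - codebook[None, :, :]) ** 2).sum(-1)
    return dists.argmin(axis=1)

# Toy usage with a hypothetical 8-entry, 1-D codebook of log-F0 values
codebook = np.linspace(np.log(80.0), np.log(400.0), 8)[:, None]
f0_frames = np.log(np.array([110.0, 220.0, 180.0, 95.0]))[:, None]
print(quantize_pitch(f0_frames, codebook))  # prints the 4 discrete pitch token IDs
```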
has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.01384v1-abstract-short" style="display: inline;"> With the rapid development of large language models, researchers have created increasingly advanced spoken dialogue systems that can naturally converse with humans. However, these systems still struggle to handle the full complexity of real-world conversations, including audio events, musical contexts, and emotional expressions, mainly because current dialogue datasets are constrained in both scal… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.01384v1-abstract-full').style.display = 'inline'; document.getElementById('2501.01384v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.01384v1-abstract-full" style="display: none;"> With the rapid development of large language models, researchers have created increasingly advanced spoken dialogue systems that can naturally converse with humans. However, these systems still struggle to handle the full complexity of real-world conversations, including audio events, musical contexts, and emotional expressions, mainly because current dialogue datasets are constrained in both scale and scenario diversity. In this paper, we propose leveraging synthetic data to enhance the dialogue models across diverse scenarios. We introduce ShareChatX, the first comprehensive, large-scale dataset for spoken dialogue that spans diverse scenarios. Based on this dataset, we introduce OmniChat, a multi-turn dialogue system with a heterogeneous feature fusion module, designed to optimize feature selection in different dialogue contexts. In addition, we explored critical aspects of training dialogue systems using synthetic data. Through comprehensive experimentation, we determined the ideal balance between synthetic and real data, achieving state-of-the-art results on the real-world dialogue dataset DailyTalk. We also highlight the crucial importance of synthetic data in tackling diverse, complex dialogue scenarios, especially those involving audio and music. For more details, please visit our demo page at \url{https://sharechatx.github.io/}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.01384v1-abstract-full').style.display = 'none'; document.getElementById('2501.01384v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
3. arXiv:2412.13917 (https://arxiv.org/abs/2412.13917) [pdf, other]
   Subjects: eess.AS (Audio and Speech Processing); cs.LG (Machine Learning); cs.SD (Sound); eess.SP (Signal Processing)
   Title: Speech Watermarking with Discrete Intermediate Representations
   Authors: Shengpeng Ji, Ziyue Jiang, Jialong Zuo, Minghui Fang, Yifu Chen, Tao Jin, Zhou Zhao
   Abstract: Speech watermarking techniques can proactively mitigate the potential harmful consequences of instant voice cloning techniques. These techniques involve the insertion of signals into speech that are imperceptible to humans but can be detected by algorithms. Previous approaches typically embed watermark messages into continuous space. However, intuitively, embedding watermark information into robust discrete latent space can significantly improve the robustness of watermarking systems. In this paper, we propose DiscreteWM, a novel speech watermarking framework that injects watermarks into the discrete intermediate representations of speech. Specifically, we map speech into discrete latent space with a vector-quantized autoencoder and inject watermarks by changing the modular arithmetic relation of discrete IDs. To ensure the imperceptibility of watermarks, we also propose a manipulator model to select the candidate tokens for watermark embedding. Experimental results demonstrate that our framework achieves state-of-the-art performance in robustness and imperceptibility, simultaneously. Moreover, our flexible frame-wise approach can serve as an efficient solution for both voice cloning detection and information hiding. Additionally, DiscreteWM can encode 1 to 150 bits of watermark information within a 1-second speech clip, indicating its encoding capacity. Audio samples are available at https://DiscreteWM.github.io/discrete_wm.
   Submitted 18 December, 2024; originally announced December 2024.
   Comments: Accepted by AAAI 2025
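As a rough illustration of what "changing the modular arithmetic relation of discrete IDs" could look like, the sketch below embeds and extracts one watermark bit per frame by nudging a token ID so that its value modulo 2 matches the bit. The codebook size, the choice of modulus, and the nudging rule are assumptions for illustration, not DiscreteWM's actual scheme.

```python
def embed_bits(token_ids, bits, codebook_size=1024):
    """Force each selected token ID to satisfy id % 2 == bit (illustrative rule)."""
    out = list(token_ids)
    for i, bit in enumerate(bits):
        if out[i] % 2 != bit:
            # Move to a neighbouring code with the right parity
            out[i] = out[i] + 1 if out[i] + 1 < codebook_size else out[i] - 1
    return out

def extract_bits(token_ids, n_bits):
    """Read the watermark back as the parity of the first n_bits token IDs."""
    return [tid % 2 for tid in token_ids[:n_bits]]

ids = [17, 402, 55, 900]          # hypothetical VQ token IDs for four frames
wm  = [0, 1, 1, 0]                # watermark message
marked = embed_bits(ids, wm)      # -> [18, 403, 55, 900]
assert extract_bits(marked, 4) == wm
```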
4. arXiv:2412.09808 (https://arxiv.org/abs/2412.09808) [pdf, other]
   Subjects: eess.SY (Systems and Control)
   Title: V2Sim: An Open-Source Microscopic V2G Simulation Platform in Urban Power and Transportation Network
   Authors: Tao Qian, Mingyu Fang, Qinran Hu, Chengcheng Shao, Junyi Zheng
   Abstract: This paper proposes V2Sim, an open-source, Python-based simulation platform designed for advanced vehicle-to-grid (V2G) analysis in coupled urban power and transportation networks. By integrating a microscopic urban transportation network (MUTN) with a power distribution network (PDN), V2Sim enables precise modeling of electric vehicle charging loads (EVCL) and dynamic V2G operations. The platform uniquely combines SUMO for MUTN simulations and an optimized DistFlow model for PDN analysis, with dedicated models for fast charging stations (FCS) and slow charging stations (SCS), capturing detailed charging dynamics often overlooked in existing simulation tools. V2Sim supports a range of customizable V2G strategies, advanced fault-sensing in charging stations, and parallel simulation through multi-processing to accelerate large-scale case studies. Case studies using a real-world MUTN from Nanjing, China, demonstrate V2Sim's capability to analyze the spatial-temporal distribution of EVCL and evaluate V2G impacts, such as fault dissemination and pricing variations, in unprecedented detail. Unlike traditional equilibrium models, V2Sim captures single-vehicle behavior and charging interactions at the microscopic level, offering unparalleled accuracy in assessing the operational and planning needs of V2G-compatible systems. This platform serves as a comprehensive tool for researchers and urban planners aiming to optimize integrated power and transportation networks.
   Submitted 12 December, 2024; originally announced December 2024.
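For context on the PDN side, the snippet below propagates voltages down a radial feeder with the standard linearized DistFlow (LinDistFlow) approximation. The feeder data are made up, and V2Sim's "optimized DistFlow model" may well differ from this textbook form.

```python
def lindistflow_voltages(v0_sq, branches):
    """Squared voltage magnitudes along a radial feeder (per unit).

    v0_sq:    squared voltage at the substation bus
    branches: list of (r, x, P, Q) per branch, ordered from the substation
              outward, where P, Q are the active/reactive flows on the branch.
    LinDistFlow recursion: v_j = v_i - 2 * (r*P + x*Q), losses neglected.
    """
    voltages = [v0_sq]
    for r, x, p, q in branches:
        voltages.append(voltages[-1] - 2.0 * (r * p + x * q))
    return voltages

# Hypothetical 3-branch feeder serving two charging stations (all per unit)
feeder = [(0.01, 0.02, 0.8, 0.3), (0.015, 0.03, 0.5, 0.2), (0.02, 0.04, 0.3, 0.1)]
print(lindistflow_voltages(1.0, feeder))  # [1.0, 0.972, 0.945, 0.925]
```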
5. arXiv:2411.14842 (https://arxiv.org/abs/2411.14842) [pdf, other]
   Subjects: cs.SD (Sound); cs.AI (Artificial Intelligence); eess.AS (Audio and Speech Processing)
   Title: Who Can Withstand Chat-Audio Attacks? An Evaluation Benchmark for Large Language Models
   Authors: Wanqi Yang, Yanda Li, Meng Fang, Yunchao Wei, Tianyi Zhou, Ling Chen
   Abstract: Adversarial audio attacks pose a significant threat to the growing use of large language models (LLMs) in voice-based human-machine interactions. While existing research has primarily focused on model-specific adversarial methods, real-world applications demand a more generalizable and universal approach to audio adversarial attacks. In this paper, we introduce the Chat-Audio Attacks (CAA) benchmark, which includes four distinct types of audio attacks and aims to explore the vulnerabilities of LLMs to these attacks in conversational scenarios. To evaluate the robustness of LLMs, we propose three evaluation strategies: Standard Evaluation, utilizing traditional metrics to quantify model performance under attacks; GPT-4o-Based Evaluation, which simulates real-world conversational complexities; and Human Evaluation, offering insights into user perception and trust. We evaluate six state-of-the-art LLMs with voice interaction capabilities, including Gemini-1.5-Pro, GPT-4o, and others, using three distinct evaluation methods on the CAA benchmark. Our comprehensive analysis reveals the impact of four types of audio attacks on the performance of these models, demonstrating that GPT-4o exhibits the highest level of resilience.
   Submitted 22 November, 2024; originally announced November 2024.

6. arXiv:2411.13577 (https://arxiv.org/abs/2411.13577) [pdf, other]
   Subjects: eess.AS (Audio and Speech Processing); cs.CL (Computation and Language); cs.LG (Machine Learning); cs.MM (Multimedia); cs.SD (Sound)
   Title: WavChat: A Survey of Spoken Dialogue Models
   Authors: Shengpeng Ji, Yifu Chen, Minghui Fang, Jialong Zuo, Jingyu Lu, Hanting Wang, Ziyue Jiang, Long Zhou, Shujie Liu, Xize Cheng, Xiaoda Yang, Zehan Wang, Qian Yang, Jian Li, Yidi Jiang, Jingzhen He, Yunfei Chu, Jin Xu, Zhou Zhao
   Abstract: Recent advancements in spoken dialogue models, exemplified by systems like GPT-4o, have captured significant attention in the speech domain. Compared to traditional three-tier cascaded spoken dialogue models that comprise speech recognition (ASR), large language models (LLMs), and text-to-speech (TTS), modern spoken dialogue models exhibit greater intelligence. These advanced spoken dialogue models not only comprehend audio, music, and other speech-related features, but also capture stylistic and timbral characteristics in speech. Moreover, they generate high-quality, multi-turn speech responses with low latency, enabling real-time interaction through simultaneous listening and speaking capability. Despite the progress in spoken dialogue systems, there is a lack of comprehensive surveys that systematically organize and analyze these systems and the underlying technologies. To address this, we have first compiled existing spoken dialogue systems in chronological order and categorized them into the cascaded and end-to-end paradigms. We then provide an in-depth overview of the core technologies in spoken dialogue models, covering aspects such as speech representation, training paradigm, streaming, duplex, and interaction capabilities. Each section discusses the limitations of these technologies and outlines considerations for future research. Additionally, we present a thorough review of relevant datasets, evaluation metrics, and benchmarks from the perspectives of training and evaluating spoken dialogue systems. We hope this survey will contribute to advancing both academic research and industrial applications in the field of spoken dialogue systems. The related material is available at https://github.com/jishengpeng/WavChat.
   Submitted 26 November, 2024; v1 submitted 14 November, 2024; originally announced November 2024.
   Comments: 60 pages, work in progress
7. arXiv:2411.01805 (https://arxiv.org/abs/2411.01805) [pdf, other]
   Subjects: cs.SD (Sound); cs.MM (Multimedia); eess.AS (Audio and Speech Processing)
   Title: MoMu-Diffusion: On Learning Long-Term Motion-Music Synchronization and Correspondence
   Authors: Fuming You, Minghui Fang, Li Tang, Rongjie Huang, Yongqi Wang, Zhou Zhao
   Abstract: Motion-to-music and music-to-motion have been studied separately, each attracting substantial research interest within their respective domains. The interaction between human motion and music is a reflection of advanced human intelligence, and establishing a unified relationship between them is particularly important. However, to date, there has been no work that considers them jointly to explore the modality alignment within. To bridge this gap, we propose a novel framework, termed MoMu-Diffusion, for long-term and synchronous motion-music generation. Firstly, to mitigate the huge computational costs raised by long sequences, we propose a novel Bidirectional Contrastive Rhythmic Variational Auto-Encoder (BiCoR-VAE) that extracts modality-aligned latent representations for both motion and music inputs. Subsequently, leveraging the aligned latent spaces, we introduce a multi-modal Transformer-based diffusion model and a cross-guidance sampling strategy to enable various generation tasks, including cross-modal, multi-modal, and variable-length generation. Extensive experiments demonstrate that MoMu-Diffusion surpasses recent state-of-the-art methods both qualitatively and quantitatively, and can synthesize realistic, diverse, long-term, and beat-matched music or motion sequences. The generated samples and codes are available at https://momu-diffusion.github.io/.
   Submitted 4 November, 2024; originally announced November 2024.
   Comments: NeurIPS 2024

8. arXiv:2410.21269 (https://arxiv.org/abs/2410.21269) [pdf, other]
   Subjects: cs.SD (Sound); cs.CV (Computer Vision and Pattern Recognition); cs.MM (Multimedia); eess.AS (Audio and Speech Processing)
   Title: OmniSep: Unified Omni-Modality Sound Separation with Query-Mixup
   Authors: Xize Cheng, Siqi Zheng, Zehan Wang, Minghui Fang, Ziang Zhang, Rongjie Huang, Ziyang Ma, Shengpeng Ji, Jialong Zuo, Tao Jin, Zhou Zhao
   Abstract: Scaling up has brought tremendous success in the fields of vision and language in recent years. When it comes to audio, however, researchers encounter a major challenge in scaling up the training data, as most natural audio contains diverse interfering signals. To address this limitation, we introduce Omni-modal Sound Separation (OmniSep), a novel framework capable of isolating clean soundtracks based on omni-modal queries, encompassing both single-modal and multi-modal composed queries. Specifically, we introduce the Query-Mixup strategy, which blends query features from different modalities during training. This enables OmniSep to optimize multiple modalities concurrently, effectively bringing all modalities under a unified framework for sound separation. We further enhance this flexibility by allowing queries to influence sound separation positively or negatively, facilitating the retention or removal of specific sounds as desired. Finally, OmniSep employs a retrieval-augmented approach known as Query-Aug, which enables open-vocabulary sound separation. Experimental evaluations on the MUSIC, VGGSOUND-CLEAN+, and MUSIC-CLEAN+ datasets demonstrate the effectiveness of OmniSep, achieving state-of-the-art performance in text-, image-, and audio-queried sound separation tasks. For samples and further information, please visit the demo page at https://omnisep.github.io/.
   Submitted 28 October, 2024; originally announced October 2024.
   Comments: Work in progress
9. arXiv:2409.00884 (https://arxiv.org/abs/2409.00884) [pdf]
   Subjects: eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition)
   Title: A Novel Hybrid Parameter-Efficient Fine-Tuning Approach for Hippocampus Segmentation and Alzheimer's Disease Diagnosis
   Authors: Wangang Cheng, Guanghua He, Keli Hu, Mingyu Fang, Liang Dong, Zhong Li, Hancan Zhu
   Abstract: Deep learning methods have significantly advanced medical image segmentation, yet their success hinges on large volumes of manually annotated data, which require specialized expertise for accurate labeling. Additionally, these methods often demand substantial computational resources, particularly for three-dimensional medical imaging tasks. Consequently, applying deep learning techniques for medical image segmentation with limited annotated data and computational resources remains a critical challenge. In this paper, we propose a novel parameter-efficient fine-tuning strategy, termed HyPS, which employs a hybrid parallel and serial architecture. HyPS updates a minimal subset of model parameters, thereby retaining the pre-trained model's original knowledge structure while enhancing its ability to learn specific features relevant to downstream tasks. We apply this strategy to the state-of-the-art SwinUNETR model for medical image segmentation. Initially, the model is pre-trained on the BraTs2021 dataset, after which the HyPS method is employed to transfer it to three distinct hippocampus datasets. Extensive experiments demonstrate that HyPS outperforms baseline methods, especially in scenarios with limited training samples. Furthermore, based on the segmentation results, we calculated the hippocampal volumes of subjects from the ADNI dataset and combined these with metadata to classify disease types. In distinguishing Alzheimer's disease (AD) from cognitively normal (CN) individuals, as well as early mild cognitive impairment (EMCI) from late mild cognitive impairment (LMCI), HyPS achieved classification accuracies of 83.78% and 64.29%, respectively. These findings indicate that the HyPS method not only facilitates effective hippocampal segmentation using pre-trained models but also holds potential for aiding Alzheimer's disease detection. Our code is publicly available.
   Submitted 1 September, 2024; originally announced September 2024.
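As a generic illustration of the parameter-efficient fine-tuning setting described above, the PyTorch sketch below freezes a pre-trained backbone, trains only a small added head, and reports the trainable-parameter fraction. The module names and sizes are placeholders, and HyPS's hybrid parallel/serial adapters are more involved than this freezing pattern.

```python
import torch.nn as nn

def make_peft_model(backbone: nn.Module, feat_dim: int, out_channels: int) -> nn.Module:
    """Freeze the pre-trained backbone and attach a small trainable head.

    A deliberately simplified stand-in for parameter-efficient fine-tuning:
    only the new head's weights receive gradients. Assumes the backbone
    outputs a flat feature vector of size feat_dim (hypothetical setup).
    """
    for p in backbone.parameters():
        p.requires_grad = False
    head = nn.Linear(feat_dim, out_channels)      # the only trainable part
    model = nn.Sequential(backbone, head)
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    print(f"trainable params: {trainable}/{total} ({100 * trainable / total:.2f}%)")
    return model
```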
href="/search/eess?searchtype=author&query=Zheng%2C+S">Siqi Zheng</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+W">Wen Wang</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+Z">Zhou Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.16532v2-abstract-short" style="display: inline;"> Language models have been effectively applied to modeling natural signals, such as images, video, speech, and audio. A crucial component of these models is the codec tokenizer, which compresses high-dimensional natural signals into lower-dimensional discrete tokens. In this paper, we introduce WavTokenizer, which offers several advantages over previous SOTA acoustic codec models in the audio domai… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.16532v2-abstract-full').style.display = 'inline'; document.getElementById('2408.16532v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.16532v2-abstract-full" style="display: none;"> Language models have been effectively applied to modeling natural signals, such as images, video, speech, and audio. A crucial component of these models is the codec tokenizer, which compresses high-dimensional natural signals into lower-dimensional discrete tokens. In this paper, we introduce WavTokenizer, which offers several advantages over previous SOTA acoustic codec models in the audio domain: 1)extreme compression. By compressing the layers of quantizers and the temporal dimension of the discrete codec, one-second audio of 24kHz sampling rate requires only a single quantizer with 40 or 75 tokens. 2)improved subjective quality. Despite the reduced number of tokens, WavTokenizer achieves state-of-the-art reconstruction quality with outstanding UTMOS scores and inherently contains richer semantic information. Specifically, we achieve these results by designing a broader VQ space, extended contextual windows, and improved attention networks, as well as introducing a powerful multi-scale discriminator and an inverse Fourier transform structure. We conducted extensive reconstruction experiments in the domains of speech, audio, and music. WavTokenizer exhibited strong performance across various objective and subjective metrics compared to state-of-the-art models. We also tested semantic information, VQ utilization, and adaptability to generative models. Comprehensive ablation studies confirm the necessity of each module in WavTokenizer. The related code, demos, and pre-trained models are available at https://github.com/jishengpeng/WavTokenizer. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.16532v2-abstract-full').style.display = 'none'; document.getElementById('2408.16532v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Working in progress</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.09304">arXiv:2406.09304</a> <span> [<a href="https://arxiv.org/pdf/2406.09304">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Applied Physics">physics.app-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Self-reconfigurable Multifunctional Memristive Nociceptor for Intelligent Robotics </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wang%2C+S">Shengbo Wang</a>, <a href="/search/eess?searchtype=author&query=Fang%2C+M">Mingchao Fang</a>, <a href="/search/eess?searchtype=author&query=Song%2C+L">Lekai Song</a>, <a href="/search/eess?searchtype=author&query=Li%2C+C">Cong Li</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+J">Jian Zhang</a>, <a href="/search/eess?searchtype=author&query=Nathan%2C+A">Arokia Nathan</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+G">Guohua Hu</a>, <a href="/search/eess?searchtype=author&query=Gao%2C+S">Shuo Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.09304v1-abstract-short" style="display: inline;"> Artificial nociceptors, mimicking human-like stimuli perception, are of significance for intelligent robotics to work in hazardous and dynamic scenarios. One of the most essential characteristics of the human nociceptor is its self-adjustable attribute, which indicates that the threshold of determination of a potentially hazardous stimulus relies on environmental knowledge. This critical attribute… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.09304v1-abstract-full').style.display = 'inline'; document.getElementById('2406.09304v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.09304v1-abstract-full" style="display: none;"> Artificial nociceptors, mimicking human-like stimuli perception, are of significance for intelligent robotics to work in hazardous and dynamic scenarios. One of the most essential characteristics of the human nociceptor is its self-adjustable attribute, which indicates that the threshold of determination of a potentially hazardous stimulus relies on environmental knowledge. This critical attribute has been currently omitted, but it is highly desired for artificial nociceptors. Inspired by these shortcomings, this article presents, for the first time, a Self-Directed Channel (SDC) memristor-based self-reconfigurable nociceptor, capable of perceiving hazardous pressure stimuli under different temperatures and demonstrates key features of tactile nociceptors, including 'threshold,' 'no-adaptation,' and 'sensitization.' The maximum amplification of hazardous external stimuli is 1000%, and its response characteristics dynamically adapt to current temperature conditions by automatically altering the generated modulation schemes for the memristor. 
The maximum difference ratio of the memristor response at different temperatures is 500%, and this adaptability closely mimics the behaviour of biological tactile nociceptors, resulting in accurate danger perception under various conditions. Beyond temperature adaptation, this memristor-based nociceptor has the potential to integrate different sensory modalities by applying various sensors, thereby achieving human-like perception capabilities in real-world environments.
Submitted 13 June, 2024; originally announced June 2024.
Comments: 14 pages, 4 figures

arXiv:2406.08835 [pdf, other] (https://arxiv.org/abs/2406.08835)
Subjects: Sound (cs.SD); Audio and Speech Processing (eess.AS)
Title: EffectiveASR: A Single-Step Non-Autoregressive Mandarin Speech Recognition Architecture with High Accuracy and Inference Speed
Authors: Ziyang Zhuang, Chenfeng Miao, Kun Zou, Ming Fang, Tao Wei, Zijian Li, Ning Cheng, Wei Hu, Shaojun Wang, Jing Xiao
Abstract: Non-autoregressive (NAR) automatic speech recognition (ASR) models predict tokens independently and simultaneously, bringing high inference speed. However, there is still a gap in accuracy between NAR models and autoregressive (AR) models. In this paper, we propose a single-step NAR ASR architecture with high accuracy and inference speed, called EffectiveASR.
It uses an Index Mapping Vector (IMV) based alignment generator to generate alignments during training, and an alignment predictor to learn the alignments for inference. It can be trained end-to-end (E2E) with cross-entropy loss combined with alignment loss. The proposed EffectiveASR achieves competitive results on the AISHELL-1 and AISHELL-2 Mandarin benchmarks compared to the leading models. Specifically, it achieves character error rates (CER) of 4.26%/4.62% on the AISHELL-1 dev/test sets, outperforming the AR Conformer with about 30x inference speedup.
Submitted 8 January, 2025; v1 submitted 13 June, 2024; originally announced June 2024.
Comments: Accepted by IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP) 2025
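The CER figures quoted for EffectiveASR follow the standard definition used for Mandarin ASR: character-level edit distance between hypothesis and reference, divided by the reference length. A minimal, self-contained sketch of that metric (a generic illustration, not the authors' evaluation code):

```python
def cer(reference: str, hypothesis: str) -> float:
    """Character error rate: Levenshtein distance / reference length."""
    dp = list(range(len(hypothesis) + 1))          # distances against the empty reference prefix
    for i, r in enumerate(reference, start=1):
        prev, dp[0] = dp[0], i
        for j, h in enumerate(hypothesis, start=1):
            cur = min(dp[j] + 1,                   # delete a reference character
                      dp[j - 1] + 1,               # insert a hypothesis character
                      prev + (r != h))             # substitute (free if the characters match)
            prev, dp[j] = dp[j], cur
    return dp[-1] / max(len(reference), 1)

print(cer("今天天气很好", "今天天汽很好"))   # one substitution in six characters -> ~0.167
```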
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP) 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.01205">arXiv:2406.01205</a> <span> [<a href="https://arxiv.org/pdf/2406.01205">pdf</a>, <a href="https://arxiv.org/format/2406.01205">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> ControlSpeech: Towards Simultaneous Zero-shot Speaker Cloning and Zero-shot Language Style Control With Decoupled Codec </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Ji%2C+S">Shengpeng Ji</a>, <a href="/search/eess?searchtype=author&query=Zuo%2C+J">Jialong Zuo</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+W">Wen Wang</a>, <a href="/search/eess?searchtype=author&query=Fang%2C+M">Minghui Fang</a>, <a href="/search/eess?searchtype=author&query=Zheng%2C+S">Siqi Zheng</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Q">Qian Chen</a>, <a href="/search/eess?searchtype=author&query=Jiang%2C+Z">Ziyue Jiang</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+H">Hai Huang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Z">Zehan Wang</a>, <a href="/search/eess?searchtype=author&query=Cheng%2C+X">Xize Cheng</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+Z">Zhou Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.01205v2-abstract-short" style="display: inline;"> In this paper, we present ControlSpeech, a text-to-speech (TTS) system capable of fully cloning the speaker's voice and enabling arbitrary control and adjustment of speaking style, merely based on a few seconds of audio prompt and a simple textual style description prompt. Prior zero-shot TTS models and controllable TTS models either could only mimic the speaker's voice without further control and… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.01205v2-abstract-full').style.display = 'inline'; document.getElementById('2406.01205v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.01205v2-abstract-full" style="display: none;"> In this paper, we present ControlSpeech, a text-to-speech (TTS) system capable of fully cloning the speaker's voice and enabling arbitrary control and adjustment of speaking style, merely based on a few seconds of audio prompt and a simple textual style description prompt. Prior zero-shot TTS models and controllable TTS models either could only mimic the speaker's voice without further control and adjustment capabilities or were unrelated to speaker-specific voice generation. Therefore, ControlSpeech focuses on a more challenging new task-a TTS system with controllable timbre, content, and style at the same time. 
ControlSpeech takes speech prompts, content prompts, and style prompts as inputs and uses bidirectional attention and mask-based parallel decoding to capture the corresponding codec representations in a discrete, decoupled codec space. Moreover, we identify a many-to-many mapping issue in text style controllability and propose the Style Mixture Semantic Density (SMSD) model to resolve it. The SMSD module, which is based on Gaussian mixture density networks, is designed to enhance the fine-grained partitioning and sampling of style semantic information and to generate speech with more diverse styles. For experiments, we release a controllable model toolkit called ControlToolkit, with a new style-controllable dataset and several reproduced baseline models, and we propose new metrics to evaluate both the control capability and the quality of the generated audio in ControlSpeech. Ablation studies validate that each component of ControlSpeech is necessary. We hope that ControlSpeech can establish the next foundational paradigm of controllable speech synthesis. The relevant code and demo are available at https://github.com/jishengpeng/ControlSpeech .
Submitted 22 October, 2024; v1 submitted 3 June, 2024; originally announced June 2024.

arXiv:2402.12208 [pdf, other] (https://arxiv.org/abs/2402.12208)
Subjects: Audio and Speech Processing (eess.AS); Sound (cs.SD)
Title: Language-Codec: Reducing the Gaps Between Discrete Codec Representation and Speech Language Models
Authors: Shengpeng Ji, Minghui Fang, Ziyue Jiang, Siqi Zheng, Qian Chen, Rongjie Huang, Jialung Zuo, Shulei Wang, Zhou Zhao
Abstract: In recent years, large language models have achieved significant success in generative tasks (e.g., speech cloning and audio generation) related to
speech, audio, music, and other signal domains. A crucial element of these models is the discrete acoustic codec, which serves as an intermediate representation replacing the mel-spectrogram. However, several gaps remain between discrete codecs and downstream speech language models. Specifically, 1) most codec models are trained on only 1,000 hours of data, whereas most speech language models are trained on 60,000 hours; 2) achieving good reconstruction performance requires many codebooks, which increases the burden on downstream speech language models; 3) the initial channel of the codebooks contains excessive information, making it challenging to directly generate acoustic tokens from weakly supervised signals such as text in downstream tasks. Consequently, leveraging the characteristics of speech language models, we propose Language-Codec. In Language-Codec, we introduce a Mask Channel Residual Vector Quantization (MCRVQ) mechanism along with improved Fourier transform structures and larger training datasets to address these gaps. We compare our method with competing audio compression algorithms and observe significant outperformance across extensive evaluations. Furthermore, we validate the efficiency of Language-Codec on downstream speech language models. The source code and pre-trained models can be accessed at https://github.com/jishengpeng/languagecodec .
Submitted 27 April, 2024; v1 submitted 19 February, 2024; originally announced February 2024.
Comments: We release a more powerful checkpoint in Language-Codec v3
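Language-Codec's central mechanism, MCRVQ, is a masked variant of residual vector quantization. As background only, the sketch below shows plain residual VQ, where each codebook quantizes the residual left by the previous stage; the channel masking that gives MCRVQ its name is not included, and all shapes and codebooks here are made-up toy values.

```python
import numpy as np

def residual_vq(frames: np.ndarray, codebooks: list[np.ndarray]):
    """Plain residual vector quantization: stage k quantizes what the first
    k-1 stages failed to reconstruct. Returns per-stage code indices and the
    reconstruction. (Background sketch only, not Language-Codec's MCRVQ.)"""
    residual = frames.copy()
    recon = np.zeros_like(frames)
    codes = []
    for cb in codebooks:                                 # cb: (codebook_size, dim)
        dists = np.linalg.norm(residual[:, None, :] - cb[None, :, :], axis=-1)
        idx = dists.argmin(axis=1)                       # nearest code per frame
        codes.append(idx)
        recon += cb[idx]
        residual -= cb[idx]                              # next stage sees the residual
    return codes, recon

# Toy usage: 10 frames of 8-dim features, two codebooks of 16 entries each.
rng = np.random.default_rng(0)
frames = rng.normal(size=(10, 8))
codebooks = [rng.normal(size=(16, 8)) for _ in range(2)]
codes, recon = residual_vq(frames, codebooks)
print(len(codes), np.mean((frames - recon) ** 2))        # 2 stages, residual error
```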
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">We release a more powerful checkpoint in Language-Codec v3</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.07676">arXiv:2402.07676</a> <span> [<a href="https://arxiv.org/pdf/2402.07676">pdf</a>, <a href="https://arxiv.org/format/2402.07676">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Applications">stat.AP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Mathematical Physics">math-ph</span> </div> </div> <p class="title is-5 mathjax"> Statistical modelling and Bayesian inversion for a Compton imaging system: application to radioactive source localisation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Tarpau%2C+C">Cecilia Tarpau</a>, <a href="/search/eess?searchtype=author&query=Fang%2C+M">Ming Fang</a>, <a href="/search/eess?searchtype=author&query=Zygalakis%2C+K+C">Konstantinos C. Zygalakis</a>, <a href="/search/eess?searchtype=author&query=Pereyra%2C+M">Marcelo Pereyra</a>, <a href="/search/eess?searchtype=author&query=Di+Fulvio%2C+A">Angela Di Fulvio</a>, <a href="/search/eess?searchtype=author&query=Altmann%2C+Y">Yoann Altmann</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.07676v2-abstract-short" style="display: inline;"> This paper presents a statistical forward model for a Compton imaging system, called Compton imager. This system, under development at the University of Illinois Urbana Champaign, is a variant of Compton cameras with a single type of sensors which can simultaneously act as scatterers and absorbers. This imager is convenient for imaging situations requiring a wide field of view. The proposed statis… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.07676v2-abstract-full').style.display = 'inline'; document.getElementById('2402.07676v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.07676v2-abstract-full" style="display: none;"> This paper presents a statistical forward model for a Compton imaging system, called Compton imager. This system, under development at the University of Illinois Urbana Champaign, is a variant of Compton cameras with a single type of sensors which can simultaneously act as scatterers and absorbers. This imager is convenient for imaging situations requiring a wide field of view. The proposed statistical forward model is then used to solve the inverse problem of estimating the location and energy of point-like sources from observed data. This inverse problem is formulated and solved in a Bayesian framework by using a Metropolis within Gibbs algorithm for the estimation of the location, and an expectation-maximization algorithm for the estimation of the energy. This approach leads to more accurate estimation when compared with the deterministic standard back-projection approach, with the additional benefit of uncertainty quantification in the low photon imaging setting. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.07676v2-abstract-full').style.display = 'none'; document.getElementById('2402.07676v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.14430">arXiv:2308.14430</a> <span> [<a href="https://arxiv.org/pdf/2308.14430">pdf</a>, <a href="https://arxiv.org/format/2308.14430">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/ICASSP48485.2024.10445879">10.1109/ICASSP48485.2024.10445879 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> TextrolSpeech: A Text Style Control Speech Corpus With Codec Language Text-to-Speech Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Ji%2C+S">Shengpeng Ji</a>, <a href="/search/eess?searchtype=author&query=Zuo%2C+J">Jialong Zuo</a>, <a href="/search/eess?searchtype=author&query=Fang%2C+M">Minghui Fang</a>, <a href="/search/eess?searchtype=author&query=Jiang%2C+Z">Ziyue Jiang</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+F">Feiyang Chen</a>, <a href="/search/eess?searchtype=author&query=Duan%2C+X">Xinyu Duan</a>, <a href="/search/eess?searchtype=author&query=Huai%2C+B">Baoxing Huai</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+Z">Zhou Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.14430v1-abstract-short" style="display: inline;"> Recently, there has been a growing interest in the field of controllable Text-to-Speech (TTS). While previous studies have relied on users providing specific style factor values based on acoustic knowledge or selecting reference speeches that meet certain requirements, generating speech solely from natural text prompts has emerged as a new challenge for researchers. This challenge arises due to th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.14430v1-abstract-full').style.display = 'inline'; document.getElementById('2308.14430v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.14430v1-abstract-full" style="display: none;"> Recently, there has been a growing interest in the field of controllable Text-to-Speech (TTS). 
While previous studies have relied on users providing specific style factor values based on acoustic knowledge, or on selecting reference speech that meets certain requirements, generating speech solely from natural text prompts has emerged as a new challenge for researchers. This challenge arises from the scarcity of high-quality speech datasets with natural-text style prompts and the absence of advanced text-controllable TTS models. In light of this, 1) we propose TextrolSpeech, the first large-scale speech emotion dataset annotated with rich text attributes. The dataset comprises 236,220 pairs of style prompts in natural text descriptions, covering five style factors, and the corresponding speech samples. Through iterative experimentation, we introduce a multi-stage prompt programming approach that effectively utilizes the GPT model to generate natural style descriptions in large volumes. 2) Furthermore, to address the need for generating audio with greater style diversity, we propose an efficient architecture called Salle. This architecture treats text-controllable TTS as a language model task, using audio codec codes as an intermediate representation in place of the conventional mel-spectrogram. Finally, we demonstrate the ability of the proposed model by showing comparable performance on the controllable TTS task. Audio samples are available at https://sall-e.github.io/
Submitted 28 August, 2023; originally announced August 2023.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.16750">arXiv:2306.16750</a> <span> [<a href="https://arxiv.org/pdf/2306.16750">pdf</a>, <a href="https://arxiv.org/format/2306.16750">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1007/978-3-031-43421-1_34">10.1007/978-3-031-43421-1_34 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Eigensubspace of Temporal-Difference Dynamics and How It Improves Value Approximation in Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=He%2C+Q">Qiang He</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+T">Tianyi Zhou</a>, <a href="/search/eess?searchtype=author&query=Fang%2C+M">Meng Fang</a>, <a href="/search/eess?searchtype=author&query=Maghsudi%2C+S">Setareh Maghsudi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.16750v2-abstract-short" style="display: inline;"> We propose a novel value approximation method, namely Eigensubspace Regularized Critic (ERC) for deep reinforcement learning (RL). ERC is motivated by an analysis of the dynamics of Q-value approximation error in the Temporal-Difference (TD) method, which follows a path defined by the 1-eigensubspace of the transition kernel associated with the Markov Decision Process (MDP). It reveals a fundament… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.16750v2-abstract-full').style.display = 'inline'; document.getElementById('2306.16750v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.16750v2-abstract-full" style="display: none;"> We propose a novel value approximation method, namely Eigensubspace Regularized Critic (ERC) for deep reinforcement learning (RL). ERC is motivated by an analysis of the dynamics of Q-value approximation error in the Temporal-Difference (TD) method, which follows a path defined by the 1-eigensubspace of the transition kernel associated with the Markov Decision Process (MDP). It reveals a fundamental property of TD learning that has remained unused in previous deep RL approaches. In ERC, we propose a regularizer that guides the approximation error tending towards the 1-eigensubspace, resulting in a more efficient and stable path of value approximation. Moreover, we theoretically prove the convergence of the ERC method. Besides, theoretical analysis and experiments demonstrate that ERC effectively reduces the variance of value functions. 
Among the 26 tasks in the DMControl benchmark, ERC outperforms state-of-the-art methods on 20. It also shows significant advantages in Q-value approximation and variance reduction. Our code is available at https://sites.google.com/view/erc-ecml23/.
Submitted 8 November, 2023; v1 submitted 29 June, 2023; originally announced June 2023.
Comments: Accepted to ECML23. Code: https://sites.google.com/view/erc-ecml23/

arXiv:2211.14312 [pdf] (https://arxiv.org/abs/2211.14312)
Subjects: Quantitative Methods (q-bio.QM); Computer Vision and Pattern Recognition (cs.CV); Machine Learning (cs.LG); Image and Video Processing (eess.IV)
Title: Karyotype AI for Precision Oncology
Authors: Zahra Shamsi, Drew Bryant, Jacob Wilson, Xiaoyu Qu, Avinava Dubey, Konik Kothari, Mostafa Dehghani, Mariya Chavarha, Valerii Likhosherstov, Brian Williams, Michael Frumkin, Fred Appelbaum, Krzysztof Choromanski, Ali Bashir, Min Fang
Abstract: Chromosome analysis is essential for diagnosing genetic disorders. For hematologic malignancies, identification of somatic clonal aberrations by karyotype analysis remains the standard of care.
However, karyotyping is costly and time-consuming because of the largely manual process and the expertise required to identify and annotate aberrations. Efforts to automate karyotype analysis have to date fallen short in aberration detection. Using a training set of ~10k patient specimens and ~50k karyograms collected over 5 years at the Fred Hutchinson Cancer Center, we created a labeled set of images representing individual chromosomes. These individual chromosomes were used to train and assess deep learning models for classifying the 24 human chromosomes and identifying chromosomal aberrations. The top-accuracy models utilized the recently introduced Topological Vision Transformers (TopViTs) with 2-level-block-Toeplitz masking to incorporate structural inductive bias. TopViT outperformed CNN (Inception) models, with >99.3% accuracy for chromosome identification and >99% accuracy for detecting most aberrations. Notably, we were able to show high-quality performance even in "few shot" learning scenarios. Incorporating the definition of clonality substantially improved both precision and recall (sensitivity). When applied to "zero shot" scenarios, the model captured aberrations without training, with perfect precision at >50% recall. Together, these results show that modern deep learning models can approach expert-level performance for chromosome aberration detection. To our knowledge, this is the first study demonstrating the downstream effectiveness of TopViTs. These results open up exciting opportunities for not only expediting patient results but also providing a scalable technology for early screening of low-abundance chromosomal lesions.
Submitted 19 October, 2023; v1 submitted 19 November, 2022; originally announced November 2022.
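The "perfect precision at >50% recall" figure quoted for the zero-shot setting is an operating point on a precision-recall curve. The sketch below shows how such a point can be read off with scikit-learn; the labels and scores are synthetic stand-ins, not the study's TopViT outputs.

```python
import numpy as np
from sklearn.metrics import precision_recall_curve

# Synthetic aberration labels and detector scores (stand-ins for model output).
rng = np.random.default_rng(2)
labels = rng.integers(0, 2, size=200)
scores = np.where(labels == 1,
                  rng.uniform(0.4, 1.0, 200),     # positives tend to score higher
                  rng.uniform(0.0, 0.7, 200))

precision, recall, _ = precision_recall_curve(labels, scores)
perfect = precision == 1.0
print("highest recall achievable at precision 1.0:", recall[perfect].max())
```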
arXiv:2210.16640 [pdf] (https://arxiv.org/abs/2210.16640)
Subjects: Image and Video Processing (eess.IV); Computer Vision and Pattern Recognition (cs.CV); Signal Processing (eess.SP); Quantitative Methods (q-bio.QM)
DOI: 10.1109/JBHI.2020.3002805
Title: 2D and 3D CT Radiomic Features Performance Comparison in Characterization of Gastric Cancer: A Multi-center Study
Authors: Lingwei Meng, Di Dong, Xin Chen, Mengjie Fang, Rongpin Wang, Jing Li, Zaiyi Liu, Jie Tian
Abstract: Objective: Radiomics, an emerging tool for medical image analysis, shows potential for precisely characterizing gastric cancer (GC). Whether to use one-slice 2D annotation or whole-volume 3D annotation remains a long-standing debate, especially for heterogeneous GC. We comprehensively compared the representation and discrimination capacity of 2D and 3D radiomic features for GC via three tasks. Methods: 539 GC patients from four centers were retrospectively enrolled and divided into training and validation cohorts. Radiomic features were extracted from 2D or 3D regions of interest (ROIs) annotated by radiologists.
Feature selection and model construction procedures were customized for each combination of the two modalities (2D or 3D) and the three tasks. Subsequently, six machine learning models (Model_2D^LNM, Model_3D^LNM; Model_2D^LVI, Model_3D^LVI; Model_2D^pT, Model_3D^pT) were derived and evaluated to reflect the modalities' performance in characterizing GC. Furthermore, we performed an auxiliary experiment to assess the modalities' performance when the resampling spacing differs. Results: For the three tasks, the areas under the curve (AUCs) were: Model_2D^LNM 0.712 (95% confidence interval, 0.613-0.811) vs. Model_3D^LNM 0.680 (0.584-0.775); Model_2D^LVI 0.677 (0.595-0.761) vs. Model_3D^LVI 0.615 (0.528-0.703); Model_2D^pT 0.840 (0.779-0.901) vs. Model_3D^pT 0.813 (0.747-0.879). Moreover, the auxiliary experiment indicated that the 2D models are statistically more advantageous than the 3D models across different resampling spacings. Conclusion: Models constructed with 2D radiomic features showed performance comparable to those constructed with 3D features in characterizing GC. Significance: Our work indicates that time-saving 2D annotation is the better choice for GC and provides a reference for further radiomics-based research.
Submitted 29 October, 2022; originally announced October 2022.
Comments: Published in IEEE Journal of Biomedical and Health Informatics
Journal ref: IEEE J. Biomed. Health Inf. 25 (2021) 755-763

arXiv:2204.13774 [pdf, other] (https://arxiv.org/abs/2204.13774)
Subjects: Instrumentation and Detectors (physics.ins-det); Image and Video Processing (eess.IV)
Title: Algorithms for TRISO Fuel Identification Based on X-ray CT Validated on Tungsten-Carbide Compacts
Authors: Ming Fang, Angela Di Fulvio
Abstract: Tristructural-isotropic (TRISO) fuel is one of the most mature fuel types for candidate advanced reactors under development. TRISO-fuel pebbles flow continuously through the reactor core and can be reinserted into the reactor several times until a target burnup is reached.
The capability of identifying individual fuel pebbles would allow us to calculate the fuel residence time in the core and validate pebble-flow computational models, prevent excessive burnup accumulation or premature fuel discharge, and maintain accountability of special nuclear materials during fuel circulation. In this work, we have developed a 3D image reconstruction and segmentation algorithm to accurately segment TRISO particles and extract their unique 3D distribution. We have also developed a rotation-invariant and noise-robust identification algorithm that allows us to identify a pebble and retrieve its ID in the presence of rotations and noise. We also report the results of 200 kV X-ray CT image reconstruction of a mock-up fuel sample consisting of tungsten-carbide (WC) kernels in a lucite matrix. The 3D distribution of TRISO particles, along with other signatures such as $^{235}$U enrichment and burnup level extracted through neutron multiplicity counting, would enable accurate fuel identification in a reasonable amount of time.
Submitted 28 April, 2022; originally announced April 2022.
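One generic way to obtain a rotation-invariant, noise-tolerant signature from the 3D arrangement of particles is a histogram of pairwise distances between particle centres, since distances are unchanged by rotation and translation. The sketch below illustrates that idea on mock data; it is not the identification algorithm developed in the paper, and all values are illustrative.

```python
import numpy as np

def pairwise_distance_signature(centres: np.ndarray, bins: int = 32, r_max: float = 2.0):
    """Histogram of pairwise distances between particle centres: invariant to
    rotation/translation, and mild positional noise only smears the bins."""
    diffs = centres[:, None, :] - centres[None, :, :]
    dists = np.linalg.norm(diffs, axis=-1)
    upper = dists[np.triu_indices(len(centres), k=1)]    # each pair counted once
    hist, _ = np.histogram(upper, bins=bins, range=(0.0, r_max), density=True)
    return hist

rng = np.random.default_rng(3)
centres = rng.uniform(-0.5, 0.5, size=(100, 3))          # mock particle centres

# A rotated copy of the same arrangement yields (numerically) the same signature.
theta = 0.7
rot_z = np.array([[np.cos(theta), -np.sin(theta), 0.0],
                  [np.sin(theta),  np.cos(theta), 0.0],
                  [0.0,            0.0,           1.0]])
sig_a = pairwise_distance_signature(centres)
sig_b = pairwise_distance_signature(centres @ rot_z.T)
print("max signature difference after rotation:", np.abs(sig_a - sig_b).max())
```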
arXiv:1909.11430 [pdf, other] (https://arxiv.org/abs/1909.11430)
Subjects: Computation and Language (cs.CL); Audio and Speech Processing (eess.AS)
Title: Breaking the Data Barrier: Towards Robust Speech Translation via Adversarial Stability Training
Authors: Qiao Cheng, Meiyuan Fang, Yaqian Han, Jin Huang, Yitao Duan
Abstract: In a pipeline speech translation system, the automatic speech recognition (ASR) component propagates recognition errors to the downstream machine translation (MT) system. A standard machine translation system is usually trained on a parallel corpus of clean text and performs poorly on text containing recognition noise, a gap well known in the speech translation community. In this paper, we propose a training architecture that aims to make a neural machine translation model more robust against speech recognition errors. Our approach addresses the encoder and the decoder simultaneously, using adversarial learning and data augmentation, respectively. Experimental results on the IWSLT2018 speech translation task show that our approach bridges the gap between ASR output and MT input, outperforming the baseline by up to 2.83 BLEU on noisy ASR output while maintaining close performance on clean text.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1909.11430v3-abstract-full').style.display = 'none'; document.getElementById('1909.11430v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 September, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at the 16th International Workshop on Spoken Language Translation (IWSLT 2019)</span> </p> </li> </ol> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 