Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 52 results for author: <span class="mathjax">He, Q</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/eess" aria-role="search"> Searching in archive <strong>eess</strong>. <a href="/search/?searchtype=author&query=He%2C+Q">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="He, Q"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=He%2C+Q&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="He, Q"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=He%2C+Q&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=He%2C+Q&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=He%2C+Q&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.15602">arXiv:2412.15602</a> <span> [<a href="https://arxiv.org/pdf/2412.15602">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Music Genre Classification: Ensemble Learning with Subcomponents-level Attention </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Liu%2C+Y">Yichen Liu</a>, <a href="/search/eess?searchtype=author&query=Dasgupta%2C+A">Abhijit Dasgupta</a>, <a href="/search/eess?searchtype=author&query=He%2C+Q">Qiwei He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.15602v1-abstract-short" style="display: inline;"> Music Genre Classification is one of the most popular topics in the fields of Music Information Retrieval (MIR) and digital signal processing. Deep Learning has emerged as the top performer for classifying music genres among various methods. The letter introduces a novel approach by combining ensemble learning with attention to sub-components, aiming to enhance the accuracy of identifying music ge… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.15602v1-abstract-full').style.display = 'inline'; document.getElementById('2412.15602v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.15602v1-abstract-full" style="display: none;"> Music Genre Classification is one of the most popular topics in the fields of Music Information Retrieval (MIR) and digital signal processing. Deep Learning has emerged as the top performer for classifying music genres among various methods. The letter introduces a novel approach by combining ensemble learning with attention to sub-components, aiming to enhance the accuracy of identifying music genres. The core innovation of our work is the proposal to classify the subcomponents of the music pieces separately, allowing our model to capture distinct characteristics from those sub components. By applying ensemble learning techniques to these individual classifications, we make the final classification decision on the genre of the music. The proposed method has superior advantages in terms of accuracy compared to the other state-of-the-art techniques trained and tested on the GTZAN dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.15602v1-abstract-full').style.display = 'none'; document.getElementById('2412.15602v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.06262">arXiv:2412.06262</a> <span> [<a href="https://arxiv.org/pdf/2412.06262">pdf</a>, <a href="https://arxiv.org/format/2412.06262">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> A Lightweight U-like Network Utilizing Neural Memory Ordinary Differential Equations for Slimming the Decoder </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=He%2C+Q">Quansong He</a>, <a href="/search/eess?searchtype=author&query=Yao%2C+X">Xiaojun Yao</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+J">Jun Wu</a>, <a href="/search/eess?searchtype=author&query=Yi%2C+Z">Zhang Yi</a>, <a href="/search/eess?searchtype=author&query=He%2C+T">Tao He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.06262v1-abstract-short" style="display: inline;"> In recent years, advanced U-like networks have demonstrated remarkable performance in medical image segmentation tasks. However, their drawbacks, including excessive parameters, high computational complexity, and slow inference speed, pose challenges for practical implementation in scenarios with limited computational resources. Existing lightweight U-like networks have alleviated some of these pr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.06262v1-abstract-full').style.display = 'inline'; document.getElementById('2412.06262v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.06262v1-abstract-full" style="display: none;"> In recent years, advanced U-like networks have demonstrated remarkable performance in medical image segmentation tasks. However, their drawbacks, including excessive parameters, high computational complexity, and slow inference speed, pose challenges for practical implementation in scenarios with limited computational resources. Existing lightweight U-like networks have alleviated some of these problems, but they often have pre-designed structures and consist of inseparable modules, limiting their application scenarios. In this paper, we propose three plug-and-play decoders by employing different discretization methods of the neural memory Ordinary Differential Equations (nmODEs). These decoders integrate features at various levels of abstraction by processing information from skip connections and performing numerical operations on upward path. Through experiments on the PH2, ISIC2017, and ISIC2018 datasets, we embed these decoders into different U-like networks, demonstrating their effectiveness in significantly reducing the number of parameters and FLOPs while maintaining performance. In summary, the proposed discretized nmODEs decoders are capable of reducing the number of parameters by about 20% ~ 50% and FLOPs by up to 74%, while possessing the potential to adapt to all U-like networks. Our code is available at https://github.com/nayutayuki/Lightweight-nmODE-Decoders-For-U-like-networks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.06262v1-abstract-full').style.display = 'none'; document.getElementById('2412.06262v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16331">arXiv:2411.16331</a> <span> [<a href="https://arxiv.org/pdf/2411.16331">pdf</a>, <a href="https://arxiv.org/format/2411.16331">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Sonic: Shifting Focus to Global Audio Perception in Portrait Animation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Ji%2C+X">Xiaozhong Ji</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+X">Xiaobin Hu</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+Z">Zhihong Xu</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+J">Junwei Zhu</a>, <a href="/search/eess?searchtype=author&query=Lin%2C+C">Chuming Lin</a>, <a href="/search/eess?searchtype=author&query=He%2C+Q">Qingdong He</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+J">Jiangning Zhang</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+D">Donghao Luo</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yi Chen</a>, <a href="/search/eess?searchtype=author&query=Lin%2C+Q">Qin Lin</a>, <a href="/search/eess?searchtype=author&query=Lu%2C+Q">Qinglin Lu</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+C">Chengjie Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16331v1-abstract-short" style="display: inline;"> The study of talking face generation mainly explores the intricacies of synchronizing facial movements and crafting visually appealing, temporally-coherent animations. However, due to the limited exploration of global audio perception, current approaches predominantly employ auxiliary visual and spatial knowledge to stabilize the movements, which often results in the deterioration of the naturalne… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16331v1-abstract-full').style.display = 'inline'; document.getElementById('2411.16331v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.16331v1-abstract-full" style="display: none;"> The study of talking face generation mainly explores the intricacies of synchronizing facial movements and crafting visually appealing, temporally-coherent animations. However, due to the limited exploration of global audio perception, current approaches predominantly employ auxiliary visual and spatial knowledge to stabilize the movements, which often results in the deterioration of the naturalness and temporal inconsistencies.Considering the essence of audio-driven animation, the audio signal serves as the ideal and unique priors to adjust facial expressions and lip movements, without resorting to interference of any visual signals. Based on this motivation, we propose a novel paradigm, dubbed as Sonic, to {s}hift f{o}cus on the exploration of global audio per{c}ept{i}o{n}.To effectively leverage global audio knowledge, we disentangle it into intra- and inter-clip audio perception and collaborate with both aspects to enhance overall perception.For the intra-clip audio perception, 1). \textbf{Context-enhanced audio learning}, in which long-range intra-clip temporal audio knowledge is extracted to provide facial expression and lip motion priors implicitly expressed as the tone and speed of speech. 2). \textbf{Motion-decoupled controller}, in which the motion of the head and expression movement are disentangled and independently controlled by intra-audio clips. Most importantly, for inter-clip audio perception, as a bridge to connect the intra-clips to achieve the global perception, \textbf{Time-aware position shift fusion}, in which the global inter-clip audio information is considered and fused for long-audio inference via through consecutively time-aware shifted windows. Extensive experiments demonstrate that the novel audio-driven paradigm outperform existing SOTA methodologies in terms of video quality, temporally consistency, lip synchronization precision, and motion diversity. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16331v1-abstract-full').style.display = 'none'; document.getElementById('2411.16331v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">refer to our main-page \url{https://jixiaozhong.github.io/Sonic/}</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04844">arXiv:2411.04844</a> <span> [<a href="https://arxiv.org/pdf/2411.04844">pdf</a>, <a href="https://arxiv.org/format/2411.04844">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Discretized Gaussian Representation for Tomographic Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wu%2C+S">Shaokai Wu</a>, <a href="/search/eess?searchtype=author&query=Lu%2C+Y">Yuxiang Lu</a>, <a href="/search/eess?searchtype=author&query=Ji%2C+W">Wei Ji</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+S">Suizhi Huang</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+F">Fengyu Yang</a>, <a href="/search/eess?searchtype=author&query=Sirejiding%2C+S">Shalayiding Sirejiding</a>, <a href="/search/eess?searchtype=author&query=He%2C+Q">Qichen He</a>, <a href="/search/eess?searchtype=author&query=Tong%2C+J">Jing Tong</a>, <a href="/search/eess?searchtype=author&query=Ji%2C+Y">Yanbiao Ji</a>, <a href="/search/eess?searchtype=author&query=Ding%2C+Y">Yue Ding</a>, <a href="/search/eess?searchtype=author&query=Lu%2C+H">Hongtao Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04844v2-abstract-short" style="display: inline;"> Computed Tomography (CT) is a widely used imaging technique that provides detailed cross-sectional views of objects. Over the past decade, Deep Learning-based Reconstruction (DLR) methods have led efforts to enhance image quality and reduce noise, yet they often require large amounts of data and are computationally intensive. Inspired by recent advancements in scene reconstruction, some approaches… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04844v2-abstract-full').style.display = 'inline'; document.getElementById('2411.04844v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04844v2-abstract-full" style="display: none;"> Computed Tomography (CT) is a widely used imaging technique that provides detailed cross-sectional views of objects. Over the past decade, Deep Learning-based Reconstruction (DLR) methods have led efforts to enhance image quality and reduce noise, yet they often require large amounts of data and are computationally intensive. Inspired by recent advancements in scene reconstruction, some approaches have adapted NeRF and 3D Gaussian Splatting (3DGS) techniques for CT reconstruction. However, these methods are not ideal for direct 3D volume reconstruction. In this paper, we reconsider the representation of CT reconstruction and propose a novel Discretized Gaussian Representation (DGR) specifically designed for CT. Unlike the popular 3D Gaussian Splatting, our representation directly reconstructs the 3D volume using a set of discretized Gaussian functions in an end-to-end manner. Additionally, we introduce a Fast Volume Reconstruction technique that efficiently aggregates the contributions of these Gaussians into a discretized volume. Extensive experiments on both real-world and synthetic datasets demonstrate the effectiveness of our method in improving reconstruction quality and computational efficiency. Our code has been provided for review purposes and will be made publicly available upon acceptance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04844v2-abstract-full').style.display = 'none'; document.getElementById('2411.04844v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20336">arXiv:2410.20336</a> <span> [<a href="https://arxiv.org/pdf/2410.20336">pdf</a>, <a href="https://arxiv.org/format/2410.20336">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Get Large Language Models Ready to Speak: A Late-fusion Approach for Speech Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Shen%2C+M">Maohao Shen</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+S">Shun Zhang</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+J">Jilong Wu</a>, <a href="/search/eess?searchtype=author&query=Xiu%2C+Z">Zhiping Xiu</a>, <a href="/search/eess?searchtype=author&query=AlBadawy%2C+E">Ehab AlBadawy</a>, <a href="/search/eess?searchtype=author&query=Lu%2C+Y">Yiting Lu</a>, <a href="/search/eess?searchtype=author&query=Seltzer%2C+M">Mike Seltzer</a>, <a href="/search/eess?searchtype=author&query=He%2C+Q">Qing He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20336v1-abstract-short" style="display: inline;"> Large language models (LLMs) have revolutionized natural language processing (NLP) with impressive performance across various text-based tasks. However, the extension of text-dominant LLMs to with speech generation tasks remains under-explored. In this work, we introduce a text-to-speech (TTS) system powered by a fine-tuned Llama model, named TTS-Llama, that achieves state-of-the-art speech synthe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20336v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20336v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20336v1-abstract-full" style="display: none;"> Large language models (LLMs) have revolutionized natural language processing (NLP) with impressive performance across various text-based tasks. However, the extension of text-dominant LLMs to with speech generation tasks remains under-explored. In this work, we introduce a text-to-speech (TTS) system powered by a fine-tuned Llama model, named TTS-Llama, that achieves state-of-the-art speech synthesis performance. Building on TTS-Llama, we further propose MoLE-Llama, a text-and-speech multimodal LLM developed through purely late-fusion parameter-efficient fine-tuning (PEFT) and a mixture-of-expert architecture. Extensive empirical results demonstrate MoLE-Llama's competitive performance on both text-only question-answering (QA) and TTS tasks, mitigating catastrophic forgetting issue in either modality. Finally, we further explore MoLE-Llama in text-in-speech-out QA tasks, demonstrating its great potential as a multimodal dialog system capable of speech generation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20336v1-abstract-full').style.display = 'none'; document.getElementById('2410.20336v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.19544">arXiv:2407.19544</a> <span> [<a href="https://arxiv.org/pdf/2407.19544">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Materials Science">cond-mat.mtrl-sci</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Deep Generative Models-Assisted Automated Labeling for Electron Microscopy Images Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Yuan%2C+W">Wenhao Yuan</a>, <a href="/search/eess?searchtype=author&query=Yao%2C+B">Bingqing Yao</a>, <a href="/search/eess?searchtype=author&query=Tan%2C+S">Shengdong Tan</a>, <a href="/search/eess?searchtype=author&query=You%2C+F">Fengqi You</a>, <a href="/search/eess?searchtype=author&query=He%2C+Q">Qian He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.19544v1-abstract-short" style="display: inline;"> The rapid advancement of deep learning has facilitated the automated processing of electron microscopy (EM) big data stacks. However, designing a framework that eliminates manual labeling and adapts to domain gaps remains challenging. Current research remains entangled in the dilemma of pursuing complete automation while still requiring simulations or slight manual annotations. Here we demonstrate… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.19544v1-abstract-full').style.display = 'inline'; document.getElementById('2407.19544v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.19544v1-abstract-full" style="display: none;"> The rapid advancement of deep learning has facilitated the automated processing of electron microscopy (EM) big data stacks. However, designing a framework that eliminates manual labeling and adapts to domain gaps remains challenging. Current research remains entangled in the dilemma of pursuing complete automation while still requiring simulations or slight manual annotations. Here we demonstrate tandem generative adversarial network (tGAN), a fully label-free and simulation-free pipeline capable of generating EM images for computer vision training. The tGAN can assimilate key features from new data stacks, thus producing a tailored virtual dataset for the training of automated EM analysis tools. Using segmenting nanoparticles for analyzing size distribution of supported catalysts as the demonstration, our findings showcased that the recognition accuracy of tGAN even exceeds the manually-labeling method by 5%. It can also be adaptively deployed to various data domains without further manual manipulation, which is verified by transfer learning from HAADF-STEM to BF-TEM. This generalizability may enable it to extend its application to a broader range of imaging characterizations, liberating microscopists and materials scientists from tedious dataset annotations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.19544v1-abstract-full').style.display = 'none'; document.getElementById('2407.19544v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.17758">arXiv:2407.17758</a> <span> [<a href="https://arxiv.org/pdf/2407.17758">pdf</a>, <a href="https://arxiv.org/format/2407.17758">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Speed-enhanced Subdomain Adaptation Regression for Long-term Stable Neural Decoding in Brain-computer Interfaces </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wei%2C+J">Jiyu Wei</a>, <a href="/search/eess?searchtype=author&query=Rong%2C+D">Dazhong Rong</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+X">Xinyun Zhu</a>, <a href="/search/eess?searchtype=author&query=He%2C+Q">Qinming He</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yueming Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.17758v1-abstract-short" style="display: inline;"> Brain-computer interfaces (BCIs) offer a means to convert neural signals into control signals, providing a potential restoration of movement for people with paralysis. Despite their promise, BCIs face a significant challenge in maintaining decoding accuracy over time due to neural nonstationarities. However, the decoding accuracy of BCI drops severely across days due to the neural data drift. Whil… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.17758v1-abstract-full').style.display = 'inline'; document.getElementById('2407.17758v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.17758v1-abstract-full" style="display: none;"> Brain-computer interfaces (BCIs) offer a means to convert neural signals into control signals, providing a potential restoration of movement for people with paralysis. Despite their promise, BCIs face a significant challenge in maintaining decoding accuracy over time due to neural nonstationarities. However, the decoding accuracy of BCI drops severely across days due to the neural data drift. While current recalibration techniques address this issue to a degree, they often fail to leverage the limited labeled data, to consider the signal correlation between two days, or to perform conditional alignment in regression tasks. This paper introduces a novel approach to enhance recalibration performance. We begin with preliminary experiments that reveal the temporal patterns of neural signal changes and identify three critical elements for effective recalibration: global alignment, conditional speed alignment, and feature-label consistency. Building on these insights, we propose the Speed-enhanced Subdomain Adaptation Regression (SSAR) framework, integrating semi-supervised learning with domain adaptation techniques in regression neural decoding. SSAR employs Speed-enhanced Subdomain Alignment (SeSA) for global and speed conditional alignment of similarly labeled data, with Contrastive Consistency Constraint (CCC) to enhance the alignment of SeSA by reinforcing feature-label consistency through contrastive learning. Our comprehensive set of experiments, both qualitative and quantitative, substantiate the superior recalibration performance and robustness of SSAR. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.17758v1-abstract-full').style.display = 'none'; document.getElementById('2407.17758v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.05368">arXiv:2407.05368</a> <span> [<a href="https://arxiv.org/pdf/2407.05368">pdf</a>, <a href="https://arxiv.org/format/2407.05368">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Music Era Recognition Using Supervised Contrastive Learning and Artist Information </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=He%2C+Q">Qiqi He</a>, <a href="/search/eess?searchtype=author&query=Song%2C+X">Xuchen Song</a>, <a href="/search/eess?searchtype=author&query=Hao%2C+W">Weituo Hao</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+J">Ju-Chiang Wang</a>, <a href="/search/eess?searchtype=author&query=Lu%2C+W">Wei-Tsung Lu</a>, <a href="/search/eess?searchtype=author&query=Li%2C+W">Wei Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.05368v1-abstract-short" style="display: inline;"> Does popular music from the 60s sound different than that of the 90s? Prior study has shown that there would exist some variations of patterns and regularities related to instrumentation changes and growing loudness across multi-decadal trends. This indicates that perceiving the era of a song from musical features such as audio and artist information is possible. Music era information can be an im… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.05368v1-abstract-full').style.display = 'inline'; document.getElementById('2407.05368v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.05368v1-abstract-full" style="display: none;"> Does popular music from the 60s sound different than that of the 90s? Prior study has shown that there would exist some variations of patterns and regularities related to instrumentation changes and growing loudness across multi-decadal trends. This indicates that perceiving the era of a song from musical features such as audio and artist information is possible. Music era information can be an important feature for playlist generation and recommendation. However, the release year of a song can be inaccessible in many circumstances. This paper addresses a novel task of music era recognition. We formulate the task as a music classification problem and propose solutions based on supervised contrastive learning. An audio-based model is developed to predict the era from audio. For the case where the artist information is available, we extend the audio-based model to take multimodal inputs and develop a framework, called MultiModal Contrastive (MMC) learning, to enhance the training. Experimental result on Million Song Dataset demonstrates that the audio-based model achieves 54% in accuracy with a tolerance of 3-years range; incorporating the artist information with the MMC framework for training leads to 9% improvement further. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.05368v1-abstract-full').style.display = 'none'; document.getElementById('2407.05368v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.08122">arXiv:2406.08122</a> <span> [<a href="https://arxiv.org/pdf/2406.08122">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Fully Few-shot Class-incremental Audio Classification Using Expandable Dual-embedding Extractor </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Si%2C+Y">Yongjie Si</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yanxiong Li</a>, <a href="/search/eess?searchtype=author&query=Li%2C+J">Jialong Li</a>, <a href="/search/eess?searchtype=author&query=Tan%2C+J">Jiaxin Tan</a>, <a href="/search/eess?searchtype=author&query=He%2C+Q">Qianhua He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.08122v1-abstract-short" style="display: inline;"> It's assumed that training data is sufficient in base session of few-shot class-incremental audio classification. However, it's difficult to collect abundant samples for model training in base session in some practical scenarios due to the data scarcity of some classes. This paper explores a new problem of fully few-shot class-incremental audio classification with few training samples in all sessi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.08122v1-abstract-full').style.display = 'inline'; document.getElementById('2406.08122v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.08122v1-abstract-full" style="display: none;"> It's assumed that training data is sufficient in base session of few-shot class-incremental audio classification. However, it's difficult to collect abundant samples for model training in base session in some practical scenarios due to the data scarcity of some classes. This paper explores a new problem of fully few-shot class-incremental audio classification with few training samples in all sessions. Moreover, we propose a method using expandable dual-embedding extractor to solve it. The proposed model consists of an embedding extractor and an expandable classifier. The embedding extractor consists of a pretrained Audio Spectrogram Transformer (AST) and a finetuned AST. The expandable classifier consists of prototypes and each prototype represents a class. Experiments are conducted on three datasets (LS-100, NSynth-100 and FSC-89). Results show that our method exceeds seven baseline ones in average accuracy with statistical significance. Code is at: https://github.com/YongjieSi/EDE. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.08122v1-abstract-full').style.display = 'none'; document.getElementById('2406.08122v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for publication on Interspeech 2024. 5 pages, 3 figures, 5 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.08119">arXiv:2406.08119</a> <span> [<a href="https://arxiv.org/pdf/2406.08119">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Low-Complexity Acoustic Scene Classification Using Parallel Attention-Convolution Network </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yanxiong Li</a>, <a href="/search/eess?searchtype=author&query=Tan%2C+J">Jiaxin Tan</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+G">Guoqing Chen</a>, <a href="/search/eess?searchtype=author&query=Li%2C+J">Jialong Li</a>, <a href="/search/eess?searchtype=author&query=Si%2C+Y">Yongjie Si</a>, <a href="/search/eess?searchtype=author&query=He%2C+Q">Qianhua He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.08119v1-abstract-short" style="display: inline;"> This work is an improved system that we submitted to task 1 of DCASE2023 challenge. We propose a method of low-complexity acoustic scene classification by a parallel attention-convolution network which consists of four modules, including pre-processing, fusion, global and local contextual information extraction. The proposed network is computationally efficient to capture global and local contextu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.08119v1-abstract-full').style.display = 'inline'; document.getElementById('2406.08119v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.08119v1-abstract-full" style="display: none;"> This work is an improved system that we submitted to task 1 of DCASE2023 challenge. We propose a method of low-complexity acoustic scene classification by a parallel attention-convolution network which consists of four modules, including pre-processing, fusion, global and local contextual information extraction. The proposed network is computationally efficient to capture global and local contextual information from each audio clip. In addition, we integrate other techniques into our method, such as knowledge distillation, data augmentation, and adaptive residual normalization. When evaluated on the official dataset of DCASE2023 challenge, our method obtains the highest accuracy of 56.10% with parameter number of 5.21 kilo and multiply-accumulate operations of 1.44 million. It exceeds the top two systems of DCASE2023 challenge in accuracy and complexity, and obtains state-of-the-art result. Code is at: https://github.com/Jessytan/Low-complexity-ASC. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.08119v1-abstract-full').style.display = 'none'; document.getElementById('2406.08119v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for publication on Interspeech 2024. 5 pages, 4 figures, 3 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.14210">arXiv:2405.14210</a> <span> [<a href="https://arxiv.org/pdf/2405.14210">pdf</a>, <a href="https://arxiv.org/format/2405.14210">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Eidos: Efficient, Imperceptible Adversarial 3D Point Clouds </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Hanwei Zhang</a>, <a href="/search/eess?searchtype=author&query=Cheng%2C+L">Luo Cheng</a>, <a href="/search/eess?searchtype=author&query=He%2C+Q">Qisong He</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+W">Wei Huang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+R">Renjue Li</a>, <a href="/search/eess?searchtype=author&query=Sicre%2C+R">Ronan Sicre</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+X">Xiaowei Huang</a>, <a href="/search/eess?searchtype=author&query=Hermanns%2C+H">Holger Hermanns</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+L">Lijun Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.14210v2-abstract-short" style="display: inline;"> Classification of 3D point clouds is a challenging machine learning (ML) task with important real-world applications in a spectrum from autonomous driving and robot-assisted surgery to earth observation from low orbit. As with other ML tasks, classification models are notoriously brittle in the presence of adversarial attacks. These are rooted in imperceptible changes to inputs with the effect tha… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.14210v2-abstract-full').style.display = 'inline'; document.getElementById('2405.14210v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.14210v2-abstract-full" style="display: none;"> Classification of 3D point clouds is a challenging machine learning (ML) task with important real-world applications in a spectrum from autonomous driving and robot-assisted surgery to earth observation from low orbit. As with other ML tasks, classification models are notoriously brittle in the presence of adversarial attacks. These are rooted in imperceptible changes to inputs with the effect that a seemingly well-trained model ends up misclassifying the input. This paper adds to the understanding of adversarial attacks by presenting Eidos, a framework providing Efficient Imperceptible aDversarial attacks on 3D pOint cloudS. Eidos supports a diverse set of imperceptibility metrics. It employs an iterative, two-step procedure to identify optimal adversarial examples, thereby enabling a runtime-imperceptibility trade-off. We provide empirical evidence relative to several popular 3D point cloud classification models and several established 3D attack methods, showing Eidos' superiority with respect to efficiency as well as imperceptibility. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.14210v2-abstract-full').style.display = 'none'; document.getElementById('2405.14210v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preprint</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.15984">arXiv:2401.15984</a> <span> [<a href="https://arxiv.org/pdf/2401.15984">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Medical Physics">physics.med-ph</span> </div> </div> <p class="title is-5 mathjax"> Choroidal thinning assessment through facial video analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=He%2C+Q">Qinghua He</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Y">Yi Zhang</a>, <a href="/search/eess?searchtype=author&query=Shen%2C+M">Mengxi Shen</a>, <a href="/search/eess?searchtype=author&query=Gregori%2C+G">Giovanni Gregori</a>, <a href="/search/eess?searchtype=author&query=Rosenfeld%2C+P+J">Philip J. Rosenfeld</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+R+K">Ruikang K. Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.15984v1-abstract-short" style="display: inline;"> Different features of skin are associated with various medical conditions and provide opportunities to evaluate and monitor body health. This study created a strategy to assess choroidal thinning through the video analysis of facial skin. Videos capturing the entire facial skin were collected from 48 participants with age-related macular degeneration (AMD) and 12 healthy individuals. These facial… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.15984v1-abstract-full').style.display = 'inline'; document.getElementById('2401.15984v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.15984v1-abstract-full" style="display: none;"> Different features of skin are associated with various medical conditions and provide opportunities to evaluate and monitor body health. This study created a strategy to assess choroidal thinning through the video analysis of facial skin. Videos capturing the entire facial skin were collected from 48 participants with age-related macular degeneration (AMD) and 12 healthy individuals. These facial videos were analyzed using video-based trans-angiosomes imaging photoplethysmography (TaiPPG) to generate facial imaging biomarkers that were correlated with choroidal thickness (CT) measurements. The CT of all patients was determined using swept-source optical coherence tomography (SS-OCT). The results revealed the relationship between relative blood pulsation amplitude (BPA) in three typical facial angiosomes (cheek, side-forehead and mid-forehead) and the average macular CT (r = 0.48, p < 0.001; r = -0.56, p < 0.001; r = -0.40, p < 0.01). When considering a diagnostic threshold of 200渭m, the newly developed facial video analysis tool effectively distinguished between cases of choroidal thinning and normal cases, yielding areas under the curve of 0.75, 0.79 and 0.69. These findings shed light on the connection between choroidal blood flow and facial skin hemodynamics, which suggests the potential for predicting vascular diseases through widely accessible skin imaging data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.15984v1-abstract-full').style.display = 'none'; document.getElementById('2401.15984v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.10460">arXiv:2401.10460</a> <span> [<a href="https://arxiv.org/pdf/2401.10460">pdf</a>, <a href="https://arxiv.org/format/2401.10460">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Ultra-lightweight Neural Differential DSP Vocoder For High Quality Speech Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Agrawal%2C+P">Prabhav Agrawal</a>, <a href="/search/eess?searchtype=author&query=Koehler%2C+T">Thilo Koehler</a>, <a href="/search/eess?searchtype=author&query=Xiu%2C+Z">Zhiping Xiu</a>, <a href="/search/eess?searchtype=author&query=Serai%2C+P">Prashant Serai</a>, <a href="/search/eess?searchtype=author&query=He%2C+Q">Qing He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.10460v1-abstract-short" style="display: inline;"> Neural vocoders model the raw audio waveform and synthesize high-quality audio, but even the highly efficient ones, like MB-MelGAN and LPCNet, fail to run real-time on a low-end device like a smartglass. A pure digital signal processing (DSP) based vocoder can be implemented via lightweight fast Fourier transforms (FFT), and therefore, is a magnitude faster than any neural vocoder. A DSP vocoder o… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.10460v1-abstract-full').style.display = 'inline'; document.getElementById('2401.10460v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.10460v1-abstract-full" style="display: none;"> Neural vocoders model the raw audio waveform and synthesize high-quality audio, but even the highly efficient ones, like MB-MelGAN and LPCNet, fail to run real-time on a low-end device like a smartglass. A pure digital signal processing (DSP) based vocoder can be implemented via lightweight fast Fourier transforms (FFT), and therefore, is a magnitude faster than any neural vocoder. A DSP vocoder often gets a lower audio quality due to consuming over-smoothed acoustic model predictions of approximate representations for the vocal tract. In this paper, we propose an ultra-lightweight differential DSP (DDSP) vocoder that uses a jointly optimized acoustic model with a DSP vocoder, and learns without an extracted spectral feature for the vocal tract. The model achieves audio quality comparable to neural vocoders with a high average MOS of 4.36 while being efficient as a DSP vocoder. Our C++ implementation, without any hardware-specific optimization, is at 15 MFLOPS, surpasses MB-MelGAN by 340 times in terms of FLOPS, and achieves a vocoder-only RTF of 0.003 and overall RTF of 0.044 while running single-threaded on a 2GHz Intel Xeon CPU. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.10460v1-abstract-full').style.display = 'none'; document.getElementById('2401.10460v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for ICASSP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.04394">arXiv:2401.04394</a> <span> [<a href="https://arxiv.org/pdf/2401.04394">pdf</a>, <a href="https://arxiv.org/format/2401.04394">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> SonicVisionLM: Playing Sound with Vision Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Xie%2C+Z">Zhifeng Xie</a>, <a href="/search/eess?searchtype=author&query=Yu%2C+S">Shengye Yu</a>, <a href="/search/eess?searchtype=author&query=He%2C+Q">Qile He</a>, <a href="/search/eess?searchtype=author&query=Li%2C+M">Mengtian Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.04394v3-abstract-short" style="display: inline;"> There has been a growing interest in the task of generating sound for silent videos, primarily because of its practicality in streamlining video post-production. However, existing methods for video-sound generation attempt to directly create sound from visual representations, which can be challenging due to the difficulty of aligning visual representations with audio representations. In this paper… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.04394v3-abstract-full').style.display = 'inline'; document.getElementById('2401.04394v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.04394v3-abstract-full" style="display: none;"> There has been a growing interest in the task of generating sound for silent videos, primarily because of its practicality in streamlining video post-production. However, existing methods for video-sound generation attempt to directly create sound from visual representations, which can be challenging due to the difficulty of aligning visual representations with audio representations. In this paper, we present SonicVisionLM, a novel framework aimed at generating a wide range of sound effects by leveraging vision-language models(VLMs). Instead of generating audio directly from video, we use the capabilities of powerful VLMs. When provided with a silent video, our approach first identifies events within the video using a VLM to suggest possible sounds that match the video content. This shift in approach transforms the challenging task of aligning image and audio into more well-studied sub-problems of aligning image-to-text and text-to-audio through the popular diffusion models. To improve the quality of audio recommendations with LLMs, we have collected an extensive dataset that maps text descriptions to specific sound effects and developed a time-controlled audio adapter. Our approach surpasses current state-of-the-art methods for converting video to audio, enhancing synchronization with the visuals, and improving alignment between audio and video components. Project page: https://yusiissy.github.io/SonicVisionLM.github.io/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.04394v3-abstract-full').style.display = 'none'; document.getElementById('2401.04394v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.00271">arXiv:2311.00271</a> <span> [<a href="https://arxiv.org/pdf/2311.00271">pdf</a>, <a href="https://arxiv.org/format/2311.00271">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> EdgeDis: Enabling Fast, Economical, and Reliable Data Dissemination for Mobile Edge Computing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+B">Bo Li</a>, <a href="/search/eess?searchtype=author&query=He%2C+Q">Qiang He</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+F">Feifei Chen</a>, <a href="/search/eess?searchtype=author&query=Lyu%2C+L">Lingjuan Lyu</a>, <a href="/search/eess?searchtype=author&query=Bouguettaya%2C+A">Athman Bouguettaya</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+Y">Yun Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.00271v1-abstract-short" style="display: inline;"> Mobile edge computing (MEC) enables web data caching in close geographic proximity to end users. Popular data can be cached on edge servers located less than hundreds of meters away from end users. This ensures bounded latency guarantees for various latency-sensitive web applications. However, transmitting a large volume of data out of the cloud onto many geographically-distributed web servers ind… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.00271v1-abstract-full').style.display = 'inline'; document.getElementById('2311.00271v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.00271v1-abstract-full" style="display: none;"> Mobile edge computing (MEC) enables web data caching in close geographic proximity to end users. Popular data can be cached on edge servers located less than hundreds of meters away from end users. This ensures bounded latency guarantees for various latency-sensitive web applications. However, transmitting a large volume of data out of the cloud onto many geographically-distributed web servers individually can be expensive. In addition, web content dissemination may be interrupted by various intentional and accidental events in the volatile MEC environment, which undermines dissemination efficiency and subsequently incurs extra transmission costs. To tackle the above challenges, we present a novel scheme named EdgeDis that coordinates data dissemination by distributed consensus among those servers. We analyze EdgeDis's validity theoretically and evaluate its performance experimentally. Results demonstrate that compared with baseline and state-of-the-art schemes, EdgeDis: 1) is 5.97x - 7.52x faster; 2) reduces dissemination costs by 48.21% to 91.87%; and 3) reduces performance loss caused by dissemination failures by up to 97.30% in time and 96.35% in costs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.00271v1-abstract-full').style.display = 'none'; document.getElementById('2311.00271v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.06746">arXiv:2308.06746</a> <span> [<a href="https://arxiv.org/pdf/2308.06746">pdf</a>, <a href="https://arxiv.org/ps/2308.06746">ps</a>, <a href="https://arxiv.org/format/2308.06746">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Self-supervised Noise2noise Method Utilizing Corrupted Images with a Modular Network for LDCT Denoising </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhu%2C+Y">Yuting Zhu</a>, <a href="/search/eess?searchtype=author&query=He%2C+Q">Qiang He</a>, <a href="/search/eess?searchtype=author&query=Yao%2C+Y">Yudong Yao</a>, <a href="/search/eess?searchtype=author&query=Teng%2C+Y">Yueyang Teng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.06746v1-abstract-short" style="display: inline;"> Deep learning is a very promising technique for low-dose computed tomography (LDCT) image denoising. However, traditional deep learning methods require paired noisy and clean datasets, which are often difficult to obtain. This paper proposes a new method for performing LDCT image denoising with only LDCT data, which means that normal-dose CT (NDCT) is not needed. We adopt a combination including t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.06746v1-abstract-full').style.display = 'inline'; document.getElementById('2308.06746v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.06746v1-abstract-full" style="display: none;"> Deep learning is a very promising technique for low-dose computed tomography (LDCT) image denoising. However, traditional deep learning methods require paired noisy and clean datasets, which are often difficult to obtain. This paper proposes a new method for performing LDCT image denoising with only LDCT data, which means that normal-dose CT (NDCT) is not needed. We adopt a combination including the self-supervised noise2noise model and the noisy-as-clean strategy. First, we add a second yet similar type of noise to LDCT images multiple times. Note that we use LDCT images based on the noisy-as-clean strategy for corruption instead of NDCT images. Then, the noise2noise model is executed with only the secondary corrupted images for training. We select a modular U-Net structure from several candidates with shared parameters to perform the task, which increases the receptive field without increasing the parameter size. The experimental results obtained on the Mayo LDCT dataset show the effectiveness of the proposed method compared with that of state-of-the-art deep learning methods. The developed code is available at https://github.com/XYuan01/Self-supervised-Noise2Noise-for-LDCT. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.06746v1-abstract-full').style.display = 'none'; document.getElementById('2308.06746v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.16750">arXiv:2306.16750</a> <span> [<a href="https://arxiv.org/pdf/2306.16750">pdf</a>, <a href="https://arxiv.org/format/2306.16750">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1007/978-3-031-43421-1_34">10.1007/978-3-031-43421-1_34 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Eigensubspace of Temporal-Difference Dynamics and How It Improves Value Approximation in Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=He%2C+Q">Qiang He</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+T">Tianyi Zhou</a>, <a href="/search/eess?searchtype=author&query=Fang%2C+M">Meng Fang</a>, <a href="/search/eess?searchtype=author&query=Maghsudi%2C+S">Setareh Maghsudi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.16750v2-abstract-short" style="display: inline;"> We propose a novel value approximation method, namely Eigensubspace Regularized Critic (ERC) for deep reinforcement learning (RL). ERC is motivated by an analysis of the dynamics of Q-value approximation error in the Temporal-Difference (TD) method, which follows a path defined by the 1-eigensubspace of the transition kernel associated with the Markov Decision Process (MDP). It reveals a fundament… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.16750v2-abstract-full').style.display = 'inline'; document.getElementById('2306.16750v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.16750v2-abstract-full" style="display: none;"> We propose a novel value approximation method, namely Eigensubspace Regularized Critic (ERC) for deep reinforcement learning (RL). ERC is motivated by an analysis of the dynamics of Q-value approximation error in the Temporal-Difference (TD) method, which follows a path defined by the 1-eigensubspace of the transition kernel associated with the Markov Decision Process (MDP). It reveals a fundamental property of TD learning that has remained unused in previous deep RL approaches. In ERC, we propose a regularizer that guides the approximation error tending towards the 1-eigensubspace, resulting in a more efficient and stable path of value approximation. Moreover, we theoretically prove the convergence of the ERC method. Besides, theoretical analysis and experiments demonstrate that ERC effectively reduces the variance of value functions. Among 26 tasks in the DMControl benchmark, ERC outperforms state-of-the-art methods for 20. Besides, it shows significant advantages in Q-value approximation and variance reduction. Our code is available at https://sites.google.com/view/erc-ecml23/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.16750v2-abstract-full').style.display = 'none'; document.getElementById('2306.16750v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ECML23. Code: https://sites.google.com/view/erc-ecml23/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.08903">arXiv:2306.08903</a> <span> [<a href="https://arxiv.org/pdf/2306.08903">pdf</a>, <a href="https://arxiv.org/format/2306.08903">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Two-Way Semantic Transmission of Images without Feedback </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Yu%2C+K">Kaiwen Yu</a>, <a href="/search/eess?searchtype=author&query=He%2C+Q">Qi He</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+G">Gang Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.08903v1-abstract-short" style="display: inline;"> As a competitive technology for 6G, semantic communications can significantly improve transmission efficiency. However, many existing semantic communication systems require information feedback during the training coding process, resulting in a significant communication overhead. In this article, we consider a two-way semantic communication (TW-SC) system, where information feedback can be omitted… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.08903v1-abstract-full').style.display = 'inline'; document.getElementById('2306.08903v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.08903v1-abstract-full" style="display: none;"> As a competitive technology for 6G, semantic communications can significantly improve transmission efficiency. However, many existing semantic communication systems require information feedback during the training coding process, resulting in a significant communication overhead. In this article, we consider a two-way semantic communication (TW-SC) system, where information feedback can be omitted by exploiting the weight reciprocity in the transceiver. Particularly, the channel simulator and semantic transceiver are implemented on both TW-SC nodes and the channel distribution is modeled by a conditional generative adversarial network. Simulation results demonstrate that the proposed TW-SC system performs closing to the state-of-the-art one-way semantic communication systems but requiring no feedback between the transceiver in the training process. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.08903v1-abstract-full').style.display = 'none'; document.getElementById('2306.08903v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.05621">arXiv:2306.05621</a> <span> [<a href="https://arxiv.org/pdf/2306.05621">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Acoustic Scene Clustering Using Joint Optimization of Deep Embedding Learning and Clustering Iteration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yanxiong Li</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+M">Mingle Liu</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+W">Wucheng Wang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Y">Yuhan Zhang</a>, <a href="/search/eess?searchtype=author&query=He%2C+Q">Qianhua He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.05621v1-abstract-short" style="display: inline;"> Recent efforts have been made on acoustic scene classification in the audio signal processing community. In contrast, few studies have been conducted on acoustic scene clustering, which is a newly emerging problem. Acoustic scene clustering aims at merging the audio recordings of the same class of acoustic scene into a single cluster without using prior information and training classifiers. In thi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.05621v1-abstract-full').style.display = 'inline'; document.getElementById('2306.05621v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.05621v1-abstract-full" style="display: none;"> Recent efforts have been made on acoustic scene classification in the audio signal processing community. In contrast, few studies have been conducted on acoustic scene clustering, which is a newly emerging problem. Acoustic scene clustering aims at merging the audio recordings of the same class of acoustic scene into a single cluster without using prior information and training classifiers. In this study, we propose a method for acoustic scene clustering that jointly optimizes the procedures of feature learning and clustering iteration. In the proposed method, the learned feature is a deep embedding that is extracted from a deep convolutional neural network (CNN), while the clustering algorithm is the agglomerative hierarchical clustering (AHC). We formulate a unified loss function for integrating and optimizing these two procedures. Various features and methods are compared. The experimental results demonstrate that the proposed method outperforms other unsupervised methods in terms of the normalized mutual information and the clustering accuracy. In addition, the deep embedding outperforms many state-of-the-art features. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.05621v1-abstract-full').style.display = 'none'; document.getElementById('2306.05621v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 6 figures, 11 tables. Accepted for publication in IEEE TMM</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.03730">arXiv:2306.03730</a> <span> [<a href="https://arxiv.org/pdf/2306.03730">pdf</a>, <a href="https://arxiv.org/format/2306.03730">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Modality-Agnostic Learning for Medical Image Segmentation Using Multi-modality Self-distillation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=He%2C+Q">Qisheng He</a>, <a href="/search/eess?searchtype=author&query=Summerfield%2C+N">Nicholas Summerfield</a>, <a href="/search/eess?searchtype=author&query=Dong%2C+M">Ming Dong</a>, <a href="/search/eess?searchtype=author&query=Glide-Hurst%2C+C">Carri Glide-Hurst</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.03730v1-abstract-short" style="display: inline;"> Medical image segmentation of tumors and organs at risk is a time-consuming yet critical process in the clinic that utilizes multi-modality imaging (e.g, different acquisitions, data types, and sequences) to increase segmentation precision. In this paper, we propose a novel framework, Modality-Agnostic learning through Multi-modality Self-dist-illation (MAG-MS), to investigate the impact of input… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.03730v1-abstract-full').style.display = 'inline'; document.getElementById('2306.03730v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.03730v1-abstract-full" style="display: none;"> Medical image segmentation of tumors and organs at risk is a time-consuming yet critical process in the clinic that utilizes multi-modality imaging (e.g, different acquisitions, data types, and sequences) to increase segmentation precision. In this paper, we propose a novel framework, Modality-Agnostic learning through Multi-modality Self-dist-illation (MAG-MS), to investigate the impact of input modalities on medical image segmentation. MAG-MS distills knowledge from the fusion of multiple modalities and applies it to enhance representation learning for individual modalities. Thus, it provides a versatile and efficient approach to handle limited modalities during testing. Our extensive experiments on benchmark datasets demonstrate the high efficiency of MAG-MS and its superior segmentation performance than current state-of-the-art methods. Furthermore, using MAG-MS, we provide valuable insight and guidance on selecting input modalities for medical image segmentation tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.03730v1-abstract-full').style.display = 'none'; document.getElementById('2306.03730v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.02054">arXiv:2306.02054</a> <span> [<a href="https://arxiv.org/pdf/2306.02054">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Low-Complexity Acoustic Scene Classification Using Data Augmentation and Lightweight ResNet </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yanxiong Li</a>, <a href="/search/eess?searchtype=author&query=Cao%2C+W">Wenchang Cao</a>, <a href="/search/eess?searchtype=author&query=Xie%2C+W">Wei Xie</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+Q">Qisheng Huang</a>, <a href="/search/eess?searchtype=author&query=Pang%2C+W">Wenfeng Pang</a>, <a href="/search/eess?searchtype=author&query=He%2C+Q">Qianhua He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.02054v1-abstract-short" style="display: inline;"> We present a work on low-complexity acoustic scene classification (ASC) with multiple devices, namely the subtask A of Task 1 of the DCASE2021 challenge. This subtask focuses on classifying audio samples of multiple devices with a low-complexity model, where two main difficulties need to be overcome. First, the audio samples are recorded by different devices, and there is mismatch of recording dev… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.02054v1-abstract-full').style.display = 'inline'; document.getElementById('2306.02054v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.02054v1-abstract-full" style="display: none;"> We present a work on low-complexity acoustic scene classification (ASC) with multiple devices, namely the subtask A of Task 1 of the DCASE2021 challenge. This subtask focuses on classifying audio samples of multiple devices with a low-complexity model, where two main difficulties need to be overcome. First, the audio samples are recorded by different devices, and there is mismatch of recording devices in audio samples. We reduce the negative impact of the mismatch of recording devices by using some effective strategies, including data augmentation (e.g., mix-up, spectrum correction, pitch shift), usages of multi-patch network structure and channel attention. Second, the model size should be smaller than a threshold (e.g., 128 KB required by the DCASE2021 challenge). To meet this condition, we adopt a ResNet with both depthwise separable convolution and channel attention as the backbone network, and perform model compression. In summary, we propose a low-complexity ASC method using data augmentation and a lightweight ResNet. Evaluated on the official development and evaluation datasets, our method obtains classification accuracy scores of 71.6% and 66.7%, respectively; and obtains Log-loss scores of 1.038 and 1.136, respectively. Our final model size is 110.3 KB which is smaller than the maximum of 128 KB. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.02054v1-abstract-full').style.display = 'none'; document.getElementById('2306.02054v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 5 figures, 4 tables. Accepted for publication in the 16th IEEE International Conference on Signal Processing (IEEE ICSP)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.02053">arXiv:2306.02053</a> <span> [<a href="https://arxiv.org/pdf/2306.02053">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Few-shot Class-incremental Audio Classification Using Stochastic Classifier </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yanxiong Li</a>, <a href="/search/eess?searchtype=author&query=Cao%2C+W">Wenchang Cao</a>, <a href="/search/eess?searchtype=author&query=Li%2C+J">Jialong Li</a>, <a href="/search/eess?searchtype=author&query=Xie%2C+W">Wei Xie</a>, <a href="/search/eess?searchtype=author&query=He%2C+Q">Qianhua He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.02053v1-abstract-short" style="display: inline;"> It is generally assumed that number of classes is fixed in current audio classification methods, and the model can recognize pregiven classes only. When new classes emerge, the model needs to be retrained with adequate samples of all classes. If new classes continually emerge, these methods will not work well and even infeasible. In this study, we propose a method for fewshot class-incremental aud… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.02053v1-abstract-full').style.display = 'inline'; document.getElementById('2306.02053v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.02053v1-abstract-full" style="display: none;"> It is generally assumed that number of classes is fixed in current audio classification methods, and the model can recognize pregiven classes only. When new classes emerge, the model needs to be retrained with adequate samples of all classes. If new classes continually emerge, these methods will not work well and even infeasible. In this study, we propose a method for fewshot class-incremental audio classification, which continually recognizes new classes and remember old ones. The proposed model consists of an embedding extractor and a stochastic classifier. The former is trained in base session and frozen in incremental sessions, while the latter is incrementally expanded in all sessions. Two datasets (NS-100 and LS-100) are built by choosing samples from audio corpora of NSynth and LibriSpeech, respectively. Results show that our method exceeds four baseline ones in average accuracy and performance dropping rate. Code is at https://github.com/vinceasvp/meta-sc. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.02053v1-abstract-full').style.display = 'none'; document.getElementById('2306.02053v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 3 figures, 4 tables. Accepted for publication in INTERSPEECH 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.19541">arXiv:2305.19541</a> <span> [<a href="https://arxiv.org/pdf/2305.19541">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TMM.2023.3253301">10.1109/TMM.2023.3253301 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Few-Shot Speaker Identification Using Lightweight Prototypical Network with Feature Grouping and Interaction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yanxiong Li</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+H">Hao Chen</a>, <a href="/search/eess?searchtype=author&query=Cao%2C+W">Wenchang Cao</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+Q">Qisheng Huang</a>, <a href="/search/eess?searchtype=author&query=He%2C+Q">Qianhua He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.19541v1-abstract-short" style="display: inline;"> Existing methods for few-shot speaker identification (FSSI) obtain high accuracy, but their computational complexities and model sizes need to be reduced for lightweight applications. In this work, we propose a FSSI method using a lightweight prototypical network with the final goal to implement the FSSI on intelligent terminals with limited resources, such as smart watches and smart speakers. In… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.19541v1-abstract-full').style.display = 'inline'; document.getElementById('2305.19541v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.19541v1-abstract-full" style="display: none;"> Existing methods for few-shot speaker identification (FSSI) obtain high accuracy, but their computational complexities and model sizes need to be reduced for lightweight applications. In this work, we propose a FSSI method using a lightweight prototypical network with the final goal to implement the FSSI on intelligent terminals with limited resources, such as smart watches and smart speakers. In the proposed prototypical network, an embedding module is designed to perform feature grouping for reducing the memory requirement and computational complexity, and feature interaction for enhancing the representational ability of the learned speaker embedding. In the proposed embedding module, audio feature of each speech sample is split into several low-dimensional feature subsets that are transformed by a recurrent convolutional block in parallel. Then, the operations of averaging, addition, concatenation, element-wise summation and statistics pooling are sequentially executed to learn a speaker embedding for each speech sample. The recurrent convolutional block consists of a block of bidirectional long short-term memory, and a block of de-redundancy convolution in which feature grouping and interaction are conducted too. Our method is compared to baseline methods on three datasets that are selected from three public speech corpora (VoxCeleb1, VoxCeleb2, and LibriSpeech). The results show that our method obtains higher accuracy under several conditions, and has advantages over all baseline methods in computational complexity and model size. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.19541v1-abstract-full').style.display = 'none'; document.getElementById('2305.19541v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 4 figures, 12 tables. Accepted for publication in IEEE TMM</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.18045">arXiv:2305.18045</a> <span> [<a href="https://arxiv.org/pdf/2305.18045">pdf</a>, <a href="https://arxiv.org/ps/2305.18045">ps</a>, <a href="https://arxiv.org/format/2305.18045">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Few-shot Class-incremental Audio Classification Using Adaptively-refined Prototypes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Xie%2C+W">Wei Xie</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yanxiong Li</a>, <a href="/search/eess?searchtype=author&query=He%2C+Q">Qianhua He</a>, <a href="/search/eess?searchtype=author&query=Cao%2C+W">Wenchang Cao</a>, <a href="/search/eess?searchtype=author&query=Virtanen%2C+T">Tuomas Virtanen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.18045v1-abstract-short" style="display: inline;"> New classes of sounds constantly emerge with a few samples, making it challenging for models to adapt to dynamic acoustic environments. This challenge motivates us to address the new problem of few-shot class-incremental audio classification. This study aims to enable a model to continuously recognize new classes of sounds with a few training samples of new classes while remembering the learned on… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.18045v1-abstract-full').style.display = 'inline'; document.getElementById('2305.18045v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.18045v1-abstract-full" style="display: none;"> New classes of sounds constantly emerge with a few samples, making it challenging for models to adapt to dynamic acoustic environments. This challenge motivates us to address the new problem of few-shot class-incremental audio classification. This study aims to enable a model to continuously recognize new classes of sounds with a few training samples of new classes while remembering the learned ones. To this end, we propose a method to generate discriminative prototypes and use them to expand the model's classifier for recognizing sounds of new and learned classes. The model is first trained with a random episodic training strategy, and then its backbone is used to generate the prototypes. A dynamic relation projection module refines the prototypes to enhance their discriminability. Results on two datasets (derived from the corpora of Nsynth and FSD-MIX-CLIPS) show that the proposed method exceeds three state-of-the-art methods in average accuracy and performance dropping rate. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.18045v1-abstract-full').style.display = 'none'; document.getElementById('2305.18045v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages,2 figures, Accepted by Interspeech 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.12197">arXiv:2303.12197</a> <span> [<a href="https://arxiv.org/pdf/2303.12197">pdf</a>, <a href="https://arxiv.org/format/2303.12197">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Self-Supervised Representations for Singing Voice Conversion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Jayashankar%2C+T">Tejas Jayashankar</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+J">Jilong Wu</a>, <a href="/search/eess?searchtype=author&query=Sari%2C+L">Leda Sari</a>, <a href="/search/eess?searchtype=author&query=Kant%2C+D">David Kant</a>, <a href="/search/eess?searchtype=author&query=Manohar%2C+V">Vimal Manohar</a>, <a href="/search/eess?searchtype=author&query=He%2C+Q">Qing He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.12197v1-abstract-short" style="display: inline;"> A singing voice conversion model converts a song in the voice of an arbitrary source singer to the voice of a target singer. Recently, methods that leverage self-supervised audio representations such as HuBERT and Wav2Vec 2.0 have helped further the state-of-the-art. Though these methods produce more natural and melodic singing outputs, they often rely on confusion and disentanglement losses to re… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.12197v1-abstract-full').style.display = 'inline'; document.getElementById('2303.12197v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.12197v1-abstract-full" style="display: none;"> A singing voice conversion model converts a song in the voice of an arbitrary source singer to the voice of a target singer. Recently, methods that leverage self-supervised audio representations such as HuBERT and Wav2Vec 2.0 have helped further the state-of-the-art. Though these methods produce more natural and melodic singing outputs, they often rely on confusion and disentanglement losses to render the self-supervised representations speaker and pitch-invariant. In this paper, we circumvent disentanglement training and propose a new model that leverages ASR fine-tuned self-supervised representations as inputs to a HiFi-GAN neural vocoder for singing voice conversion. We experiment with different f0 encoding schemes and show that an f0 harmonic generation module that uses a parallel bank of transposed convolutions (PBTC) alongside ASR fine-tuned Wav2Vec 2.0 features results in the best singing voice conversion quality. Additionally, the model is capable of making a spoken voice sing. We also show that a simple f0 shifting scheme during inference helps retain singer identity and bolsters the performance of our singing voice conversion model. Our results are backed up by extensive MOS studies that compare different ablations and baselines. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.12197v1-abstract-full').style.display = 'none'; document.getElementById('2303.12197v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.02277">arXiv:2303.02277</a> <span> [<a href="https://arxiv.org/pdf/2303.02277">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Augmented smartphone bilirubinometer enabled by a mobile app that turns smartphone into multispectral imager </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=He%2C+Q">Qinghua He</a>, <a href="/search/eess?searchtype=author&query=Li%2C+W">Wanyu Li</a>, <a href="/search/eess?searchtype=author&query=Shi%2C+Y">Yaping Shi</a>, <a href="/search/eess?searchtype=author&query=Yu%2C+Y">Yi Yu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Y">Yi Zhang</a>, <a href="/search/eess?searchtype=author&query=Geng%2C+W">Wenqian Geng</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+Z">Zhiyuan Sun</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+R+K">Ruikang K Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.02277v1-abstract-short" style="display: inline;"> We present the development of SpeCamX, a mobile application that transforms any unmodified smartphone into a powerful multispectral imager capable of capturing multispectral information. Our application includes an augmented bilirubinometer, enabling accurate prediction of blood bilirubin levels (BBL). In a clinical study involving 320 patients with liver diseases, we used SpeCamX to image the bul… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.02277v1-abstract-full').style.display = 'inline'; document.getElementById('2303.02277v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.02277v1-abstract-full" style="display: none;"> We present the development of SpeCamX, a mobile application that transforms any unmodified smartphone into a powerful multispectral imager capable of capturing multispectral information. Our application includes an augmented bilirubinometer, enabling accurate prediction of blood bilirubin levels (BBL). In a clinical study involving 320 patients with liver diseases, we used SpeCamX to image the bulbar conjunctiva region, and we employed a hybrid machine learning prediction model to predict BBL. We observed a high correlation with blood test results, demonstrating the efficacy of our approach. Furthermore, we compared our method, which uses spectrally augmented learning (SAL), with traditional learning based on RGB photographs (RGBL), and our results clearly indicate that SpeCamX outperforms RGBL in terms of prediction accuracy, efficiency, and stability. This study highlights the potential of SpeCamX to improve the prediction of bio-chromophores, and its ability to transform an ordinary smartphone into a powerful medical tool without the need for additional investments or expertise. This makes it suitable for widespread use, particularly in areas where medical resources are scarce. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.02277v1-abstract-full').style.display = 'none'; document.getElementById('2303.02277v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.00802">arXiv:2303.00802</a> <span> [<a href="https://arxiv.org/pdf/2303.00802">pdf</a>, <a href="https://arxiv.org/format/2303.00802">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Synthetic Cross-accent Data Augmentation for Automatic Speech Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Klumpp%2C+P">Philipp Klumpp</a>, <a href="/search/eess?searchtype=author&query=Chitkara%2C+P">Pooja Chitkara</a>, <a href="/search/eess?searchtype=author&query=Sar%C4%B1%2C+L">Leda Sar谋</a>, <a href="/search/eess?searchtype=author&query=Serai%2C+P">Prashant Serai</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+J">Jilong Wu</a>, <a href="/search/eess?searchtype=author&query=Veliche%2C+I">Irina-Elena Veliche</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+R">Rongqing Huang</a>, <a href="/search/eess?searchtype=author&query=He%2C+Q">Qing He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.00802v1-abstract-short" style="display: inline;"> The awareness for biased ASR datasets or models has increased notably in recent years. Even for English, despite a vast amount of available training data, systems perform worse for non-native speakers. In this work, we improve an accent-conversion model (ACM) which transforms native US-English speech into accented pronunciation. We include phonetic knowledge in the ACM training to provide accurate… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.00802v1-abstract-full').style.display = 'inline'; document.getElementById('2303.00802v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.00802v1-abstract-full" style="display: none;"> The awareness for biased ASR datasets or models has increased notably in recent years. Even for English, despite a vast amount of available training data, systems perform worse for non-native speakers. In this work, we improve an accent-conversion model (ACM) which transforms native US-English speech into accented pronunciation. We include phonetic knowledge in the ACM training to provide accurate feedback about how well certain pronunciation patterns were recovered in the synthesized waveform. Furthermore, we investigate the feasibility of learned accent representations instead of static embeddings. Generated data was then used to train two state-of-the-art ASR systems. We evaluated our approach on native and non-native English datasets and found that synthetically accented data helped the ASR to better understand speech from seen accents. This observation did not translate to unseen accents, and it was not observed for a model that had been pre-trained exclusively with native speech. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.00802v1-abstract-full').style.display = 'none'; document.getElementById('2303.00802v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2301.13402">arXiv:2301.13402</a> <span> [<a href="https://arxiv.org/pdf/2301.13402">pdf</a>, <a href="https://arxiv.org/format/2301.13402">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> ReGANIE: Rectifying GAN Inversion Errors for Accurate Real Image Editing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+B">Bingchuan Li</a>, <a href="/search/eess?searchtype=author&query=Ma%2C+T">Tianxiang Ma</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+P">Peng Zhang</a>, <a href="/search/eess?searchtype=author&query=Hua%2C+M">Miao Hua</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+W">Wei Liu</a>, <a href="/search/eess?searchtype=author&query=He%2C+Q">Qian He</a>, <a href="/search/eess?searchtype=author&query=Yi%2C+Z">Zili Yi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2301.13402v1-abstract-short" style="display: inline;"> The StyleGAN family succeed in high-fidelity image generation and allow for flexible and plausible editing of generated images by manipulating the semantic-rich latent style space.However, projecting a real image into its latent space encounters an inherent trade-off between inversion quality and editability. Existing encoder-based or optimization-based StyleGAN inversion methods attempt to mitiga… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.13402v1-abstract-full').style.display = 'inline'; document.getElementById('2301.13402v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2301.13402v1-abstract-full" style="display: none;"> The StyleGAN family succeed in high-fidelity image generation and allow for flexible and plausible editing of generated images by manipulating the semantic-rich latent style space.However, projecting a real image into its latent space encounters an inherent trade-off between inversion quality and editability. Existing encoder-based or optimization-based StyleGAN inversion methods attempt to mitigate the trade-off but see limited performance. To fundamentally resolve this problem, we propose a novel two-phase framework by designating two separate networks to tackle editing and reconstruction respectively, instead of balancing the two. Specifically, in Phase I, a W-space-oriented StyleGAN inversion network is trained and used to perform image inversion and editing, which assures the editability but sacrifices reconstruction quality. In Phase II, a carefully designed rectifying network is utilized to rectify the inversion errors and perform ideal reconstruction. Experimental results show that our approach yields near-perfect reconstructions without sacrificing the editability, thus allowing accurate manipulation of real images. Further, we evaluate the performance of our rectifying network, and see great generalizability towards unseen manipulation types and out-of-domain images. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.13402v1-abstract-full').style.display = 'none'; document.getElementById('2301.13402v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 January, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.13282">arXiv:2211.13282</a> <span> [<a href="https://arxiv.org/pdf/2211.13282">pdf</a>, <a href="https://arxiv.org/format/2211.13282">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Voice-preserving Zero-shot Multiple Accent Conversion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Jin%2C+M">Mumin Jin</a>, <a href="/search/eess?searchtype=author&query=Serai%2C+P">Prashant Serai</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+J">Jilong Wu</a>, <a href="/search/eess?searchtype=author&query=Tjandra%2C+A">Andros Tjandra</a>, <a href="/search/eess?searchtype=author&query=Manohar%2C+V">Vimal Manohar</a>, <a href="/search/eess?searchtype=author&query=He%2C+Q">Qing He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.13282v2-abstract-short" style="display: inline;"> Most people who have tried to learn a foreign language would have experienced difficulties understanding or speaking with a native speaker's accent. For native speakers, understanding or speaking a new accent is likewise a difficult task. An accent conversion system that changes a speaker's accent but preserves that speaker's voice identity, such as timbre and pitch, has the potential for a range… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.13282v2-abstract-full').style.display = 'inline'; document.getElementById('2211.13282v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.13282v2-abstract-full" style="display: none;"> Most people who have tried to learn a foreign language would have experienced difficulties understanding or speaking with a native speaker's accent. For native speakers, understanding or speaking a new accent is likewise a difficult task. An accent conversion system that changes a speaker's accent but preserves that speaker's voice identity, such as timbre and pitch, has the potential for a range of applications, such as communication, language learning, and entertainment. Existing accent conversion models tend to change the speaker identity and accent at the same time. Here, we use adversarial learning to disentangle accent dependent features while retaining other acoustic characteristics. What sets our work apart from existing accent conversion models is the capability to convert an unseen speaker's utterance to multiple accents while preserving its original voice identity. Subjective evaluations show that our model generates audio that sound closer to the target accent and like the original speaker. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.13282v2-abstract-full').style.display = 'none'; document.getElementById('2211.13282v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to IEEE ICASSP 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.03034">arXiv:2211.03034</a> <span> [<a href="https://arxiv.org/pdf/2211.03034">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Other Condensed Matter">cond-mat.other</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> An Integrated Constrained Gradient Descent (iCGD) Protocol to Correct Scan-Positional Errors for Electron Ptychography with High Accuracy and Precision </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Ning%2C+S">Shoucong Ning</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+W">Wenhui Xu</a>, <a href="/search/eess?searchtype=author&query=Loh%2C+L">Leyi Loh</a>, <a href="/search/eess?searchtype=author&query=Lu%2C+Z">Zhen Lu</a>, <a href="/search/eess?searchtype=author&query=Bosman%2C+M">Michel Bosman</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+F">Fucai Zhang</a>, <a href="/search/eess?searchtype=author&query=He%2C+Q">Qian He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.03034v1-abstract-short" style="display: inline;"> Correcting scan-positional errors is critical in achieving electron ptychography with both high resolution and high precision. This is a demanding and challenging task due to the sheer number of parameters that need to be optimized. For atomic-resolution ptychographic reconstructions, we found classical refining methods for scan positions not satisfactory due to the inherent entanglement between t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.03034v1-abstract-full').style.display = 'inline'; document.getElementById('2211.03034v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.03034v1-abstract-full" style="display: none;"> Correcting scan-positional errors is critical in achieving electron ptychography with both high resolution and high precision. This is a demanding and challenging task due to the sheer number of parameters that need to be optimized. For atomic-resolution ptychographic reconstructions, we found classical refining methods for scan positions not satisfactory due to the inherent entanglement between the object and scan positions, which can produce systematic errors in the results. Here, we propose a new protocol consisting of a series of constrained gradient descent (CGD) methods to achieve better recovery of scan positions. The central idea of these CGD methods is to utilize a priori knowledge about the nature of STEM experiments and add necessary constraints to isolate different types of scan positional errors during the iterative reconstruction process. Each constraint will be introduced with the help of simulated 4D-STEM datasets with known positional errors. Then the integrated constrained gradient decent (iCGD) protocol will be demonstrated using an experimental 4D-STEM dataset of the 1H-MoS2 monolayer. We will show that the iCGD protocol can effectively address the errors of scan positions across the spectrum and help to achieve electron ptychography with high accuracy and precision. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.03034v1-abstract-full').style.display = 'none'; document.getElementById('2211.03034v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2210.16045">arXiv:2210.16045</a> <span> [<a href="https://arxiv.org/pdf/2210.16045">pdf</a>, <a href="https://arxiv.org/format/2210.16045">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Towards zero-shot Text-based voice editing using acoustic context conditioning, utterance embeddings, and reference encoders </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Fong%2C+J">Jason Fong</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yun Wang</a>, <a href="/search/eess?searchtype=author&query=Agrawal%2C+P">Prabhav Agrawal</a>, <a href="/search/eess?searchtype=author&query=Manohar%2C+V">Vimal Manohar</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+J">Jilong Wu</a>, <a href="/search/eess?searchtype=author&query=K%C3%B6hler%2C+T">Thilo K枚hler</a>, <a href="/search/eess?searchtype=author&query=He%2C+Q">Qing He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2210.16045v1-abstract-short" style="display: inline;"> Text-based voice editing (TBVE) uses synthetic output from text-to-speech (TTS) systems to replace words in an original recording. Recent work has used neural models to produce edited speech that is similar to the original speech in terms of clarity, speaker identity, and prosody. However, one limitation of prior work is the usage of finetuning to optimise performance: this requires further model… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.16045v1-abstract-full').style.display = 'inline'; document.getElementById('2210.16045v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2210.16045v1-abstract-full" style="display: none;"> Text-based voice editing (TBVE) uses synthetic output from text-to-speech (TTS) systems to replace words in an original recording. Recent work has used neural models to produce edited speech that is similar to the original speech in terms of clarity, speaker identity, and prosody. However, one limitation of prior work is the usage of finetuning to optimise performance: this requires further model training on data from the target speaker, which is a costly process that may incorporate potentially sensitive data into server-side models. In contrast, this work focuses on the zero-shot approach which avoids finetuning altogether, and instead uses pretrained speaker verification embeddings together with a jointly trained reference encoder to encode utterance-level information that helps capture aspects such as speaker identity and prosody. Subjective listening tests find that both utterance embeddings and a reference encoder improve the continuity of speaker identity and prosody between the edited synthetic speech and unedited original recording in the zero-shot setting. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.16045v1-abstract-full').style.display = 'none'; document.getElementById('2210.16045v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to ICASSP 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2204.11180">arXiv:2204.11180</a> <span> [<a href="https://arxiv.org/pdf/2204.11180">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Few-Shot Speaker Identification Using Depthwise Separable Convolutional Network with Channel Attention </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yanxiong Li</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+W">Wucheng Wang</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+H">Hao Chen</a>, <a href="/search/eess?searchtype=author&query=Cao%2C+W">Wenchang Cao</a>, <a href="/search/eess?searchtype=author&query=Li%2C+W">Wei Li</a>, <a href="/search/eess?searchtype=author&query=He%2C+Q">Qianhua He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2204.11180v1-abstract-short" style="display: inline;"> Although few-shot learning has attracted much attention from the fields of image and audio classification, few efforts have been made on few-shot speaker identification. In the task of few-shot learning, overfitting is a tough problem mainly due to the mismatch between training and testing conditions. In this paper, we propose a few-shot speaker identification method which can alleviate the overfi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.11180v1-abstract-full').style.display = 'inline'; document.getElementById('2204.11180v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2204.11180v1-abstract-full" style="display: none;"> Although few-shot learning has attracted much attention from the fields of image and audio classification, few efforts have been made on few-shot speaker identification. In the task of few-shot learning, overfitting is a tough problem mainly due to the mismatch between training and testing conditions. In this paper, we propose a few-shot speaker identification method which can alleviate the overfitting problem. In the proposed method, the model of a depthwise separable convolutional network with channel attention is trained with a prototypical loss function. Experimental datasets are extracted from three public speech corpora: Aishell-2, VoxCeleb1 and TORGO. Experimental results show that the proposed method exceeds state-of-the-art methods for few-shot speaker identification in terms of accuracy and F-score. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.11180v1-abstract-full').style.display = 'none'; document.getElementById('2204.11180v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 April, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by Odyssey 2022 (The Speaker and Language Recognition Workshop 2022, Beijing, China)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2204.02839">arXiv:2204.02839</a> <span> [<a href="https://arxiv.org/pdf/2204.02839">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CCAT-NET: A Novel Transformer Based Semi-supervised Framework for Covid-19 Lung Lesion Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Liu%2C+M">Mingyang Liu</a>, <a href="/search/eess?searchtype=author&query=Xiao%2C+L">Li Xiao</a>, <a href="/search/eess?searchtype=author&query=Jiang%2C+H">Huiqin Jiang</a>, <a href="/search/eess?searchtype=author&query=He%2C+Q">Qing He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2204.02839v1-abstract-short" style="display: inline;"> The spread of the novel coronavirus disease 2019 (COVID-19) has claimed millions of lives. Automatic segmentation of lesions from CT images can assist doctors with screening, treatment, and monitoring. However, accurate segmentation of lesions from CT images can be very challenging due to data and model limitations. Recently, Transformer-based networks have attracted a lot of attention in the area… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.02839v1-abstract-full').style.display = 'inline'; document.getElementById('2204.02839v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2204.02839v1-abstract-full" style="display: none;"> The spread of the novel coronavirus disease 2019 (COVID-19) has claimed millions of lives. Automatic segmentation of lesions from CT images can assist doctors with screening, treatment, and monitoring. However, accurate segmentation of lesions from CT images can be very challenging due to data and model limitations. Recently, Transformer-based networks have attracted a lot of attention in the area of computer vision, as Transformer outperforms CNN at a bunch of tasks. In this work, we propose a novel network structure that combines CNN and Transformer for the segmentation of COVID-19 lesions. We further propose an efficient semi-supervised learning framework to address the shortage of labeled data. Extensive experiments showed that our proposed network outperforms most existing networks and the semi-supervised learning framework can outperform the base network by 3.0% and 8.2% in terms of Dice coefficient and sensitivity. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.02839v1-abstract-full').style.display = 'none'; document.getElementById('2204.02839v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 April, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2203.13420">arXiv:2203.13420</a> <span> [<a href="https://arxiv.org/pdf/2203.13420">pdf</a>, <a href="https://arxiv.org/format/2203.13420">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Automatic Song Translation for Tonal Languages </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Guo%2C+F">Fenfei Guo</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+C">Chen Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Z">Zhirui Zhang</a>, <a href="/search/eess?searchtype=author&query=He%2C+Q">Qixin He</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+K">Kejun Zhang</a>, <a href="/search/eess?searchtype=author&query=Xie%2C+J">Jun Xie</a>, <a href="/search/eess?searchtype=author&query=Boyd-Graber%2C+J">Jordan Boyd-Graber</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2203.13420v1-abstract-short" style="display: inline;"> This paper develops automatic song translation (AST) for tonal languages and addresses the unique challenge of aligning words' tones with melody of a song in addition to conveying the original meaning. We propose three criteria for effective AST -- preserving meaning, singability and intelligibility -- and design metrics for these criteria. We develop a new benchmark for English--Mandarin song tra… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.13420v1-abstract-full').style.display = 'inline'; document.getElementById('2203.13420v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2203.13420v1-abstract-full" style="display: none;"> This paper develops automatic song translation (AST) for tonal languages and addresses the unique challenge of aligning words' tones with melody of a song in addition to conveying the original meaning. We propose three criteria for effective AST -- preserving meaning, singability and intelligibility -- and design metrics for these criteria. We develop a new benchmark for English--Mandarin song translation and develop an unsupervised AST system, Guided AliGnment for Automatic Song Translation (GagaST), which combines pre-training with three decoding constraints. Both automatic and human evaluations show GagaST successfully balances semantics and singability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.13420v1-abstract-full').style.display = 'none'; document.getElementById('2203.13420v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at Findings of ACL 2022, 15 pages, 4 Tables and 10 Figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2202.06338">arXiv:2202.06338</a> <span> [<a href="https://arxiv.org/pdf/2202.06338">pdf</a>, <a href="https://arxiv.org/format/2202.06338">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/ICASSP43922.2022.9746919">10.1109/ICASSP43922.2022.9746919 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> DEEPCHORUS: A Hybrid Model of Multi-scale Convolution and Self-attention for Chorus Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=He%2C+Q">Qiqi He</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+X">Xiaoheng Sun</a>, <a href="/search/eess?searchtype=author&query=Yu%2C+Y">Yi Yu</a>, <a href="/search/eess?searchtype=author&query=Li%2C+W">Wei Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2202.06338v2-abstract-short" style="display: inline;"> Chorus detection is a challenging problem in musical signal processing as the chorus often repeats more than once in popular songs, usually with rich instruments and complex rhythm forms. Most of the existing works focus on the receptiveness of chorus sections based on some explicit features such as loudness and occurrence frequency. These pre-assumptions for chorus limit the generalization capaci… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.06338v2-abstract-full').style.display = 'inline'; document.getElementById('2202.06338v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2202.06338v2-abstract-full" style="display: none;"> Chorus detection is a challenging problem in musical signal processing as the chorus often repeats more than once in popular songs, usually with rich instruments and complex rhythm forms. Most of the existing works focus on the receptiveness of chorus sections based on some explicit features such as loudness and occurrence frequency. These pre-assumptions for chorus limit the generalization capacity of these methods, causing misdetection on other repeated sections such as verse. To solve the problem, in this paper we propose an end-to-end chorus detection model DeepChorus, reducing the engineering effort and the need for prior knowledge. The proposed model includes two main structures: i) a Multi-Scale Network to derive preliminary representations of chorus segments, and ii) a Self-Attention Convolution Network to further process the features into probability curves representing chorus presence. To obtain the final results, we apply an adaptive threshold to binarize the original curve. The experimental results show that DeepChorus outperforms existing state-of-the-art methods in most cases. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.06338v2-abstract-full').style.display = 'none'; document.getElementById('2202.06338v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 February, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICASSP 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2202.01564">arXiv:2202.01564</a> <span> [<a href="https://arxiv.org/pdf/2202.01564">pdf</a>, <a href="https://arxiv.org/format/2202.01564">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> </div> </div> <p class="title is-5 mathjax"> Weakly Supervised Nuclei Segmentation via Instance Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Liu%2C+W">Weizhen Liu</a>, <a href="/search/eess?searchtype=author&query=He%2C+Q">Qian He</a>, <a href="/search/eess?searchtype=author&query=He%2C+X">Xuming He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2202.01564v2-abstract-short" style="display: inline;"> Weakly supervised nuclei segmentation is a critical problem for pathological image analysis and greatly benefits the community due to the significant reduction of labeling cost. Adopting point annotations, previous methods mostly rely on less expressive representations for nuclei instances and thus have difficulty in handling crowded nuclei. In this paper, we propose to decouple weakly supervised… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.01564v2-abstract-full').style.display = 'inline'; document.getElementById('2202.01564v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2202.01564v2-abstract-full" style="display: none;"> Weakly supervised nuclei segmentation is a critical problem for pathological image analysis and greatly benefits the community due to the significant reduction of labeling cost. Adopting point annotations, previous methods mostly rely on less expressive representations for nuclei instances and thus have difficulty in handling crowded nuclei. In this paper, we propose to decouple weakly supervised semantic and instance segmentation in order to enable more effective subtask learning and to promote instance-aware representation learning. To achieve this, we design a modular deep network with two branches: a semantic proposal network and an instance encoding network, which are trained in a two-stage manner with an instance-sensitive loss. Empirical results show that our approach achieves the state-of-the-art performance on two public benchmarks of pathological images from different types of organs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.01564v2-abstract-full').style.display = 'none'; document.getElementById('2202.01564v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 February, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ISBI 2022 as Oral Presentation</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2112.03099">arXiv:2112.03099</a> <span> [<a href="https://arxiv.org/pdf/2112.03099">pdf</a>, <a href="https://arxiv.org/format/2112.03099">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> VocBench: A Neural Vocoder Benchmark for Speech Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=AlBadawy%2C+E+A">Ehab A. AlBadawy</a>, <a href="/search/eess?searchtype=author&query=Gibiansky%2C+A">Andrew Gibiansky</a>, <a href="/search/eess?searchtype=author&query=He%2C+Q">Qing He</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+J">Jilong Wu</a>, <a href="/search/eess?searchtype=author&query=Chang%2C+M">Ming-Ching Chang</a>, <a href="/search/eess?searchtype=author&query=Lyu%2C+S">Siwei Lyu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2112.03099v1-abstract-short" style="display: inline;"> Neural vocoders, used for converting the spectral representations of an audio signal to the waveforms, are a commonly used component in speech synthesis pipelines. It focuses on synthesizing waveforms from low-dimensional representation, such as Mel-Spectrograms. In recent years, different approaches have been introduced to develop such vocoders. However, it becomes more challenging to assess thes… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2112.03099v1-abstract-full').style.display = 'inline'; document.getElementById('2112.03099v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2112.03099v1-abstract-full" style="display: none;"> Neural vocoders, used for converting the spectral representations of an audio signal to the waveforms, are a commonly used component in speech synthesis pipelines. It focuses on synthesizing waveforms from low-dimensional representation, such as Mel-Spectrograms. In recent years, different approaches have been introduced to develop such vocoders. However, it becomes more challenging to assess these new vocoders and compare their performance to previous ones. To address this problem, we present VocBench, a framework that benchmark the performance of state-of-the art neural vocoders. VocBench uses a systematic study to evaluate different neural vocoders in a shared environment that enables a fair comparison between them. In our experiments, we use the same setup for datasets, training pipeline, and evaluation metrics for all neural vocoders. We perform a subjective and objective evaluation to compare the performance of each vocoder along a different axis. Our results demonstrate that the framework is capable of showing the competitive efficacy and the quality of the synthesized samples for each vocoder. VocBench framework is available at https://github.com/facebookresearch/vocoder-benchmark. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2112.03099v1-abstract-full').style.display = 'none'; document.getElementById('2112.03099v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 December, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear in icassp 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2111.02676">arXiv:2111.02676</a> <span> [<a href="https://arxiv.org/pdf/2111.02676">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Medical Physics">physics.med-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> A semi-automatic ultrasound image analysis system for the grading diagnosis of COVID-19 pneumonia </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yuanyuan Wang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Y">Yao Zhang</a>, <a href="/search/eess?searchtype=author&query=He%2C+Q">Qiong He</a>, <a href="/search/eess?searchtype=author&query=Liao%2C+H">Hongen Liao</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+J">Jianwen Luo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2111.02676v1-abstract-short" style="display: inline;"> This paper proposes a semi-automatic system based on quantitative characterization of the specific image patterns in lung ultrasound (LUS) images, in order to assess the lung conditions of patients with COVID-19 pneumonia, as well as to differentiate between the severe / and no-severe cases. Specifically, four parameters are extracted from each LUS image, namely the thickness (TPL) and roughness (… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.02676v1-abstract-full').style.display = 'inline'; document.getElementById('2111.02676v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2111.02676v1-abstract-full" style="display: none;"> This paper proposes a semi-automatic system based on quantitative characterization of the specific image patterns in lung ultrasound (LUS) images, in order to assess the lung conditions of patients with COVID-19 pneumonia, as well as to differentiate between the severe / and no-severe cases. Specifically, four parameters are extracted from each LUS image, namely the thickness (TPL) and roughness (RPL) of the pleural line, and the accumulated with (AWBL) and acoustic coefficient (ACBL) of B lines. 27 patients are enrolled in this study, which are grouped into 13 moderate patients, 7 severe patients and 7 critical patients. Furthermore, the severe and critical patients are regarded as the severe cases, and the moderate patients are regarded as the non-severe cases. Biomarkers among different groups are compared. Each single biomarker and a classifier with all the biomarkers as input are utilized for the binary diagnosis of severe case and non-severe case, respectively. The classifier achieves the best classification performance among all the compared methods (area under the receiver operating characteristics curve = 0.93, sensitivity = 0.93, specificity = 0.85). The proposed image analysis system could be potentially applied to the grading and prognosis evaluation of patients with COVID-19 pneumonia. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.02676v1-abstract-full').style.display = 'none'; document.getElementById('2111.02676v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2109.01607">arXiv:2109.01607</a> <span> [<a href="https://arxiv.org/pdf/2109.01607">pdf</a>, <a href="https://arxiv.org/format/2109.01607">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Musical Tempo Estimation Using a Multi-scale Network </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Sun%2C+X">Xiaoheng Sun</a>, <a href="/search/eess?searchtype=author&query=He%2C+Q">Qiqi He</a>, <a href="/search/eess?searchtype=author&query=Gao%2C+Y">Yongwei Gao</a>, <a href="/search/eess?searchtype=author&query=Li%2C+W">Wei Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2109.01607v1-abstract-short" style="display: inline;"> Recently, some single-step systems without onset detection have shown their effectiveness in automatic musical tempo estimation. Following the success of these systems, in this paper we propose a Multi-scale Grouped Attention Network to further explore the potential of such methods. A multi-scale structure is introduced as the overall network architecture where information from different scales is… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2109.01607v1-abstract-full').style.display = 'inline'; document.getElementById('2109.01607v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2109.01607v1-abstract-full" style="display: none;"> Recently, some single-step systems without onset detection have shown their effectiveness in automatic musical tempo estimation. Following the success of these systems, in this paper we propose a Multi-scale Grouped Attention Network to further explore the potential of such methods. A multi-scale structure is introduced as the overall network architecture where information from different scales is aggregated to strengthen contextual feature learning. Furthermore, we propose a Grouped Attention Module as the key component of the network. The proposed module separates the input feature into several groups along the frequency axis, which makes it capable of capturing long-range dependencies from different frequency positions on the spectrogram. In comparison experiments, the results on public datasets show that the proposed model outperforms existing state-of-the-art methods on Accuracy1. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2109.01607v1-abstract-full').style.display = 'none'; document.getElementById('2109.01607v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 September, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ISMIR 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2107.05604">arXiv:2107.05604</a> <span> [<a href="https://arxiv.org/pdf/2107.05604">pdf</a>, <a href="https://arxiv.org/format/2107.05604">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Direct speech-to-speech translation with discrete units </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Lee%2C+A">Ann Lee</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+P">Peng-Jen Chen</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+C">Changhan Wang</a>, <a href="/search/eess?searchtype=author&query=Gu%2C+J">Jiatao Gu</a>, <a href="/search/eess?searchtype=author&query=Popuri%2C+S">Sravya Popuri</a>, <a href="/search/eess?searchtype=author&query=Ma%2C+X">Xutai Ma</a>, <a href="/search/eess?searchtype=author&query=Polyak%2C+A">Adam Polyak</a>, <a href="/search/eess?searchtype=author&query=Adi%2C+Y">Yossi Adi</a>, <a href="/search/eess?searchtype=author&query=He%2C+Q">Qing He</a>, <a href="/search/eess?searchtype=author&query=Tang%2C+Y">Yun Tang</a>, <a href="/search/eess?searchtype=author&query=Pino%2C+J">Juan Pino</a>, <a href="/search/eess?searchtype=author&query=Hsu%2C+W">Wei-Ning Hsu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2107.05604v2-abstract-short" style="display: inline;"> We present a direct speech-to-speech translation (S2ST) model that translates speech from one language to speech in another language without relying on intermediate text generation. We tackle the problem by first applying a self-supervised discrete speech encoder on the target speech and then training a sequence-to-sequence speech-to-unit translation (S2UT) model to predict the discrete representa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2107.05604v2-abstract-full').style.display = 'inline'; document.getElementById('2107.05604v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2107.05604v2-abstract-full" style="display: none;"> We present a direct speech-to-speech translation (S2ST) model that translates speech from one language to speech in another language without relying on intermediate text generation. We tackle the problem by first applying a self-supervised discrete speech encoder on the target speech and then training a sequence-to-sequence speech-to-unit translation (S2UT) model to predict the discrete representations of the target speech. When target text transcripts are available, we design a joint speech and text training framework that enables the model to generate dual modality output (speech and text) simultaneously in the same inference pass. Experiments on the Fisher Spanish-English dataset show that the proposed framework yields improvement of 6.7 BLEU compared with a baseline direct S2ST model that predicts spectrogram features. When trained without any text transcripts, our model performance is comparable to models that predict spectrograms and are trained with text supervision, showing the potential of our system for translation between unwritten languages. Audio samples are available at https://facebookresearch.github.io/speech_translation/direct_s2st_units/index.html . <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2107.05604v2-abstract-full').style.display = 'none'; document.getElementById('2107.05604v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 July, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ACL 2022 (long paper)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2105.03583">arXiv:2105.03583</a> <span> [<a href="https://arxiv.org/pdf/2105.03583">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Domestic activities clustering from audio recordings using convolutional capsule autoencoder network </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Lin%2C+Z">Ziheng Lin</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yanxiong Li</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+Z">Zhangjin Huang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+W">Wenhao Zhang</a>, <a href="/search/eess?searchtype=author&query=Tan%2C+Y">Yufeng Tan</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yichun Chen</a>, <a href="/search/eess?searchtype=author&query=He%2C+Q">Qianhua He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2105.03583v1-abstract-short" style="display: inline;"> Recent efforts have been made on domestic activities classification from audio recordings, especially the works submitted to the challenge of DCASE (Detection and Classification of Acoustic Scenes and Events) since 2018. In contrast, few studies were done on domestic activities clustering, which is a newly emerging problem. Domestic activities clustering from audio recordings aims at merging audio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2105.03583v1-abstract-full').style.display = 'inline'; document.getElementById('2105.03583v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2105.03583v1-abstract-full" style="display: none;"> Recent efforts have been made on domestic activities classification from audio recordings, especially the works submitted to the challenge of DCASE (Detection and Classification of Acoustic Scenes and Events) since 2018. In contrast, few studies were done on domestic activities clustering, which is a newly emerging problem. Domestic activities clustering from audio recordings aims at merging audio clips which belong to the same class of domestic activity into a single cluster. Domestic activities clustering is an effective way for unsupervised estimation of daily activities performed in home environment. In this study, we propose a method for domestic activities clustering using a convolutional capsule autoencoder network (CCAN). In the method, the deep embeddings are learned by the autoencoder in the CCAN, while the deep embeddings which belong to the same class of domestic activities are merged into a single cluster by a clustering layer in the CCAN. Evaluated on a public dataset adopted in DCASE-2018 Task 5, the results show that the proposed method outperforms state-of-the-art methods in terms of the metrics of clustering accuracy and normalized mutual information. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2105.03583v1-abstract-full').style.display = 'none'; document.getElementById('2105.03583v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 May, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 2 figures, 5 tables, Accepted by IEEE ICASSP 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2104.14332">arXiv:2104.14332</a> <span> [<a href="https://arxiv.org/pdf/2104.14332">pdf</a>, <a href="https://arxiv.org/format/2104.14332">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Hypernetwork Dismantling via Deep Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Yan%2C+D">Dengcheng Yan</a>, <a href="/search/eess?searchtype=author&query=Xie%2C+W">Wenxin Xie</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Y">Yiwen Zhang</a>, <a href="/search/eess?searchtype=author&query=He%2C+Q">Qiang He</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+Y">Yun Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2104.14332v2-abstract-short" style="display: inline;"> Network dismantling aims to degrade the connectivity of a network by removing an optimal set of nodes. It has been widely adopted in many real-world applications such as epidemic control and rumor containment. However, conventional methods usually focus on simple network modeling with only pairwise interactions, while group-wise interactions modeled by hypernetwork are ubiquitous and critical. In… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.14332v2-abstract-full').style.display = 'inline'; document.getElementById('2104.14332v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2104.14332v2-abstract-full" style="display: none;"> Network dismantling aims to degrade the connectivity of a network by removing an optimal set of nodes. It has been widely adopted in many real-world applications such as epidemic control and rumor containment. However, conventional methods usually focus on simple network modeling with only pairwise interactions, while group-wise interactions modeled by hypernetwork are ubiquitous and critical. In this work, we formulate the hypernetwork dismantling problem as a node sequence decision problem and propose a deep reinforcement learning (DRL)-based hypernetwork dismantling framework. Besides, we design a novel inductive hypernetwork embedding method to ensure the transferability to various real-world hypernetworks. Our framework first generates small-scale synthetic hypernetworks and embeds the nodes and hypernetworks into a low dimensional vector space to represent the action and state space in DRL, respectively. Then trial-and-error dismantling tasks are conducted by an agent on these synthetic hypernetworks, and the dismantling strategy is continuously optimized. Finally, the well-optimized strategy is applied to real-world hypernetwork dismantling tasks. Experimental results on five real-world hypernetworks demonstrate the effectiveness of our proposed framework. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.14332v2-abstract-full').style.display = 'none'; document.getElementById('2104.14332v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 April, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2104.00705">arXiv:2104.00705</a> <span> [<a href="https://arxiv.org/pdf/2104.00705">pdf</a>, <a href="https://arxiv.org/format/2104.00705">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Multi-rate attention architecture for fast streamable Text-to-speech spectrum modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=He%2C+Q">Qing He</a>, <a href="/search/eess?searchtype=author&query=Xiu%2C+Z">Zhiping Xiu</a>, <a href="/search/eess?searchtype=author&query=Koehler%2C+T">Thilo Koehler</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+J">Jilong Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2104.00705v1-abstract-short" style="display: inline;"> Typical high quality text-to-speech (TTS) systems today use a two-stage architecture, with a spectrum model stage that generates spectral frames and a vocoder stage that generates the actual audio. High-quality spectrum models usually incorporate the encoder-decoder architecture with self-attention or bi-directional long short-term (BLSTM) units. While these models can produce high quality speech,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.00705v1-abstract-full').style.display = 'inline'; document.getElementById('2104.00705v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2104.00705v1-abstract-full" style="display: none;"> Typical high quality text-to-speech (TTS) systems today use a two-stage architecture, with a spectrum model stage that generates spectral frames and a vocoder stage that generates the actual audio. High-quality spectrum models usually incorporate the encoder-decoder architecture with self-attention or bi-directional long short-term (BLSTM) units. While these models can produce high quality speech, they often incur O($L$) increase in both latency and real-time factor (RTF) with respect to input length $L$. In other words, longer inputs leads to longer delay and slower synthesis speed, limiting its use in real-time applications. In this paper, we propose a multi-rate attention architecture that breaks the latency and RTF bottlenecks by computing a compact representation during encoding and recurrently generating the attention vector in a streaming manner during decoding. The proposed architecture achieves high audio quality (MOS of 4.31 compared to groundtruth 4.48), low latency, and low RTF at the same time. Meanwhile, both latency and RTF of the proposed system stay constant regardless of input lengths, making it ideal for real-time applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.00705v1-abstract-full').style.display = 'none'; document.getElementById('2104.00705v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 April, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2011.12985">arXiv:2011.12985</a> <span> [<a href="https://arxiv.org/pdf/2011.12985">pdf</a>, <a href="https://arxiv.org/format/2011.12985">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> FBWave: Efficient and Scalable Neural Vocoders for Streaming Text-To-Speech on the Edge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wu%2C+B">Bichen Wu</a>, <a href="/search/eess?searchtype=author&query=He%2C+Q">Qing He</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+P">Peizhao Zhang</a>, <a href="/search/eess?searchtype=author&query=Koehler%2C+T">Thilo Koehler</a>, <a href="/search/eess?searchtype=author&query=Keutzer%2C+K">Kurt Keutzer</a>, <a href="/search/eess?searchtype=author&query=Vajda%2C+P">Peter Vajda</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2011.12985v1-abstract-short" style="display: inline;"> Nowadays more and more applications can benefit from edge-based text-to-speech (TTS). However, most existing TTS models are too computationally expensive and are not flexible enough to be deployed on the diverse variety of edge devices with their equally diverse computational capacities. To address this, we propose FBWave, a family of efficient and scalable neural vocoders that can achieve optimal… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2011.12985v1-abstract-full').style.display = 'inline'; document.getElementById('2011.12985v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2011.12985v1-abstract-full" style="display: none;"> Nowadays more and more applications can benefit from edge-based text-to-speech (TTS). However, most existing TTS models are too computationally expensive and are not flexible enough to be deployed on the diverse variety of edge devices with their equally diverse computational capacities. To address this, we propose FBWave, a family of efficient and scalable neural vocoders that can achieve optimal performance-efficiency trade-offs for different edge devices. FBWave is a hybrid flow-based generative model that combines the advantages of autoregressive and non-autoregressive models. It produces high quality audio and supports streaming during inference while remaining highly computationally efficient. Our experiments show that FBWave can achieve similar audio quality to WaveRNN while reducing MACs by 40x. More efficient variants of FBWave can achieve up to 109x fewer MACs while still delivering acceptable audio quality. Audio demos are available at https://bichenwu09.github.io/vocoder_demos. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2011.12985v1-abstract-full').style.display = 'none'; document.getElementById('2011.12985v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2020. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2011.12485">arXiv:2011.12485</a> <span> [<a href="https://arxiv.org/pdf/2011.12485">pdf</a>, <a href="https://arxiv.org/format/2011.12485">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> How to Train Neural Networks for Flare Removal </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wu%2C+Y">Yicheng Wu</a>, <a href="/search/eess?searchtype=author&query=He%2C+Q">Qiurui He</a>, <a href="/search/eess?searchtype=author&query=Xue%2C+T">Tianfan Xue</a>, <a href="/search/eess?searchtype=author&query=Garg%2C+R">Rahul Garg</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+J">Jiawen Chen</a>, <a href="/search/eess?searchtype=author&query=Veeraraghavan%2C+A">Ashok Veeraraghavan</a>, <a href="/search/eess?searchtype=author&query=Barron%2C+J+T">Jonathan T. Barron</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2011.12485v4-abstract-short" style="display: inline;"> When a camera is pointed at a strong light source, the resulting photograph may contain lens flare artifacts. Flares appear in a wide variety of patterns (halos, streaks, color bleeding, haze, etc.) and this diversity in appearance makes flare removal challenging. Existing analytical solutions make strong assumptions about the artifact's geometry or brightness, and therefore only work well on a sm… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2011.12485v4-abstract-full').style.display = 'inline'; document.getElementById('2011.12485v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2011.12485v4-abstract-full" style="display: none;"> When a camera is pointed at a strong light source, the resulting photograph may contain lens flare artifacts. Flares appear in a wide variety of patterns (halos, streaks, color bleeding, haze, etc.) and this diversity in appearance makes flare removal challenging. Existing analytical solutions make strong assumptions about the artifact's geometry or brightness, and therefore only work well on a small subset of flares. Machine learning techniques have shown success in removing other types of artifacts, like reflections, but have not been widely applied to flare removal due to the lack of training data. To solve this problem, we explicitly model the optical causes of flare either empirically or using wave optics, and generate semi-synthetic pairs of flare-corrupted and clean images. This enables us to train neural networks to remove lens flare for the first time. Experiments show our data synthesis approach is critical for accurate flare removal, and that models trained with our technique generalize well to real lens flares across different scenes, lighting conditions, and cameras. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2011.12485v4-abstract-full').style.display = 'none'; document.getElementById('2011.12485v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 November, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">A new version paper is uploaded</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2005.09082">arXiv:2005.09082</a> <span> [<a href="https://arxiv.org/pdf/2005.09082">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantum Physics">quant-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Quantum Noise of Kramers-Kronig Receiver </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+F">Fan Zhang</a>, <a href="/search/eess?searchtype=author&query=Zheng%2C+J">Jiayu Zheng</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+H">Haijun Kang</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+F">Fengxiao Sun</a>, <a href="/search/eess?searchtype=author&query=He%2C+Q">Qiongyi He</a>, <a href="/search/eess?searchtype=author&query=Su%2C+X">Xiaolong Su</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2005.09082v19-abstract-short" style="display: inline;"> The Kramers-Kronig (KK) receiver provides an efficient method to reconstruct the complex-valued optical field by means of intensity detection given a minimum-phase signal. In this paper, we analytically show that for detecting coherent states through measuring the minimum-phase signal, while keeping the radial quantum fluctuation the same as the balanced heterodyne detection does, the KK receiver… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2005.09082v19-abstract-full').style.display = 'inline'; document.getElementById('2005.09082v19-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2005.09082v19-abstract-full" style="display: none;"> The Kramers-Kronig (KK) receiver provides an efficient method to reconstruct the complex-valued optical field by means of intensity detection given a minimum-phase signal. In this paper, we analytically show that for detecting coherent states through measuring the minimum-phase signal, while keeping the radial quantum fluctuation the same as the balanced heterodyne detection does, the KK receiver can indirectly recover the tangential component with fluctuation equivalently reduced to 1/3 times the radial one at the decision time, by adopting the KK relations to utilize the information of the physically measured radial component of other time of the symbol period. In consequence, the KK receiver achieves 3/2 times the signal-to-noise ratio of balanced heterodyne detection, while presenting an asymmetric quantum fluctuation distribution depending on the time-varying phase. Therefore, the KK receiver provides a feasible scheme to reduce the quantum fluctuation for obtaining the selected component to 2/ 3 times that of physically measuring the same component of the coherent state. This work provides a physical insight of the KK receiver and should enrich the knowledge of electromagnetic noise in quantum optical measurement. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2005.09082v19-abstract-full').style.display = 'none'; document.getElementById('2005.09082v19-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 May, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2020. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2002.06758">arXiv:2002.06758</a> <span> [<a href="https://arxiv.org/pdf/2002.06758">pdf</a>, <a href="https://arxiv.org/format/2002.06758">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Interactive Text-to-Speech System via Joint Style Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Gao%2C+Y">Yang Gao</a>, <a href="/search/eess?searchtype=author&query=Zheng%2C+W">Weiyi Zheng</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+Z">Zhaojun Yang</a>, <a href="/search/eess?searchtype=author&query=Kohler%2C+T">Thilo Kohler</a>, <a href="/search/eess?searchtype=author&query=Fuegen%2C+C">Christian Fuegen</a>, <a href="/search/eess?searchtype=author&query=He%2C+Q">Qing He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2002.06758v2-abstract-short" style="display: inline;"> While modern TTS technologies have made significant advancements in audio quality, there is still a lack of behavior naturalness compared to conversing with people. We propose a style-embedded TTS system that generates styled responses based on the speech query style. To achieve this, the system includes a style extraction model that extracts a style embedding from the speech query, which is then… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2002.06758v2-abstract-full').style.display = 'inline'; document.getElementById('2002.06758v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2002.06758v2-abstract-full" style="display: none;"> While modern TTS technologies have made significant advancements in audio quality, there is still a lack of behavior naturalness compared to conversing with people. We propose a style-embedded TTS system that generates styled responses based on the speech query style. To achieve this, the system includes a style extraction model that extracts a style embedding from the speech query, which is then used by the TTS to produce a matching response. We faced two main challenges: 1) only a small portion of the TTS training dataset has style labels, which is needed to train a multi-style TTS that respects different style embeddings during inference. 2) The TTS system and the style extraction model have disjoint training datasets. We need consistent style labels across these two datasets so that the TTS can learn to respect the labels produced by the style extraction model during inference. To solve these, we adopted a semi-supervised approach that uses the style extraction model to create style labels for the TTS dataset and applied transfer learning to learn the style embedding jointly. Our experiment results show user preference for the styled TTS responses and demonstrate the style-embedded TTS system's capability of mimicking the speech query style. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2002.06758v2-abstract-full').style.display = 'none'; document.getElementById('2002.06758v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 September, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 February, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by Interspeech 2020</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2001.10717">arXiv:2001.10717</a> <span> [<a href="https://arxiv.org/pdf/2001.10717">pdf</a>, <a href="https://arxiv.org/format/2001.10717">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Medical Physics">physics.med-ph</span> </div> </div> <p class="title is-5 mathjax"> Patient Specific Biomechanics Are Clinically Significant In Accurate Computer Aided Surgical Image Guidance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Barrow%2C+M">Michael Barrow</a>, <a href="/search/eess?searchtype=author&query=Chao%2C+A">Alice Chao</a>, <a href="/search/eess?searchtype=author&query=He%2C+Q">Qizhi He</a>, <a href="/search/eess?searchtype=author&query=Ramamoorthy%2C+S">Sonia Ramamoorthy</a>, <a href="/search/eess?searchtype=author&query=Sirlin%2C+C">Claude Sirlin</a>, <a href="/search/eess?searchtype=author&query=Kastner%2C+R">Ryan Kastner</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2001.10717v1-abstract-short" style="display: inline;"> Augmented Reality is used in Image Guided surgery (AR IG) to fuse surgical landmarks from preoperative images into a video overlay. Physical simulation is essential to maintaining accurate position of the landmarks as surgery progresses and ensuring patient safety by avoiding accidental damage to vessels etc. In liver procedures, AR IG simulation accuracy is hampered by an inability to model stiff… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2001.10717v1-abstract-full').style.display = 'inline'; document.getElementById('2001.10717v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2001.10717v1-abstract-full" style="display: none;"> Augmented Reality is used in Image Guided surgery (AR IG) to fuse surgical landmarks from preoperative images into a video overlay. Physical simulation is essential to maintaining accurate position of the landmarks as surgery progresses and ensuring patient safety by avoiding accidental damage to vessels etc. In liver procedures, AR IG simulation accuracy is hampered by an inability to model stiffness variations unique to the patients disease. We introduce a novel method to account for patient specific stiffness variation based on Magnetic Resonance Elastography (MRE) data. To the best of our knowledge we are the first to demonstrate the use of in-vivo biomechanical data for AR IG landmark placement. In this early work, a comparative evaluation of our MRE data driven simulation and the traditional method shows clinically significant differences in accuracy during landmark placement and motivates further animal model trials. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2001.10717v1-abstract-full').style.display = 'none'; document.getElementById('2001.10717v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 January, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 8 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1907.11377">arXiv:1907.11377</a> <span> [<a href="https://arxiv.org/pdf/1907.11377">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Deep Learning Detection of Inaccurate Smart Electricity Meters: A Case Study </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Liu%2C+M">Ming Liu</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+D">Dongpeng Liu</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+G">Guangyu Sun</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+Y">Yi Zhao</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+D">Duolin Wang</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+F">Fangxing Liu</a>, <a href="/search/eess?searchtype=author&query=Fang%2C+X">Xiang Fang</a>, <a href="/search/eess?searchtype=author&query=He%2C+Q">Qing He</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+D">Dong Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1907.11377v3-abstract-short" style="display: inline;"> Detecting inaccurate smart meters and targeting them for replacement can save significant resources. For this purpose, a novel deep-learning method was developed based on long short-term memory (LSTM) and a modified convolutional neural network (CNN) to predict electricity usage trajectories based on historical data. From the significant difference between the predicted trajectory and the observed… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1907.11377v3-abstract-full').style.display = 'inline'; document.getElementById('1907.11377v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1907.11377v3-abstract-full" style="display: none;"> Detecting inaccurate smart meters and targeting them for replacement can save significant resources. For this purpose, a novel deep-learning method was developed based on long short-term memory (LSTM) and a modified convolutional neural network (CNN) to predict electricity usage trajectories based on historical data. From the significant difference between the predicted trajectory and the observed one, the meters that cannot measure electricity accurately are located. In a case study, a proof of principle was demonstrated in detecting inaccurate meters with high accuracy for practical usage to prevent unnecessary replacement and increase the service life span of smart meters. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1907.11377v3-abstract-full').style.display = 'none'; document.getElementById('1907.11377v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 August, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 July, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2019. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1904.09978">arXiv:1904.09978</a> <span> [<a href="https://arxiv.org/pdf/1904.09978">pdf</a>, <a href="https://arxiv.org/format/1904.09978">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/BIBM.2009.40">10.1109/BIBM.2009.40 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> A Fast, Semi-Automatic Brain Structure Segmentation Algorithm for Magnetic Resonance Imaging </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Karsch%2C+K">Kevin Karsch</a>, <a href="/search/eess?searchtype=author&query=He%2C+Q">Qing He</a>, <a href="/search/eess?searchtype=author&query=Duan%2C+Y">Ye Duan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1904.09978v1-abstract-short" style="display: inline;"> Medical image segmentation has become an essential technique in clinical and research-oriented applications. Because manual segmentation methods are tedious, and fully automatic segmentation lacks the flexibility of human intervention or correction, semi-automatic methods have become the preferred type of medical image segmentation. We present a hybrid, semi-automatic segmentation method in 3D tha… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1904.09978v1-abstract-full').style.display = 'inline'; document.getElementById('1904.09978v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1904.09978v1-abstract-full" style="display: none;"> Medical image segmentation has become an essential technique in clinical and research-oriented applications. Because manual segmentation methods are tedious, and fully automatic segmentation lacks the flexibility of human intervention or correction, semi-automatic methods have become the preferred type of medical image segmentation. We present a hybrid, semi-automatic segmentation method in 3D that integrates both region-based and boundary-based procedures. Our method differs from previous hybrid methods in that we perform region-based and boundary-based approaches separately, which allows for more efficient segmentation. A region-based technique is used to generate an initial seed contour that roughly represents the boundary of a target brain structure, alleviating the local minima problem in the subsequent model deformation phase. The contour is deformed under a unique force equation independent of image edges. Experiments on MRI data show that this method can achieve high accuracy and efficiency primarily due to the unique seed initialization technique. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1904.09978v1-abstract-full').style.display = 'none'; document.getElementById('1904.09978v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 April, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2019. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=He%2C+Q&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=He%2C+Q&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=He%2C+Q&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository