Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 53 results for author: <span class="mathjax">Ni, Z</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/eess" aria-role="search"> Searching in archive <strong>eess</strong>. <a href="/search/?searchtype=author&query=Ni%2C+Z">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Ni, Z"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Ni%2C+Z&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Ni, Z"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Ni%2C+Z&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Ni%2C+Z&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Ni%2C+Z&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14260">arXiv:2410.14260</a> <span> [<a href="https://arxiv.org/pdf/2410.14260">pdf</a>, <a href="https://arxiv.org/format/2410.14260">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Parametric Digital Twins for Preserving Historic Buildings: A Case Study at L枚fstad Castle in 脰sterg枚tland, Sweden </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Ni%2C+Z">Zhongjun Ni</a>, <a href="/search/eess?searchtype=author&query=Hupkes%2C+J">Jelrik Hupkes</a>, <a href="/search/eess?searchtype=author&query=Eriksson%2C+P">Petra Eriksson</a>, <a href="/search/eess?searchtype=author&query=Leijonhufvud%2C+G">Gustaf Leijonhufvud</a>, <a href="/search/eess?searchtype=author&query=Karlsson%2C+M">Magnus Karlsson</a>, <a href="/search/eess?searchtype=author&query=Gong%2C+S">Shaofang Gong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14260v2-abstract-short" style="display: inline;"> This study showcases the digitalization of L枚fstad Castle in Sweden to contribute to preserving its heritage values. The castle and its collections are deteriorating due to an inappropriate indoor climate. To address this, thirteen cloud-connected sensor boxes, equipped with 84 sensors, were installed throughout the main building, from the basement to the attic, to continuously monitor various ind… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14260v2-abstract-full').style.display = 'inline'; document.getElementById('2410.14260v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14260v2-abstract-full" style="display: none;"> This study showcases the digitalization of L枚fstad Castle in Sweden to contribute to preserving its heritage values. 
The castle and its collections are deteriorating due to an inappropriate indoor climate. To address this, thirteen cloud-connected sensor boxes, equipped with 84 sensors, were installed throughout the main building, from the basement to the attic, to continuously monitor various indoor environmental parameters. The collected extensive multi-parametric data form the basis for creating a parametric digital twin of the building. The digital twin and detailed data analytics offer a deeper understanding of indoor climate and guide the adoption of appropriate heating and ventilation strategies. The results revealed the need to address high humidity problems in the basement and on the ground floor, such as installing vapor barriers. Opportunities for adopting energy-efficient heating and ventilation strategies on the upper floors were also highlighted. The digitalization solution and findings are not only applicable to L枚fstad Castle but also provide valuable guidance for the conservation of other historic buildings facing similar challenges. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14260v2-abstract-full').style.display = 'none'; document.getElementById('2410.14260v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This work has been submitted to the IEEE for possible publication</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T07 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.5.4 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08861">arXiv:2410.08861</a> <span> [<a href="https://arxiv.org/pdf/2410.08861">pdf</a>, <a href="https://arxiv.org/format/2410.08861">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> A foundation model for generalizable disease diagnosis in chest X-ray images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Xu%2C+L">Lijian Xu</a>, <a href="/search/eess?searchtype=author&query=Ni%2C+Z">Ziyu Ni</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+H">Hao Sun</a>, <a href="/search/eess?searchtype=author&query=Li%2C+H">Hongsheng Li</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+S">Shaoting Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.08861v1-abstract-short" style="display: inline;"> Medical artificial intelligence (AI) is revolutionizing the interpretation of chest X-ray (CXR) images by providing robust tools for disease diagnosis. 
However, the effectiveness of these AI models is often limited by their reliance on large amounts of task-specific labeled data and their inability to generalize across diverse clinical settings. To address these challenges, we introduce CXRBase, a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08861v1-abstract-full').style.display = 'inline'; document.getElementById('2410.08861v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.08861v1-abstract-full" style="display: none;"> Medical artificial intelligence (AI) is revolutionizing the interpretation of chest X-ray (CXR) images by providing robust tools for disease diagnosis. However, the effectiveness of these AI models is often limited by their reliance on large amounts of task-specific labeled data and their inability to generalize across diverse clinical settings. To address these challenges, we introduce CXRBase, a foundational model designed to learn versatile representations from unlabelled CXR images, facilitating efficient adaptation to various clinical tasks. CXRBase is initially trained on a substantial dataset of 1.04 million unlabelled CXR images using self-supervised learning methods. This approach allows the model to discern meaningful patterns without the need for explicit labels. After this initial phase, CXRBase is fine-tuned with labeled data to enhance its performance in disease detection, enabling accurate classification of chest diseases. CXRBase provides a generalizable solution to improve model performance and alleviate the annotation workload of experts to enable broad clinical AI applications from chest imaging. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08861v1-abstract-full').style.display = 'none'; document.getElementById('2410.08861v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.00815">arXiv:2409.00815</a> <span> [<a href="https://arxiv.org/pdf/2409.00815">pdf</a>, <a href="https://arxiv.org/format/2409.00815">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Serialized Speech Information Guidance with Overlapped Encoding Separation for Multi-Speaker Automatic Speech Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Shi%2C+H">Hao Shi</a>, <a href="/search/eess?searchtype=author&query=Gao%2C+Y">Yuan Gao</a>, <a href="/search/eess?searchtype=author&query=Ni%2C+Z">Zhaoheng Ni</a>, <a href="/search/eess?searchtype=author&query=Kawahara%2C+T">Tatsuya Kawahara</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.00815v3-abstract-short" style="display: inline;"> Serialized output training (SOT) attracts increasing attention due to its convenience and flexibility for multi-speaker automatic speech recognition (ASR). However, it is not easy to train with attention loss only. In this paper, we propose the overlapped encoding separation (EncSep) to fully utilize the benefits of the connectionist temporal classification (CTC) and attention hybrid loss. This ad… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.00815v3-abstract-full').style.display = 'inline'; document.getElementById('2409.00815v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.00815v3-abstract-full" style="display: none;"> Serialized output training (SOT) attracts increasing attention due to its convenience and flexibility for multi-speaker automatic speech recognition (ASR). However, it is not easy to train with attention loss only. In this paper, we propose the overlapped encoding separation (EncSep) to fully utilize the benefits of the connectionist temporal classification (CTC) and attention hybrid loss. This additional separator is inserted after the encoder to extract the multi-speaker information with CTC losses. Furthermore, we propose the serialized speech information guidance SOT (GEncSep) to further utilize the separated encodings. The separated streams are concatenated to provide single-speaker information to guide attention during decoding. The experimental results on LibriMix show that the single-speaker encoding can be separated from the overlapped encoding. The CTC loss helps to improve the encoder representation under complex scenarios. GEncSep further improved performance. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.00815v3-abstract-full').style.display = 'none'; document.getElementById('2409.00815v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.03446">arXiv:2408.03446</a> <span> [<a href="https://arxiv.org/pdf/2408.03446">pdf</a>, <a href="https://arxiv.org/format/2408.03446">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Optimizing NOMA Transmissions to Advance Federated Learning in Vehicular Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Chen%2C+Z">Ziru Chen</a>, <a href="/search/eess?searchtype=author&query=Ni%2C+Z">Zhou Ni</a>, <a href="/search/eess?searchtype=author&query=Guan%2C+P">Peiyuan Guan</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+L">Lu Wang</a>, <a href="/search/eess?searchtype=author&query=Cai%2C+L+X">Lin X. Cai</a>, <a href="/search/eess?searchtype=author&query=Hashemi%2C+M">Morteza Hashemi</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Z">Zongzhi Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.03446v1-abstract-short" style="display: inline;"> Diverse critical data, such as location information and driving patterns, can be collected by IoT devices in vehicular networks to improve driving experiences and road safety. However, drivers are often reluctant to share their data due to privacy concerns. The Federated Vehicular Network (FVN) is a promising technology that tackles these concerns by transmitting model parameters instead of raw da… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.03446v1-abstract-full').style.display = 'inline'; document.getElementById('2408.03446v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.03446v1-abstract-full" style="display: none;"> Diverse critical data, such as location information and driving patterns, can be collected by IoT devices in vehicular networks to improve driving experiences and road safety. However, drivers are often reluctant to share their data due to privacy concerns. The Federated Vehicular Network (FVN) is a promising technology that tackles these concerns by transmitting model parameters instead of raw data, thereby protecting the privacy of drivers. Nevertheless, the performance of Federated Learning (FL) in a vehicular network depends on the joining ratio, which is restricted by the limited available wireless resources. To address these challenges, this paper proposes to apply Non-Orthogonal Multiple Access (NOMA) to improve the joining ratio in a FVN. 
Specifically, a vehicle selection and transmission power control algorithm is developed to exploit the power domain differences in the received signal to ensure the maximum number of vehicles capable of joining the FVN. Our simulation results demonstrate that the proposed NOMA-based strategy increases the joining ratio and significantly enhances the performance of the FVN. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.03446v1-abstract-full').style.display = 'none'; document.getElementById('2408.03446v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The paper is accepted by IEEE Globecom 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.03648">arXiv:2407.03648</a> <span> [<a href="https://arxiv.org/pdf/2407.03648">pdf</a>, <a href="https://arxiv.org/format/2407.03648">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> High Fidelity Text-Guided Music Editing via Single-Stage Flow Matching </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Lan%2C+G+L">Gael Le Lan</a>, <a href="/search/eess?searchtype=author&query=Shi%2C+B">Bowen Shi</a>, <a href="/search/eess?searchtype=author&query=Ni%2C+Z">Zhaoheng Ni</a>, <a href="/search/eess?searchtype=author&query=Srinivasan%2C+S">Sidd Srinivasan</a>, <a href="/search/eess?searchtype=author&query=Kumar%2C+A">Anurag Kumar</a>, <a href="/search/eess?searchtype=author&query=Ellis%2C+B">Brian Ellis</a>, <a href="/search/eess?searchtype=author&query=Kant%2C+D">David Kant</a>, <a href="/search/eess?searchtype=author&query=Nagaraja%2C+V">Varun Nagaraja</a>, <a href="/search/eess?searchtype=author&query=Chang%2C+E">Ernie Chang</a>, <a href="/search/eess?searchtype=author&query=Hsu%2C+W">Wei-Ning Hsu</a>, <a href="/search/eess?searchtype=author&query=Shi%2C+Y">Yangyang Shi</a>, <a href="/search/eess?searchtype=author&query=Chandra%2C+V">Vikas Chandra</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.03648v2-abstract-short" style="display: inline;"> We introduce MelodyFlow, an efficient text-controllable high-fidelity music generation and editing model. It operates on continuous latent representations from a low frame rate 48 kHz stereo variational auto encoder codec. 
Based on a diffusion transformer architecture trained on a flow-matching objective the model can edit diverse high quality stereo samples of variable duration, with simple text… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.03648v2-abstract-full').style.display = 'inline'; document.getElementById('2407.03648v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.03648v2-abstract-full" style="display: none;"> We introduce MelodyFlow, an efficient text-controllable high-fidelity music generation and editing model. It operates on continuous latent representations from a low frame rate 48 kHz stereo variational auto encoder codec. Based on a diffusion transformer architecture trained on a flow-matching objective the model can edit diverse high quality stereo samples of variable duration, with simple text descriptions. We adapt the ReNoise latent inversion method to flow matching and compare it with the original implementation and naive denoising diffusion implicit model (DDIM) inversion on a variety of music editing prompts. Our results indicate that our latent inversion outperforms both ReNoise and DDIM for zero-shot test-time text-guided editing on several objective metrics. Subjective evaluations exhibit a substantial improvement over previous state of the art for music editing. Code and model weights will be publicly made available. Samples are available at https://melodyflow.github.io. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.03648v2-abstract-full').style.display = 'none'; document.getElementById('2407.03648v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
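The flow-matching objective mentioned in the MelodyFlow abstract above is, in its generic form, a regression of a learned velocity field onto straight-line paths between noise and data. The following is only a minimal sketch of that generic objective, not the authors' code; the model signature, latent shapes, and function names are assumptions.

```python
import torch

def flow_matching_loss(model, x1, cond):
    """Generic conditional flow-matching objective (illustrative only).

    x1:   batch of target latents, shape (B, C, T)
    cond: conditioning (e.g. text embeddings) passed through to the model
    """
    x0 = torch.randn_like(x1)                              # noise endpoint of the path
    t = torch.rand(x1.shape[0], 1, 1, device=x1.device)    # per-example time in (0, 1)
    xt = (1.0 - t) * x0 + t * x1                           # point on the straight-line path
    target_v = x1 - x0                                     # constant velocity of that path
    pred_v = model(xt, t.view(-1), cond)                   # assumed model signature
    return torch.mean((pred_v - target_v) ** 2)            # regress predicted onto target velocity
```

At inference time, the learned velocity field is integrated from noise toward a latent, which the codec then decodes back to audio.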
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.04660">arXiv:2406.04660</a> <span> [<a href="https://arxiv.org/pdf/2406.04660">pdf</a>, <a href="https://arxiv.org/format/2406.04660">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.21437/Interspeech.2024-1239">10.21437/Interspeech.2024-1239 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> URGENT Challenge: Universality, Robustness, and Generalizability For Speech Enhancement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+W">Wangyou Zhang</a>, <a href="/search/eess?searchtype=author&query=Scheibler%2C+R">Robin Scheibler</a>, <a href="/search/eess?searchtype=author&query=Saijo%2C+K">Kohei Saijo</a>, <a href="/search/eess?searchtype=author&query=Cornell%2C+S">Samuele Cornell</a>, <a href="/search/eess?searchtype=author&query=Li%2C+C">Chenda Li</a>, <a href="/search/eess?searchtype=author&query=Ni%2C+Z">Zhaoheng Ni</a>, <a href="/search/eess?searchtype=author&query=Kumar%2C+A">Anurag Kumar</a>, <a href="/search/eess?searchtype=author&query=Pirklbauer%2C+J">Jan Pirklbauer</a>, <a href="/search/eess?searchtype=author&query=Sach%2C+M">Marvin Sach</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a>, <a href="/search/eess?searchtype=author&query=Fingscheidt%2C+T">Tim Fingscheidt</a>, <a href="/search/eess?searchtype=author&query=Qian%2C+Y">Yanmin Qian</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.04660v1-abstract-short" style="display: inline;"> The last decade has witnessed significant advancements in deep learning-based speech enhancement (SE). However, most existing SE research has limitations on the coverage of SE sub-tasks, data diversity and amount, and evaluation metrics. To fill this gap and promote research toward universal SE, we establish a new SE challenge, named URGENT, to focus on the universality, robustness, and generaliza… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.04660v1-abstract-full').style.display = 'inline'; document.getElementById('2406.04660v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.04660v1-abstract-full" style="display: none;"> The last decade has witnessed significant advancements in deep learning-based speech enhancement (SE). However, most existing SE research has limitations on the coverage of SE sub-tasks, data diversity and amount, and evaluation metrics. To fill this gap and promote research toward universal SE, we establish a new SE challenge, named URGENT, to focus on the universality, robustness, and generalizability of SE. 
We aim to extend the SE definition to cover different sub-tasks to explore the limits of SE models, starting from denoising, dereverberation, bandwidth extension, and declipping. A novel framework is proposed to unify all these sub-tasks in a single model, allowing the use of all existing SE approaches. We collected public speech and noise data from different domains to construct diverse evaluation data. Finally, we discuss the insights gained from our preliminary baseline experiments based on both generative and discriminative SE methods with 12 curated metrics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.04660v1-abstract-full').style.display = 'none'; document.getElementById('2406.04660v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, 3 figures, 3 tables. Accepted by Interspeech 2024. An extended version of the accepted manuscript with appendix</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.02560">arXiv:2406.02560</a> <span> [<a href="https://arxiv.org/pdf/2406.02560">pdf</a>, <a href="https://arxiv.org/format/2406.02560">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Less Peaky and More Accurate CTC Forced Alignment by Label Priors </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Huang%2C+R">Ruizhe Huang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+X">Xiaohui Zhang</a>, <a href="/search/eess?searchtype=author&query=Ni%2C+Z">Zhaoheng Ni</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+L">Li Sun</a>, <a href="/search/eess?searchtype=author&query=Hira%2C+M">Moto Hira</a>, <a href="/search/eess?searchtype=author&query=Hwang%2C+J">Jeff Hwang</a>, <a href="/search/eess?searchtype=author&query=Manohar%2C+V">Vimal Manohar</a>, <a href="/search/eess?searchtype=author&query=Pratap%2C+V">Vineel Pratap</a>, <a href="/search/eess?searchtype=author&query=Wiesner%2C+M">Matthew Wiesner</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a>, <a href="/search/eess?searchtype=author&query=Povey%2C+D">Daniel Povey</a>, <a href="/search/eess?searchtype=author&query=Khudanpur%2C+S">Sanjeev Khudanpur</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.02560v3-abstract-short" style="display: inline;"> Connectionist temporal classification (CTC) models are known to have peaky output distributions. 
Such behavior is not a problem for automatic speech recognition (ASR), but it can cause inaccurate forced alignments (FA), especially at finer granularity, e.g., phoneme level. This paper aims at alleviating the peaky behavior for CTC and improve its suitability for forced alignment generation, by leve… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.02560v3-abstract-full').style.display = 'inline'; document.getElementById('2406.02560v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.02560v3-abstract-full" style="display: none;"> Connectionist temporal classification (CTC) models are known to have peaky output distributions. Such behavior is not a problem for automatic speech recognition (ASR), but it can cause inaccurate forced alignments (FA), especially at finer granularity, e.g., phoneme level. This paper aims at alleviating the peaky behavior for CTC and improve its suitability for forced alignment generation, by leveraging label priors, so that the scores of alignment paths containing fewer blanks are boosted and maximized during training. As a result, our CTC model produces less peaky posteriors and is able to more accurately predict the offset of the tokens besides their onset. It outperforms the standard CTC model and a heuristics-based approach for obtaining CTC's token offset timestamps by 12-40% in phoneme and word boundary errors (PBE and WBE) measured on the Buckeye and TIMIT data. Compared with the most widely used FA toolkit Montreal Forced Aligner (MFA), our method performs similarly on PBE/WBE on Buckeye, yet falls behind MFA on TIMIT. Nevertheless, our method has a much simpler training pipeline and better runtime efficiency. Our training recipe and pretrained model are released in TorchAudio. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.02560v3-abstract-full').style.display = 'none'; document.getElementById('2406.02560v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICASSP 2024. 
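The label-prior idea described in the abstract above amounts to discounting frequent labels (especially blank) in the frame posteriors before computing the CTC loss, so that alignment paths with fewer blanks score higher. The sketch below is a rough illustration of that idea, not the released TorchAudio recipe; the prior scale, the per-frame renormalization, and all names are assumptions.

```python
import torch
import torch.nn.functional as F

def ctc_loss_with_label_priors(log_probs, targets, input_lens, target_lens,
                               log_priors, prior_scale=0.3, blank=0):
    """Prior-discounted CTC loss (illustrative sketch only).

    log_probs:  (T, N, C) per-frame log-softmax outputs
    log_priors: (C,) log of estimated label priors (blank included)
    """
    scores = log_probs - prior_scale * log_priors       # discount frequent labels, esp. blank
    scores = torch.log_softmax(scores, dim=-1)          # renormalize per frame (one simple variant)
    return F.ctc_loss(scores, targets, input_lens, target_lens,
                      blank=blank, zero_infinity=True)
```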
8. arXiv:2405.18790 [pdf, other] cs.CV, cs.MM, eess.IV
Opinion-Unaware Blind Image Quality Assessment using Multi-Scale Deep Feature Statistics
Authors: Zhangkai Ni, Yue Liu, Keyan Ding, Wenhan Yang, Hanli Wang, Shiqi Wang
Abstract: Deep learning-based methods have significantly influenced the blind image quality assessment (BIQA) field; however, these methods often require training using large amounts of human rating data. In contrast, traditional knowledge-based methods are cost-effective for training but face challenges in effectively extracting features aligned with human visual perception. To bridge these gaps, we propose integrating deep features from pre-trained visual models with a statistical analysis model into a Multi-scale Deep Feature Statistics (MDFS) model for achieving opinion-unaware BIQA (OU-BIQA), thereby eliminating the reliance on human rating data and significantly improving training efficiency. Specifically, we extract patch-wise multi-scale features from pre-trained vision models, which are subsequently fitted into a multivariate Gaussian (MVG) model. The final quality score is determined by quantifying the distance between the MVG model derived from the test image and the benchmark MVG model derived from the high-quality image set. A comprehensive series of experiments conducted on various datasets show that our proposed model exhibits superior consistency with human visual perception compared to state-of-the-art BIQA models. Furthermore, it shows improved generalizability across diverse target-specific BIQA tasks. Our code is available at: https://github.com/eezkni/MDFS
Submitted 29 May, 2024; originally announced May 2024.
Comments: Accepted to IEEE Transactions on Multimedia 2024
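The scoring step described above, fitting a multivariate Gaussian to deep features and measuring its distance from a benchmark Gaussian, can be illustrated with the NIQE-style formulation commonly used for opinion-unaware BIQA. The exact distance used by MDFS may differ (see the linked repository), so treat this only as a sketch with assumed names.

```python
import numpy as np

def fit_mvg(features):
    """Fit a multivariate Gaussian to an (N, D) matrix of patch-wise deep features."""
    mu = features.mean(axis=0)
    cov = np.cov(features, rowvar=False)
    return mu, cov

def mvg_distance(mu1, cov1, mu2, cov2):
    """NIQE-style distance between two Gaussians; larger means further from the benchmark."""
    diff = mu1 - mu2
    pooled = (cov1 + cov2) / 2.0
    return float(np.sqrt(diff @ np.linalg.pinv(pooled) @ diff))

# Usage sketch: fit one MVG to features from a pristine image set (the benchmark)
# and one to features from the test image, then score the test image by their distance.
# mu_b, cov_b = fit_mvg(pristine_features)
# mu_t, cov_t = fit_mvg(test_image_features)
# quality_score = mvg_distance(mu_b, cov_b, mu_t, cov_t)
```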
9. arXiv:2403.08203 [pdf, other] q-bio.NC, cs.LG, eess.IV
Learnable Community-Aware Transformer for Brain Connectome Analysis with Token Clustering
Authors: Yanting Yang, Beidi Zhao, Zhuohao Ni, Yize Zhao, Xiaoxiao Li
Abstract: Neuroscientific research has revealed that the complex brain network can be organized into distinct functional communities, each characterized by a cohesive group of regions of interest (ROIs) with strong interconnections. These communities play a crucial role in comprehending the functional organization of the brain and its implications for neurological conditions, including Autism Spectrum Disorder (ASD) and biological differences, such as in gender. Traditional models have been constrained by the necessity of predefined community clusters, limiting their flexibility and adaptability in deciphering the brain's functional organization. Furthermore, these models were restricted by a fixed number of communities, hindering their ability to accurately represent the brain's dynamic nature. In this study, we present a token clustering brain transformer-based model ($\texttt{TC-BrainTF}$) for joint community clustering and classification. Our approach proposes a novel token clustering (TC) module based on the transformer architecture, which utilizes learnable prompt tokens with orthogonal loss where each ROI embedding is projected onto the prompt embedding space, effectively clustering ROIs into communities and reducing the dimensions of the node representation via merging with communities. Our results demonstrate that our learnable community-aware model $\texttt{TC-BrainTF}$ offers improved accuracy in identifying ASD and classifying genders through rigorous testing on ABIDE and HCP datasets. Additionally, the qualitative analysis on $\texttt{TC-BrainTF}$ has demonstrated the effectiveness of the designed TC module and its relevance to neuroscience interpretations.
Submitted 12 March, 2024; originally announced March 2024.

10. arXiv:2403.04326 [pdf, other] eess.SY, cs.AI, cs.LG
Edge-based Parametric Digital Twins for Intelligent Building Indoor Climate Modeling
Authors: Zhongjun Ni, Chi Zhang, Magnus Karlsson, Shaofang Gong
Abstract: Digital transformation in the built environment generates vast data for developing data-driven models to optimize building operations. This study presents an integrated solution utilizing edge computing, digital twins, and deep learning to enhance the understanding of climate in buildings. Parametric digital twins, created using an ontology, ensure consistent data representation across diverse service systems equipped by different buildings. Based on created digital twins and collected data, deep learning methods are employed to develop predictive models for identifying patterns in indoor climate and providing insights. Both the parametric digital twin and deep learning models are deployed on edge for low latency and privacy compliance. As a demonstration, a case study was conducted in a historic building in Östergötland, Sweden, to compare the performance of five deep learning architectures. The results indicate that the time-series dense encoder model exhibited strong competitiveness in performing multi-horizon forecasts of indoor temperature and relative humidity with low computational costs.
Submitted 7 March, 2024; originally announced March 2024.
Comments: 8 pages, 8 figures, accepted in the 20th IEEE International Conference on Factory Communication Systems
MSC Class: 68T07; ACM Class: I.5.4

11. arXiv:2402.18192 [pdf, other] cs.CV, eess.IV
Misalignment-Robust Frequency Distribution Loss for Image Transformation
Authors: Zhangkai Ni, Juncheng Wu, Zian Wang, Wenhan Yang, Hanli Wang, Lin Ma
Abstract: This paper aims to address a common challenge in deep learning-based image transformation methods, such as image enhancement and super-resolution, which heavily rely on precisely aligned paired datasets with pixel-level alignments. However, creating precisely aligned paired images presents significant challenges and hinders the advancement of methods trained on such data. To overcome this challenge, this paper introduces a novel and simple Frequency Distribution Loss (FDL) for computing distribution distance within the frequency domain. Specifically, we transform image features into the frequency domain using Discrete Fourier Transformation (DFT). Subsequently, frequency components (amplitude and phase) are processed separately to form the FDL loss function. Our method is empirically proven effective as a training constraint due to the thoughtful utilization of global information in the frequency domain. Extensive experimental evaluations, focusing on image enhancement and super-resolution tasks, demonstrate that FDL outperforms existing misalignment-robust loss functions. Furthermore, we explore the potential of our FDL for image style transfer that relies solely on completely misaligned data. Our code is available at: https://github.com/eezkni/FDL
Submitted 28 February, 2024; originally announced February 2024.
Comments: Accepted to Computer Vision and Pattern Recognition Conference (CVPR) 2024
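As a loose illustration of the frequency-domain comparison described above: the released FDL computes a distribution distance over frequency components, whereas the sketch below substitutes a plain L1 comparison of amplitude and phase spectra. All names are assumptions; see the linked repository for the actual loss.

```python
import torch

def freq_feature_loss(feat_x, feat_y, phase_weight=1.0):
    """Simplified stand-in for a frequency-domain feature loss (not the official FDL).

    feat_x, feat_y: (B, C, H, W) deep features of the output and the
    (possibly misaligned) reference image.
    """
    fx = torch.fft.fft2(feat_x, norm="ortho")   # 2-D DFT over the spatial dimensions
    fy = torch.fft.fft2(feat_y, norm="ortho")
    amp_term = torch.mean(torch.abs(fx.abs() - fy.abs()))                  # amplitude spectra
    phase_term = torch.mean(torch.abs(torch.angle(fx) - torch.angle(fy)))  # phase spectra
    return amp_term + phase_weight * phase_term
```

Comparing amplitude and phase in the frequency domain uses global image statistics, which is why such losses are less sensitive to small spatial misalignments than pixel-wise losses.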
However, it remains unclear how positional encoding exactly impacts speech enhancement based on Transformer architectures. In this paper, we perform a comprehensive empirical study evaluating five positional encoding methods, i.e., Sinusoidal and learned absolute position embedding (APE), T5-RPE, KERPLE, as well as the Transformer without positional encoding (No-Pos), across both causal and noncausal configurations. We conduct extensive speech enhancement experiments, involving spectral mapping and masking methods. Our findings establish that positional encoding is not quite helpful for the models in a causal configuration, which indicates that causal attention may implicitly incorporate position information. In a noncausal configuration, the models significantly benefit from the use of positional encoding. In addition, we find that among the four position embeddings, relative position embeddings outperform APEs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.09686v2-abstract-full').style.display = 'none'; document.getElementById('2401.09686v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICASSP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.08264">arXiv:2312.08264</a> <span> [<a href="https://arxiv.org/pdf/2312.08264">pdf</a>, <a href="https://arxiv.org/format/2312.08264">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Atmospheric and Oceanic Physics">physics.ao-ph</span> </div> </div> <p class="title is-5 mathjax"> Kunyu: A High-Performing Global Weather Model Beyond Regression Losses </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Ni%2C+Z">Zekun Ni</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.08264v1-abstract-short" style="display: inline;"> Over the past year, data-driven global weather forecasting has emerged as a new alternative to traditional numerical weather prediction. This innovative approach yields forecasts of comparable accuracy at a tiny fraction of computational costs. Regrettably, as far as I know, existing models exclusively rely on regression losses, producing forecasts with substantial blurring. 
Such blurring, althoug… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.08264v1-abstract-full').style.display = 'inline'; document.getElementById('2312.08264v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.08264v1-abstract-full" style="display: none;"> Over the past year, data-driven global weather forecasting has emerged as a new alternative to traditional numerical weather prediction. This innovative approach yields forecasts of comparable accuracy at a tiny fraction of computational costs. Regrettably, as far as I know, existing models exclusively rely on regression losses, producing forecasts with substantial blurring. Such blurring, although it compromises practicality, enjoys an unfair advantage on evaluation metrics. In this paper, I present Kunyu, a global data-driven weather forecasting model which delivers accurate predictions across a comprehensive array of atmospheric variables at 0.35° resolution. With both regression and adversarial losses integrated in its training framework, Kunyu generates forecasts with enhanced clarity and realism. Its performance outpaces even ECMWF HRES in some aspects such as the estimation of anomaly extremes, while remaining competitive with ECMWF HRES on evaluation metrics such as RMSE and ACC. Kunyu is an important step forward in closing the utility gap between numerical and data-driven weather prediction. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.08264v1-abstract-full').style.display = 'none'; document.getElementById('2312.08264v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. 
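<p class="is-size-7">The Kunyu abstract above describes training with both a regression loss and an adversarial loss. The following is a minimal PyTorch sketch of such a combined objective; the stand-in networks, toy tensor shapes, and the 0.1 weighting are illustrative assumptions, not Kunyu's actual configuration.</p> <pre><code>
import torch
import torch.nn as nn

# Minimal sketch of a combined regression + adversarial training objective, as the
# Kunyu abstract describes. Networks, shapes, and weighting are illustrative only.
generator = nn.Conv2d(4, 4, kernel_size=3, padding=1)                   # stand-in forecast model
discriminator = nn.Sequential(nn.Flatten(), nn.Linear(4 * 32 * 32, 1))  # stand-in critic

l1 = nn.L1Loss()
bce = nn.BCEWithLogitsLoss()
adv_weight = 0.1  # assumed trade-off coefficient

x = torch.randn(2, 4, 32, 32)  # toy "current atmospheric state"
y = torch.randn(2, 4, 32, 32)  # toy "future state" target

forecast = generator(x)
loss_reg = l1(forecast, y)             # regression term: stay close to the target fields
logits_fake = discriminator(forecast)
loss_adv = bce(logits_fake, torch.ones_like(logits_fake))  # adversarial term: look realistic (sharper)
loss_generator = loss_reg + adv_weight * loss_adv          # generator update only; critic step omitted
loss_generator.backward()
</code></pre>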
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.00897">arXiv:2311.00897</a> <span> [<a href="https://arxiv.org/pdf/2311.00897">pdf</a>, <a href="https://arxiv.org/format/2311.00897">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> On The Open Prompt Challenge In Conditional Audio Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Chang%2C+E">Ernie Chang</a>, <a href="/search/eess?searchtype=author&query=Srinivasan%2C+S">Sidd Srinivasan</a>, <a href="/search/eess?searchtype=author&query=Luthra%2C+M">Mahi Luthra</a>, <a href="/search/eess?searchtype=author&query=Lin%2C+P">Pin-Jie Lin</a>, <a href="/search/eess?searchtype=author&query=Nagaraja%2C+V">Varun Nagaraja</a>, <a href="/search/eess?searchtype=author&query=Iandola%2C+F">Forrest Iandola</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+Z">Zechun Liu</a>, <a href="/search/eess?searchtype=author&query=Ni%2C+Z">Zhaoheng Ni</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+C">Changsheng Zhao</a>, <a href="/search/eess?searchtype=author&query=Shi%2C+Y">Yangyang Shi</a>, <a href="/search/eess?searchtype=author&query=Chandra%2C+V">Vikas Chandra</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.00897v1-abstract-short" style="display: inline;"> Text-to-audio generation (TTA) produces audio from a text description, learning from pairs of audio samples and hand-annotated text. However, commercializing audio generation is challenging as user-input prompts are often under-specified when compared to text descriptions used to train TTA models. In this work, we treat TTA models as a ``blackbox'' and address the user prompt challenge with two ke… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.00897v1-abstract-full').style.display = 'inline'; document.getElementById('2311.00897v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.00897v1-abstract-full" style="display: none;"> Text-to-audio generation (TTA) produces audio from a text description, learning from pairs of audio samples and hand-annotated text. However, commercializing audio generation is challenging as user-input prompts are often under-specified when compared to text descriptions used to train TTA models. In this work, we treat TTA models as a ``blackbox'' and address the user prompt challenge with two key insights: (1) User prompts are generally under-specified, leading to a large alignment gap between user prompts and training prompts. (2) There is a distribution of audio descriptions for which TTA models are better at generating higher quality audio, which we refer to as ``audionese''. 
To this end, we rewrite prompts with instruction-tuned models and propose utilizing text-audio alignment as feedback signals via margin ranking learning for audio improvements. On both objective and subjective human evaluations, we observed marked improvements in both text-audio alignment and music audio quality. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.00897v1-abstract-full').style.display = 'none'; document.getElementById('2311.00897v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 3 figures, 4 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.17864">arXiv:2310.17864</a> <span> [<a href="https://arxiv.org/pdf/2310.17864">pdf</a>, <a href="https://arxiv.org/format/2310.17864">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> TorchAudio 2.1: Advancing speech recognition, self-supervised learning, and audio processing components for PyTorch </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Hwang%2C+J">Jeff Hwang</a>, <a href="/search/eess?searchtype=author&query=Hira%2C+M">Moto Hira</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+C">Caroline Chen</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+X">Xiaohui Zhang</a>, <a href="/search/eess?searchtype=author&query=Ni%2C+Z">Zhaoheng Ni</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+G">Guangzhi Sun</a>, <a href="/search/eess?searchtype=author&query=Ma%2C+P">Pingchuan Ma</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+R">Ruizhe Huang</a>, <a href="/search/eess?searchtype=author&query=Pratap%2C+V">Vineel Pratap</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Y">Yuekai Zhang</a>, <a href="/search/eess?searchtype=author&query=Kumar%2C+A">Anurag Kumar</a>, <a href="/search/eess?searchtype=author&query=Yu%2C+C">Chin-Yun Yu</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+C">Chuang Zhu</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+C">Chunxi Liu</a>, <a href="/search/eess?searchtype=author&query=Kahn%2C+J">Jacob Kahn</a>, <a href="/search/eess?searchtype=author&query=Ravanelli%2C+M">Mirco Ravanelli</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+P">Peng Sun</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a>, <a href="/search/eess?searchtype=author&query=Shi%2C+Y">Yangyang Shi</a>, <a href="/search/eess?searchtype=author&query=Tao%2C+Y">Yumeng Tao</a>, <a href="/search/eess?searchtype=author&query=Scheibler%2C+R">Robin Scheibler</a>, <a href="/search/eess?searchtype=author&query=Cornell%2C+S">Samuele Cornell</a>, <a href="/search/eess?searchtype=author&query=Kim%2C+S">Sean Kim</a>, <a href="/search/eess?searchtype=author&query=Petridis%2C+S">Stavros Petridis</a> </p> <p 
class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.17864v1-abstract-short" style="display: inline;"> TorchAudio is an open-source audio and speech processing library built for PyTorch. It aims to accelerate the research and development of audio and speech technologies by providing well-designed, easy-to-use, and performant PyTorch components. Its contributors routinely engage with users to understand their needs and fulfill them by developing impactful features. Here, we survey TorchAudio's devel… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.17864v1-abstract-full').style.display = 'inline'; document.getElementById('2310.17864v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.17864v1-abstract-full" style="display: none;"> TorchAudio is an open-source audio and speech processing library built for PyTorch. It aims to accelerate the research and development of audio and speech technologies by providing well-designed, easy-to-use, and performant PyTorch components. Its contributors routinely engage with users to understand their needs and fulfill them by developing impactful features. Here, we survey TorchAudio's development principles and contents and highlight key features we include in its latest version (2.1): self-supervised learning pre-trained pipelines and training recipes, high-performance CTC decoders, speech recognition models and training recipes, advanced media I/O capabilities, and tools for performing forced alignment, multi-channel speech enhancement, and reference-less speech assessment. For a selection of these features, through empirical studies, we demonstrate their efficacy and show that they achieve competitive or state-of-the-art performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.17864v1-abstract-full').style.display = 'none'; document.getElementById('2310.17864v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.01981">arXiv:2310.01981</a> <span> [<a href="https://arxiv.org/pdf/2310.01981">pdf</a>, <a href="https://arxiv.org/format/2310.01981">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.3384/9789180753050">10.3384/9789180753050 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> A Digitalization Framework for Smart Maintenance of Historic Buildings </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Ni%2C+Z">Zhongjun Ni</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.01981v1-abstract-short" style="display: inline;"> Smart maintenance of historic buildings involves integration of digital technologies and data analysis methods to help maintain functionalities of these buildings and preserve their heritage values. However, the maintenance of historic buildings is a long-term process. During the process, the digital transformation requires overcoming various challenges, such as stable and scalable storage and com… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.01981v1-abstract-full').style.display = 'inline'; document.getElementById('2310.01981v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.01981v1-abstract-full" style="display: none;"> Smart maintenance of historic buildings involves integration of digital technologies and data analysis methods to help maintain functionalities of these buildings and preserve their heritage values. However, the maintenance of historic buildings is a long-term process. During the process, the digital transformation requires overcoming various challenges, such as stable and scalable storage and computing resources, a consistent format for organizing and representing building data, and a flexible design to integrate data analytics to deliver applications. This licentiate thesis aims to address these challenges by proposing a digitalization framework that integrates Internet of Things (IoT), cloud computing, ontology, and machine learning. IoT devices enable data collection from historic buildings to reveal their latest status. Using a public cloud platform brings stable and scalable resources for storing data, performing analytics, and deploying applications. Ontologies provide a clear and concise way to organize and represent building data, which makes it easier to understand the relationships between different building components and systems. Combined with IoT devices and ontologies, parametric digital twins can be created to evolve with their physical counterparts. Furthermore, with machine learning, digital twins can identify patterns from data and provide decision-makers with insights to achieve smart maintenance. 
Overall, this thesis contributes to the field of preservation of historic buildings by proposing a comprehensive digitalization framework that integrates various advanced digital technologies to provide a holistic approach to achieve smart maintenance of historic buildings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.01981v1-abstract-full').style.display = 'none'; document.getElementById('2310.01981v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Licentiate Thesis</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.10795">arXiv:2309.10795</a> <span> [<a href="https://arxiv.org/pdf/2309.10795">pdf</a>, <a href="https://arxiv.org/format/2309.10795">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Exploring Speech Enhancement for Low-resource Speech Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Ni%2C+Z">Zhaoheng Ni</a>, <a href="/search/eess?searchtype=author&query=Popuri%2C+S">Sravya Popuri</a>, <a href="/search/eess?searchtype=author&query=Dong%2C+N">Ning Dong</a>, <a href="/search/eess?searchtype=author&query=Saijo%2C+K">Kohei Saijo</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+X">Xiaohui Zhang</a>, <a href="/search/eess?searchtype=author&query=Lan%2C+G+L">Gael Le Lan</a>, <a href="/search/eess?searchtype=author&query=Shi%2C+Y">Yangyang Shi</a>, <a href="/search/eess?searchtype=author&query=Chandra%2C+V">Vikas Chandra</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+C">Changhan Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.10795v1-abstract-short" style="display: inline;"> High-quality and intelligible speech is essential to text-to-speech (TTS) model training, however, obtaining high-quality data for low-resource languages is challenging and expensive. Applying speech enhancement on Automatic Speech Recognition (ASR) corpus mitigates the issue by augmenting the training data, while how the nonlinear speech distortion brought by speech enhancement models affects TTS… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.10795v1-abstract-full').style.display = 'inline'; document.getElementById('2309.10795v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.10795v1-abstract-full" style="display: none;"> High-quality and intelligible speech is essential to text-to-speech (TTS) model training, however, obtaining high-quality data for low-resource languages is challenging and expensive. 
Applying speech enhancement on Automatic Speech Recognition (ASR) corpus mitigates the issue by augmenting the training data, while how the nonlinear speech distortion brought by speech enhancement models affects TTS training still needs to be investigated. In this paper, we train a TF-GridNet speech enhancement model and apply it to low-resource datasets that were collected for the ASR task, then train a discrete unit based TTS model on the enhanced speech. We use Arabic datasets as an example and show that the proposed pipeline significantly improves the low-resource TTS system compared with other baseline methods in terms of ASR WER metric. We also run empirical analysis on the correlation between speech enhancement and TTS performances. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.10795v1-abstract-full').style.display = 'none'; document.getElementById('2309.10795v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to ICASSP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.10537">arXiv:2309.10537</a> <span> [<a href="https://arxiv.org/pdf/2309.10537">pdf</a>, <a href="https://arxiv.org/format/2309.10537">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> FoleyGen: Visually-Guided Audio Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Mei%2C+X">Xinhao Mei</a>, <a href="/search/eess?searchtype=author&query=Nagaraja%2C+V">Varun Nagaraja</a>, <a href="/search/eess?searchtype=author&query=Lan%2C+G+L">Gael Le Lan</a>, <a href="/search/eess?searchtype=author&query=Ni%2C+Z">Zhaoheng Ni</a>, <a href="/search/eess?searchtype=author&query=Chang%2C+E">Ernie Chang</a>, <a href="/search/eess?searchtype=author&query=Shi%2C+Y">Yangyang Shi</a>, <a href="/search/eess?searchtype=author&query=Chandra%2C+V">Vikas Chandra</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.10537v1-abstract-short" style="display: inline;"> Recent advancements in audio generation have been spurred by the evolution of large-scale deep learning models and expansive datasets. However, the task of video-to-audio (V2A) generation continues to be a challenge, principally because of the intricate relationship between the high-dimensional visual and auditory data, and the challenges associated with temporal synchronization. 
In this study, we… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.10537v1-abstract-full').style.display = 'inline'; document.getElementById('2309.10537v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.10537v1-abstract-full" style="display: none;"> Recent advancements in audio generation have been spurred by the evolution of large-scale deep learning models and expansive datasets. However, the task of video-to-audio (V2A) generation continues to be a challenge, principally because of the intricate relationship between the high-dimensional visual and auditory data, and the challenges associated with temporal synchronization. In this study, we introduce FoleyGen, an open-domain V2A generation system built on a language modeling paradigm. FoleyGen leverages an off-the-shelf neural audio codec for bidirectional conversion between waveforms and discrete tokens. The generation of audio tokens is facilitated by a single Transformer model, which is conditioned on visual features extracted from a visual encoder. A prevalent problem in V2A generation is the misalignment of generated audio with the visible actions in the video. To address this, we explore three novel visual attention mechanisms. We further undertake an exhaustive evaluation of multiple visual encoders, each pretrained on either single-modal or multi-modal tasks. The experimental results on VGGSound dataset show that our proposed FoleyGen outperforms previous systems across all objective metrics and human evaluations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.10537v1-abstract-full').style.display = 'none'; document.getElementById('2309.10537v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. 
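<p class="is-size-7">The FoleyGen abstract above describes a single Transformer that predicts discrete audio-codec tokens conditioned on features from a visual encoder. A small sketch of that general pattern with stock PyTorch modules follows; the dimensions, cross-attention conditioning, and toy inputs are illustrative assumptions rather than FoleyGen's actual architecture.</p> <pre><code>
import torch
import torch.nn as nn

# Sketch of the pattern the FoleyGen abstract describes: a Transformer decoder predicts
# discrete audio-codec tokens while attending to visual features. All sizes are assumed.
vocab_size, d_model, n_video_frames, n_audio_tokens = 1024, 256, 8, 50

token_embedding = nn.Embedding(vocab_size, d_model)
decoder = nn.TransformerDecoder(
    nn.TransformerDecoderLayer(d_model=d_model, nhead=4, batch_first=True),
    num_layers=2,
)
lm_head = nn.Linear(d_model, vocab_size)

visual_features = torch.randn(1, n_video_frames, d_model)         # from a visual encoder
audio_tokens = torch.randint(0, vocab_size, (1, n_audio_tokens))   # codec tokens generated so far

causal_mask = nn.Transformer.generate_square_subsequent_mask(n_audio_tokens)
hidden = decoder(
    tgt=token_embedding(audio_tokens),
    memory=visual_features,   # cross-attention to the video conditioning
    tgt_mask=causal_mask,
)
next_token_logits = lm_head(hidden[:, -1])  # distribution over the next audio token
print(next_token_logits.shape)              # torch.Size([1, 1024])
</code></pre>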
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.08804">arXiv:2309.08804</a> <span> [<a href="https://arxiv.org/pdf/2309.08804">pdf</a>, <a href="https://arxiv.org/format/2309.08804">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Stack-and-Delay: a new codebook pattern for music generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Lan%2C+G+L">Gael Le Lan</a>, <a href="/search/eess?searchtype=author&query=Nagaraja%2C+V">Varun Nagaraja</a>, <a href="/search/eess?searchtype=author&query=Chang%2C+E">Ernie Chang</a>, <a href="/search/eess?searchtype=author&query=Kant%2C+D">David Kant</a>, <a href="/search/eess?searchtype=author&query=Ni%2C+Z">Zhaoheng Ni</a>, <a href="/search/eess?searchtype=author&query=Shi%2C+Y">Yangyang Shi</a>, <a href="/search/eess?searchtype=author&query=Iandola%2C+F">Forrest Iandola</a>, <a href="/search/eess?searchtype=author&query=Chandra%2C+V">Vikas Chandra</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.08804v1-abstract-short" style="display: inline;"> In language modeling based music generation, a generated waveform is represented by a sequence of hierarchical token stacks that can be decoded either in an auto-regressive manner or in parallel, depending on the codebook patterns. In particular, flattening the codebooks represents the highest quality decoding strategy, while being notoriously slow. To this end, we propose a novel stack-and-delay… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.08804v1-abstract-full').style.display = 'inline'; document.getElementById('2309.08804v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.08804v1-abstract-full" style="display: none;"> In language modeling based music generation, a generated waveform is represented by a sequence of hierarchical token stacks that can be decoded either in an auto-regressive manner or in parallel, depending on the codebook patterns. In particular, flattening the codebooks represents the highest quality decoding strategy, while being notoriously slow. To this end, we propose a novel stack-and-delay style of decoding strategy to improve upon the flat pattern decoding where generation speed is four times faster as opposed to vanilla flat decoding. This brings the inference time close to that of the delay decoding strategy, and allows for faster inference on GPU for small batch sizes. For the same inference efficiency budget as the delay pattern, we show that the proposed approach performs better in objective evaluations, almost closing the gap with the flat pattern in terms of quality. The results are corroborated by subjective evaluations which show that samples generated by the new model are slightly more often preferred to samples generated by the competing model given the same text prompts. 
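<p class="is-size-7">The stack-and-delay abstract above contrasts the slow "flat" codebook pattern with the faster "delay" pattern. The short sketch below illustrates why: with K codebooks and T frames, flat decoding takes K x T autoregressive steps while the delay pattern takes roughly T + K - 1. Only these two baseline schedules are shown; the proposed stack-and-delay pattern itself is not reproduced here.</p> <pre><code>
# Step-count comparison of the two baseline codebook patterns the abstract contrasts.
# Entries are (frame, codebook) pairs emitted at each autoregressive step; toy sizes.
K, T = 4, 6  # codebooks per frame, number of frames

# Flat pattern: every codebook of every frame is its own step, K * T steps in total.
flat_steps = [[(t, k)] for t in range(T) for k in range(K)]

# Delay pattern: codebook k of frame t is emitted at step t + k, so one token per
# codebook comes out in parallel each step, roughly T + K - 1 steps in total.
delay_steps = [
    [(s - k, k) for k in range(K) if T > s - k >= 0]
    for s in range(T + K - 1)
]

print("flat decoding steps: ", len(flat_steps))   # 24
print("delay decoding steps:", len(delay_steps))  # 9
</code></pre>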
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.08804v1-abstract-full').style.display = 'none'; document.getElementById('2309.08804v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.08773">arXiv:2309.08773</a> <span> [<a href="https://arxiv.org/pdf/2309.08773">pdf</a>, <a href="https://arxiv.org/format/2309.08773">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Enhance audio generation controllability through representation similarity regularization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Shi%2C+Y">Yangyang Shi</a>, <a href="/search/eess?searchtype=author&query=Lan%2C+G+L">Gael Le Lan</a>, <a href="/search/eess?searchtype=author&query=Nagaraja%2C+V">Varun Nagaraja</a>, <a href="/search/eess?searchtype=author&query=Ni%2C+Z">Zhaoheng Ni</a>, <a href="/search/eess?searchtype=author&query=Mei%2C+X">Xinhao Mei</a>, <a href="/search/eess?searchtype=author&query=Chang%2C+E">Ernie Chang</a>, <a href="/search/eess?searchtype=author&query=Iandola%2C+F">Forrest Iandola</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+Y">Yang Liu</a>, <a href="/search/eess?searchtype=author&query=Chandra%2C+V">Vikas Chandra</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.08773v1-abstract-short" style="display: inline;"> This paper presents an innovative approach to enhance control over audio generation by emphasizing the alignment between audio and text representations during model training. In the context of language model-based audio generation, the model leverages input from both textual and audio token representations to predict subsequent audio tokens. However, the current configuration lacks explicit regula… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.08773v1-abstract-full').style.display = 'inline'; document.getElementById('2309.08773v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.08773v1-abstract-full" style="display: none;"> This paper presents an innovative approach to enhance control over audio generation by emphasizing the alignment between audio and text representations during model training. In the context of language model-based audio generation, the model leverages input from both textual and audio token representations to predict subsequent audio tokens. 
However, the current configuration lacks explicit regularization to ensure the alignment between the chosen text representation and the language model's predictions. Our proposal involves the incorporation of audio and text representation regularization, particularly during the classifier-free guidance (CFG) phase, where the text condition is excluded from cross attention during language model training. The aim of this proposed representation regularization is to minimize discrepancies in audio and text similarity compared to other samples within the same training batch. Experimental results on both music and audio generation tasks demonstrate that our proposed methods lead to improvements in objective metrics for both audio and music generation, as well as an enhancement in the human perception for audio generation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.08773v1-abstract-full').style.display = 'none'; document.getElementById('2309.08773v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.07988">arXiv:2309.07988</a> <span> [<a href="https://arxiv.org/pdf/2309.07988">pdf</a>, <a href="https://arxiv.org/format/2309.07988">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Folding Attention: Memory and Power Optimization for On-Device Transformer-based Streaming Speech Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yang Li</a>, <a href="/search/eess?searchtype=author&query=Lai%2C+L">Liangzhen Lai</a>, <a href="/search/eess?searchtype=author&query=Shangguan%2C+Y">Yuan Shangguan</a>, <a href="/search/eess?searchtype=author&query=Iandola%2C+F+N">Forrest N. Iandola</a>, <a href="/search/eess?searchtype=author&query=Ni%2C+Z">Zhaoheng Ni</a>, <a href="/search/eess?searchtype=author&query=Chang%2C+E">Ernie Chang</a>, <a href="/search/eess?searchtype=author&query=Shi%2C+Y">Yangyang Shi</a>, <a href="/search/eess?searchtype=author&query=Chandra%2C+V">Vikas Chandra</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.07988v3-abstract-short" style="display: inline;"> Transformer-based models excel in speech recognition. Existing efforts to optimize Transformer inference, typically for long-context applications, center on simplifying attention score calculations. 
However, streaming speech recognition models usually process a limited number of tokens each time, making attention score calculation less of a bottleneck. Instead, the bottleneck lies in the linear pr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.07988v3-abstract-full').style.display = 'inline'; document.getElementById('2309.07988v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.07988v3-abstract-full" style="display: none;"> Transformer-based models excel in speech recognition. Existing efforts to optimize Transformer inference, typically for long-context applications, center on simplifying attention score calculations. However, streaming speech recognition models usually process a limited number of tokens each time, making attention score calculation less of a bottleneck. Instead, the bottleneck lies in the linear projection layers of multi-head attention and feedforward networks, constituting a substantial portion of the model size and contributing significantly to computation, memory, and power usage. To address this bottleneck, we propose folding attention, a technique targeting these linear layers, significantly reducing model size and improving memory and power efficiency. Experiments on on-device Transformer-based streaming speech recognition models show that folding attention reduces model size (and corresponding memory consumption) by up to 24% and power consumption by up to 23%, all without compromising model accuracy or computation overhead. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.07988v3-abstract-full').style.display = 'none'; document.getElementById('2309.07988v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. 
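<p class="is-size-7">The folding-attention abstract above argues that, for streaming models processing short token chunks, the linear projections of multi-head attention dominate size and compute rather than the attention-score calculation. The following back-of-the-envelope sketch illustrates that claim under assumed dimensions; the folding technique itself is not implemented here.</p> <pre><code>
# Back-of-the-envelope check of the claim in the abstract: with short streaming chunks,
# the Q/K/V/output projections of multi-head attention dominate, not the attention scores.
# The dimensions below are assumed for illustration only.
d_model = 512    # model width
chunk_len = 32   # tokens processed per streaming step

projection_params = 4 * d_model * d_model             # four d_model x d_model matrices (no biases)
projection_macs = 4 * chunk_len * d_model * d_model   # applying those projections to the chunk
attention_macs = 2 * chunk_len * chunk_len * d_model  # QK^T scores plus the weighted sum of V

print(f"projection parameters:      {projection_params:,}")
print(f"projection MACs per chunk:  {projection_macs:,}")
print(f"attention MACs per chunk:   {attention_macs:,}")
print(f"projection/attention ratio: {projection_macs / attention_macs:.0f}x")  # 2*d_model/chunk_len = 32x
</code></pre>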
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.06672">arXiv:2306.06672</a> <span> [<a href="https://arxiv.org/pdf/2306.06672">pdf</a>, <a href="https://arxiv.org/format/2306.06672">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Reducing Barriers to Self-Supervised Learning: HuBERT Pre-training with Academic Compute </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Chen%2C+W">William Chen</a>, <a href="/search/eess?searchtype=author&query=Chang%2C+X">Xuankai Chang</a>, <a href="/search/eess?searchtype=author&query=Peng%2C+Y">Yifan Peng</a>, <a href="/search/eess?searchtype=author&query=Ni%2C+Z">Zhaoheng Ni</a>, <a href="/search/eess?searchtype=author&query=Maiti%2C+S">Soumi Maiti</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.06672v1-abstract-short" style="display: inline;"> Self-supervised learning (SSL) has led to great strides in speech processing. However, the resources needed to train these models has become prohibitively large as they continue to scale. Currently, only a few groups with substantial resources are capable of creating SSL models, which harms reproducibility. In this work, we optimize HuBERT SSL to fit in academic constraints. We reproduce HuBERT in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.06672v1-abstract-full').style.display = 'inline'; document.getElementById('2306.06672v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.06672v1-abstract-full" style="display: none;"> Self-supervised learning (SSL) has led to great strides in speech processing. However, the resources needed to train these models has become prohibitively large as they continue to scale. Currently, only a few groups with substantial resources are capable of creating SSL models, which harms reproducibility. In this work, we optimize HuBERT SSL to fit in academic constraints. We reproduce HuBERT independently from the original implementation, with no performance loss. Our code and training optimizations make SSL feasible with only 8 GPUs, instead of the 32 used in the original work. We also explore a semi-supervised route, using an ASR model to skip the first pre-training iteration. Within one iteration of pre-training, our models improve over HuBERT on several tasks. Furthermore, our HuBERT Large variant requires only 8 GPUs, achieving similar performance to the original trained on 128. As our contribution to the community, all models, configurations, and code are made open-source in ESPnet. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.06672v1-abstract-full').style.display = 'none'; document.getElementById('2306.06672v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at INTERSPEECH 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.19972">arXiv:2305.19972</a> <span> [<a href="https://arxiv.org/pdf/2305.19972">pdf</a>, <a href="https://arxiv.org/format/2305.19972">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> VILAS: Exploring the Effects of Vision and Language Context in Automatic Speech Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Ni%2C+Z">Ziyi Ni</a>, <a href="/search/eess?searchtype=author&query=Han%2C+M">Minglun Han</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+F">Feilong Chen</a>, <a href="/search/eess?searchtype=author&query=Meng%2C+L">Linghui Meng</a>, <a href="/search/eess?searchtype=author&query=Shi%2C+J">Jing Shi</a>, <a href="/search/eess?searchtype=author&query=Lv%2C+P">Pin Lv</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+B">Bo Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.19972v2-abstract-short" style="display: inline;"> Enhancing automatic speech recognition (ASR) performance by leveraging additional multimodal information has shown promising results in previous studies. However, most of these works have primarily focused on utilizing visual cues derived from human lip motions. In fact, context-dependent visual and linguistic cues can also benefit in many scenarios. In this paper, we first propose ViLaS (Vision a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.19972v2-abstract-full').style.display = 'inline'; document.getElementById('2305.19972v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.19972v2-abstract-full" style="display: none;"> Enhancing automatic speech recognition (ASR) performance by leveraging additional multimodal information has shown promising results in previous studies. However, most of these works have primarily focused on utilizing visual cues derived from human lip motions. In fact, context-dependent visual and linguistic cues can also benefit in many scenarios. 
In this paper, we first propose ViLaS (Vision and Language into Automatic Speech Recognition), a novel multimodal ASR model based on the continuous integrate-and-fire (CIF) mechanism, which can integrate visual and textual context simultaneously or separately, to facilitate speech recognition. Next, we introduce an effective training strategy that improves performance in modal-incomplete test scenarios. Then, to explore the effects of integrating vision and language, we create VSDial, a multimodal ASR dataset with multimodal context cues in both Chinese and English versions. Finally, empirical results are reported on the public Flickr8K and self-constructed VSDial datasets. We explore various cross-modal fusion schemes, analyze fine-grained crossmodal alignment on VSDial, and provide insights into the effects of integrating multimodal information on speech recognition. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.19972v2-abstract-full').style.display = 'none'; document.getElementById('2305.19972v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ICASSP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.13516">arXiv:2305.13516</a> <span> [<a href="https://arxiv.org/pdf/2305.13516">pdf</a>, <a href="https://arxiv.org/format/2305.13516">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Scaling Speech Technology to 1,000+ Languages </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Pratap%2C+V">Vineel Pratap</a>, <a href="/search/eess?searchtype=author&query=Tjandra%2C+A">Andros Tjandra</a>, <a href="/search/eess?searchtype=author&query=Shi%2C+B">Bowen Shi</a>, <a href="/search/eess?searchtype=author&query=Tomasello%2C+P">Paden Tomasello</a>, <a href="/search/eess?searchtype=author&query=Babu%2C+A">Arun Babu</a>, <a href="/search/eess?searchtype=author&query=Kundu%2C+S">Sayani Kundu</a>, <a href="/search/eess?searchtype=author&query=Elkahky%2C+A">Ali Elkahky</a>, <a href="/search/eess?searchtype=author&query=Ni%2C+Z">Zhaoheng Ni</a>, <a href="/search/eess?searchtype=author&query=Vyas%2C+A">Apoorv Vyas</a>, <a href="/search/eess?searchtype=author&query=Fazel-Zarandi%2C+M">Maryam Fazel-Zarandi</a>, <a href="/search/eess?searchtype=author&query=Baevski%2C+A">Alexei Baevski</a>, <a href="/search/eess?searchtype=author&query=Adi%2C+Y">Yossi Adi</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+X">Xiaohui Zhang</a>, <a href="/search/eess?searchtype=author&query=Hsu%2C+W">Wei-Ning Hsu</a>, <a 
href="/search/eess?searchtype=author&query=Conneau%2C+A">Alexis Conneau</a>, <a href="/search/eess?searchtype=author&query=Auli%2C+M">Michael Auli</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.13516v1-abstract-short" style="display: inline;"> Expanding the language coverage of speech technology has the potential to improve access to information for many more people. However, current speech technology is restricted to about one hundred languages which is a small fraction of the over 7,000 languages spoken around the world. The Massively Multilingual Speech (MMS) project increases the number of supported languages by 10-40x, depending on… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.13516v1-abstract-full').style.display = 'inline'; document.getElementById('2305.13516v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.13516v1-abstract-full" style="display: none;"> Expanding the language coverage of speech technology has the potential to improve access to information for many more people. However, current speech technology is restricted to about one hundred languages which is a small fraction of the over 7,000 languages spoken around the world. The Massively Multilingual Speech (MMS) project increases the number of supported languages by 10-40x, depending on the task. The main ingredients are a new dataset based on readings of publicly available religious texts and effectively leveraging self-supervised learning. We built pre-trained wav2vec 2.0 models covering 1,406 languages, a single multilingual automatic speech recognition model for 1,107 languages, speech synthesis models for the same number of languages, as well as a language identification model for 4,017 languages. Experiments show that our multilingual speech recognition model more than halves the word error rate of Whisper on 54 languages of the FLEURS benchmark while being trained on a small fraction of the labeled data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.13516v1-abstract-full').style.display = 'none'; document.getElementById('2305.13516v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.08541">arXiv:2305.08541</a> <span> [<a href="https://arxiv.org/pdf/2305.08541">pdf</a>, <a href="https://arxiv.org/format/2305.08541">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Ripple sparse self-attention for monaural speech enhancement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qiquan Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+H">Hongxu Zhu</a>, <a href="/search/eess?searchtype=author&query=Song%2C+Q">Qi Song</a>, <a href="/search/eess?searchtype=author&query=Qian%2C+X">Xinyuan Qian</a>, <a href="/search/eess?searchtype=author&query=Ni%2C+Z">Zhaoheng Ni</a>, <a href="/search/eess?searchtype=author&query=Li%2C+H">Haizhou Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.08541v1-abstract-short" style="display: inline;"> The use of Transformer represents a recent success in speech enhancement. However, as its core component, self-attention suffers from quadratic complexity, which is computationally prohibited for long speech recordings. Moreover, it allows each time frame to attend to all time frames, neglecting the strong local correlations of speech signals. This study presents a simple yet effective sparse self… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.08541v1-abstract-full').style.display = 'inline'; document.getElementById('2305.08541v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.08541v1-abstract-full" style="display: none;"> The use of Transformer represents a recent success in speech enhancement. However, as its core component, self-attention suffers from quadratic complexity, which is computationally prohibited for long speech recordings. Moreover, it allows each time frame to attend to all time frames, neglecting the strong local correlations of speech signals. This study presents a simple yet effective sparse self-attention for speech enhancement, called ripple attention, which simultaneously performs fine- and coarse-grained modeling for local and global dependencies, respectively. Specifically, we employ local band attention to enable each frame to attend to its closest neighbor frames in a window at fine granularity, while employing dilated attention outside the window to model the global dependencies at a coarse granularity. We evaluate the efficacy of our ripple attention for speech enhancement on two commonly used training objectives. Extensive experimental results consistently confirm the superior performance of the ripple attention design over standard full self-attention, blockwise attention, and dual-path attention (Sep-Former) in terms of speech quality and intelligibility. 
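<p class="is-size-7">The ripple-attention abstract above combines local band attention over a window of nearest frames with dilated attention outside the window. A small sketch that builds such a sparse attention mask follows; the window size and dilation rate are illustrative assumptions, not the paper's settings.</p> <pre><code>
import torch

def ripple_style_mask(num_frames, window=2, dilation=3):
    """Boolean mask (True = may attend) combining a local band of nearest frames with
    dilated links outside the band, mirroring the fine/coarse split the abstract
    describes. Window size and dilation rate are illustrative assumptions."""
    idx = torch.arange(num_frames)
    offset = idx[None, :] - idx[:, None]       # signed frame distance
    local = offset.abs().le(window)            # fine-grained: closest neighbours
    dilated = offset.abs().gt(window) & (offset % dilation == 0)  # coarse-grained: strided
    return local | dilated

mask = ripple_style_mask(num_frames=12)
scores = torch.randn(12, 12)
scores = scores.masked_fill(~mask, float("-inf"))  # disallowed positions removed before softmax
weights = scores.softmax(dim=-1)
print(mask.int())
</code></pre>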
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.08541v1-abstract-full').style.display = 'none'; document.getElementById('2305.08541v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, ICASSP 2023 published</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.04498">arXiv:2305.04498</a> <span> [<a href="https://arxiv.org/pdf/2305.04498">pdf</a>, <a href="https://arxiv.org/format/2305.04498">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/IESES53571.2023.10253721">10.1109/IESES53571.2023.10253721 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Leveraging Deep Learning and Digital Twins to Improve Energy Performance of Buildings </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Ni%2C+Z">Zhongjun Ni</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+C">Chi Zhang</a>, <a href="/search/eess?searchtype=author&query=Karlsson%2C+M">Magnus Karlsson</a>, <a href="/search/eess?searchtype=author&query=Gong%2C+S">Shaofang Gong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.04498v3-abstract-short" style="display: inline;"> Digital transformation in buildings accumulates massive operational data, which calls for smart solutions to utilize these data to improve energy performance. This study has proposed a solution, namely Deep Energy Twin, for integrating deep learning and digital twins to better understand building energy use and identify the potential for improving energy efficiency. Ontology was adopted to create… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.04498v3-abstract-full').style.display = 'inline'; document.getElementById('2305.04498v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.04498v3-abstract-full" style="display: none;"> Digital transformation in buildings accumulates massive operational data, which calls for smart solutions to utilize these data to improve energy performance. This study has proposed a solution, namely Deep Energy Twin, for integrating deep learning and digital twins to better understand building energy use and identify the potential for improving energy efficiency. Ontology was adopted to create parametric digital twins to provide consistency of data format across different systems in a building. 
Based on created digital twins and collected data, deep learning methods were used for performing data analytics to identify patterns and provide insights for energy optimization. As a demonstration, a case study was conducted in a public historic building in Norrköping, Sweden, to compare the performance of state-of-the-art deep learning architectures in building energy forecasting. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.04498v3-abstract-full').style.display = 'none'; document.getElementById('2305.04498v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, 5 figures, accepted in the 3rd IEEE International Conference on Industrial Electronics for Sustainable Energy Systems</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T07 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.5.4 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2304.04596">arXiv:2304.04596</a> <span> [<a href="https://arxiv.org/pdf/2304.04596">pdf</a>, <a href="https://arxiv.org/format/2304.04596">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> ESPnet-ST-v2: Multipurpose Spoken Language Translation Toolkit </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Yan%2C+B">Brian Yan</a>, <a href="/search/eess?searchtype=author&query=Shi%2C+J">Jiatong Shi</a>, <a href="/search/eess?searchtype=author&query=Tang%2C+Y">Yun Tang</a>, <a href="/search/eess?searchtype=author&query=Inaguma%2C+H">Hirofumi Inaguma</a>, <a href="/search/eess?searchtype=author&query=Peng%2C+Y">Yifan Peng</a>, <a href="/search/eess?searchtype=author&query=Dalmia%2C+S">Siddharth Dalmia</a>, <a href="/search/eess?searchtype=author&query=Pol%C3%A1k%2C+P">Peter Polák</a>, <a href="/search/eess?searchtype=author&query=Fernandes%2C+P">Patrick Fernandes</a>, <a href="/search/eess?searchtype=author&query=Berrebbi%2C+D">Dan Berrebbi</a>, <a href="/search/eess?searchtype=author&query=Hayashi%2C+T">Tomoki Hayashi</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+X">Xiaohui Zhang</a>, <a href="/search/eess?searchtype=author&query=Ni%2C+Z">Zhaoheng Ni</a>, <a href="/search/eess?searchtype=author&query=Hira%2C+M">Moto Hira</a>, <a href="/search/eess?searchtype=author&query=Maiti%2C+S">Soumi Maiti</a>, <a href="/search/eess?searchtype=author&query=Pino%2C+J">Juan Pino</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis 
has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2304.04596v3-abstract-short" style="display: inline;"> ESPnet-ST-v2 is a revamp of the open-source ESPnet-ST toolkit necessitated by the broadening interests of the spoken language translation community. ESPnet-ST-v2 supports 1) offline speech-to-text translation (ST), 2) simultaneous speech-to-text translation (SST), and 3) offline speech-to-speech translation (S2ST) -- each task is supported with a wide variety of approaches, differentiating ESPnet-… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.04596v3-abstract-full').style.display = 'inline'; document.getElementById('2304.04596v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2304.04596v3-abstract-full" style="display: none;"> ESPnet-ST-v2 is a revamp of the open-source ESPnet-ST toolkit necessitated by the broadening interests of the spoken language translation community. ESPnet-ST-v2 supports 1) offline speech-to-text translation (ST), 2) simultaneous speech-to-text translation (SST), and 3) offline speech-to-speech translation (S2ST) -- each task is supported with a wide variety of approaches, differentiating ESPnet-ST-v2 from other open source spoken language translation toolkits. This toolkit offers state-of-the-art architectures such as transducers, hybrid CTC/attention, multi-decoders with searchable intermediates, time-synchronous blockwise CTC/attention, Translatotron models, and direct discrete unit models. In this paper, we describe the overall design, example models for each task, and performance benchmarking behind ESPnet-ST-v2, which is publicly available at https://github.com/espnet/espnet. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.04596v3-abstract-full').style.display = 'none'; document.getElementById('2304.04596v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2023. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ACL 2023; System Demonstration</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2304.01448">arXiv:2304.01448</a> <span> [<a href="https://arxiv.org/pdf/2304.01448">pdf</a>, <a href="https://arxiv.org/format/2304.01448">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> TorchAudio-Squim: Reference-less Speech Quality and Intelligibility measures in TorchAudio </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Kumar%2C+A">Anurag Kumar</a>, <a href="/search/eess?searchtype=author&query=Tan%2C+K">Ke Tan</a>, <a href="/search/eess?searchtype=author&query=Ni%2C+Z">Zhaoheng Ni</a>, <a href="/search/eess?searchtype=author&query=Manocha%2C+P">Pranay Manocha</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+X">Xiaohui Zhang</a>, <a href="/search/eess?searchtype=author&query=Henderson%2C+E">Ethan Henderson</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+B">Buye Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2304.01448v1-abstract-short" style="display: inline;"> Measuring quality and intelligibility of a speech signal is usually a critical step in development of speech processing systems. To enable this, a variety of metrics to measure quality and intelligibility under different assumptions have been developed. Through this paper, we introduce tools and a set of models to estimate such known metrics using deep neural networks. These models are made availa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.01448v1-abstract-full').style.display = 'inline'; document.getElementById('2304.01448v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2304.01448v1-abstract-full" style="display: none;"> Measuring quality and intelligibility of a speech signal is usually a critical step in development of speech processing systems. To enable this, a variety of metrics to measure quality and intelligibility under different assumptions have been developed. Through this paper, we introduce tools and a set of models to estimate such known metrics using deep neural networks. These models are made available in the well-established TorchAudio library, the core audio and speech processing library within the PyTorch deep learning framework. We refer to it as TorchAudio-Squim, TorchAudio-Speech QUality and Intelligibility Measures. More specifically, in the current version of TorchAudio-squim, we establish and release models for estimating PESQ, STOI and SI-SDR among objective metrics and MOS among subjective metrics. We develop a novel approach for objective metric estimation and use a recently developed approach for subjective metric estimation. These models operate in a ``reference-less" manner, that is they do not require the corresponding clean speech as reference for speech assessment. 
Given the unavailability of clean speech and the effortful process of subjective evaluation in real-world situations, such easy-to-use tools would greatly benefit speech processing research and development. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.01448v1-abstract-full').style.display = 'none'; document.getElementById('2304.01448v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICASSP 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.03250">arXiv:2211.03250</a> <span> [<a href="https://arxiv.org/pdf/2211.03250">pdf</a>, <a href="https://arxiv.org/format/2211.03250">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Uplink Sensing Using CSI Ratio in Perceptive Mobile Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Ni%2C+Z">Zhitong Ni</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+J+A">J. Andrew Zhang</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+K">Kai Wu</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+R+P">Ren Ping Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.03250v1-abstract-short" style="display: inline;"> Uplink sensing in perceptive mobile networks (PMNs), which uses uplink communication signals for sensing the environment around a base station, faces challenging issues of clock asynchronism and the requirement of a line-of-sight (LOS) path between transmitters and receivers. The channel state information (CSI) ratio has been applied to resolve these issues, however, current research on the CSI ra… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.03250v1-abstract-full').style.display = 'inline'; document.getElementById('2211.03250v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.03250v1-abstract-full" style="display: none;"> Uplink sensing in perceptive mobile networks (PMNs), which uses uplink communication signals for sensing the environment around a base station, faces challenging issues of clock asynchronism and the requirement of a line-of-sight (LOS) path between transmitters and receivers. The channel state information (CSI) ratio has been applied to resolve these issues, however, current research on the CSI ratio is limited to Doppler estimation in a single dynamic path. This paper proposes an advanced parameter estimation scheme that can extract multiple dynamic parameters, including Doppler frequency, angle-of-arrival (AoA), and delay, in a communication uplink channel and completes the localization of multiple moving targets. 
Our scheme is based on the multi-element Taylor series of the CSI ratio that converts a nonlinear function of sensing parameters to linear forms and enables the applications of traditional sensing algorithms. Using the truncated Taylor series, we develop novel multiple-signal-classification grid searching algorithms for estimating Doppler frequencies and AoAs and use the least-square method to obtain delays. Both experimental and simulation results are provided, demonstrating that our proposed scheme can achieve good performances for sensing both single and multiple dynamic paths, without requiring the presence of a LOS path. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.03250v1-abstract-full').style.display = 'none'; document.getElementById('2211.03250v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2210.04504">arXiv:2210.04504</a> <span> [<a href="https://arxiv.org/pdf/2210.04504">pdf</a>, <a href="https://arxiv.org/format/2210.04504">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Sampling of Correlated Bandlimited Continuous Signals by Joint Time-vertex Graph Fourier Transform </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Ni%2C+Z">Zhongyi Ni</a>, <a href="/search/eess?searchtype=author&query=Ji%2C+F">Feng Ji</a>, <a href="/search/eess?searchtype=author&query=Sheng%2C+H">Hang Sheng</a>, <a href="/search/eess?searchtype=author&query=Feng%2C+H">Hui Feng</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+B">Bo Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2210.04504v1-abstract-short" style="display: inline;"> When sampling multiple signals, the correlation between the signals can be exploited to reduce the overall number of samples. In this paper, we study the sampling theory of multiple correlated signals, using correlation to sample them at the lowest sampling rate. Based on the correlation between signal sources, we model multiple continuous-time signals as continuous time-vertex graph signals. The… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.04504v1-abstract-full').style.display = 'inline'; document.getElementById('2210.04504v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2210.04504v1-abstract-full" style="display: none;"> When sampling multiple signals, the correlation between the signals can be exploited to reduce the overall number of samples. In this paper, we study the sampling theory of multiple correlated signals, using correlation to sample them at the lowest sampling rate. Based on the correlation between signal sources, we model multiple continuous-time signals as continuous time-vertex graph signals. 
The graph signals are projected onto orthogonal bases to remove spatial correlation and reduce dimensions by graph Fourier transform. When the bandwidths of the original signals and the reduced dimension signals are given, we prove the minimum sampling rate required for recovery of the original signals, and propose a feasible sampling scheme. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.04504v1-abstract-full').style.display = 'none'; document.getElementById('2210.04504v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2209.02285">arXiv:2209.02285</a> <span> [<a href="https://arxiv.org/pdf/2209.02285">pdf</a>, <a href="https://arxiv.org/format/2209.02285">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> High Dynamic Range Image Quality Assessment Based on Frequency Disparity </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Liu%2C+Y">Yue Liu</a>, <a href="/search/eess?searchtype=author&query=Ni%2C+Z">Zhangkai Ni</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+S">Shiqi Wang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+H">Hanli Wang</a>, <a href="/search/eess?searchtype=author&query=Kwong%2C+S">Sam Kwong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2209.02285v1-abstract-short" style="display: inline;"> In this paper, a novel and effective image quality assessment (IQA) algorithm based on frequency disparity for high dynamic range (HDR) images is proposed, termed as local-global frequency feature-based model (LGFM). Motivated by the assumption that the human visual system is highly adapted for extracting structural information and partial frequencies when perceiving the visual scene, the Gabor an… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.02285v1-abstract-full').style.display = 'inline'; document.getElementById('2209.02285v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2209.02285v1-abstract-full" style="display: none;"> In this paper, a novel and effective image quality assessment (IQA) algorithm based on frequency disparity for high dynamic range (HDR) images is proposed, termed as local-global frequency feature-based model (LGFM). Motivated by the assumption that the human visual system is highly adapted for extracting structural information and partial frequencies when perceiving the visual scene, the Gabor and the Butterworth filters are applied to the luminance of the HDR image to extract local and global frequency features, respectively. 
The similarity measurement and feature pooling are sequentially performed on the frequency features to obtain the predicted quality score. The experiments evaluated on four widely used benchmarks demonstrate that the proposed LGFM can provide a higher consistency with the subjective perception compared with the state-of-the-art HDR IQA methods. Our code is available at: \url{https://github.com/eezkni/LGFM}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.02285v1-abstract-full').style.display = 'none'; document.getElementById('2209.02285v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 September, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2208.09791">arXiv:2208.09791</a> <span> [<a href="https://arxiv.org/pdf/2208.09791">pdf</a>, <a href="https://arxiv.org/format/2208.09791">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Joint Communications and Sensing Employing Optimized MIMO-OFDM Signals </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wu%2C+K">Kai Wu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+J+A">J. Andrew Zhang</a>, <a href="/search/eess?searchtype=author&query=Ni%2C+Z">Zhitong Ni</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+X">Xiaojing Huang</a>, <a href="/search/eess?searchtype=author&query=Guo%2C+Y+J">Y. Jay Guo</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+S">Shanzhi Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2208.09791v1-abstract-short" style="display: inline;"> Joint communication and sensing (JCAS) has the potential to improve the overall energy, cost and frequency efficiency of IoT systems. As a first effort, we propose to optimize the MIMO-OFDM data symbols carried by sub-carriers for better time- and spatial-domain signal orthogonality. This not only boosts the availability of usable signals for JCAS, but also significantly facilitates Internet-of-Th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.09791v1-abstract-full').style.display = 'inline'; document.getElementById('2208.09791v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2208.09791v1-abstract-full" style="display: none;"> Joint communication and sensing (JCAS) has the potential to improve the overall energy, cost and frequency efficiency of IoT systems. As a first effort, we propose to optimize the MIMO-OFDM data symbols carried by sub-carriers for better time- and spatial-domain signal orthogonality. 
This not only boosts the availability of usable signals for JCAS, but also significantly facilitates Internet-of-Things (IoT) devices to perform high-quality sensing. We establish an optimization problem that modifies data symbols on sub-carriers to enhance the above-mentioned signal orthogonality. We also develop an efficient algorithm to solve the problem based on the majorization-minimization framework. Moreover, we discover unique signal structures and features from the newly modeled problem, which substantially reduce the complexity of majorizing the objective function. We also develop new projectors to enforce the feasibility of the obtained solution. Simulations show that, compared with the original communication waveform to achieve the same sensing performance, the optimized waveform can reduce the signal-to-noise ratio (SNR) requirement by 3~4.5 dB, while the SNR loss for the uncoded bit error rate is only 1~1.5 dB. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.09791v1-abstract-full').style.display = 'none'; document.getElementById('2208.09791v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 August, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 7 figures; submitted to an IEEE journal</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2207.09514">arXiv:2207.09514</a> <span> [<a href="https://arxiv.org/pdf/2207.09514">pdf</a>, <a href="https://arxiv.org/format/2207.09514">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> ESPnet-SE++: Speech Enhancement for Robust Speech Recognition, Translation, and Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Lu%2C+Y">Yen-Ju Lu</a>, <a href="/search/eess?searchtype=author&query=Chang%2C+X">Xuankai Chang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+C">Chenda Li</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+W">Wangyou Zhang</a>, <a href="/search/eess?searchtype=author&query=Cornell%2C+S">Samuele Cornell</a>, <a href="/search/eess?searchtype=author&query=Ni%2C+Z">Zhaoheng Ni</a>, <a href="/search/eess?searchtype=author&query=Masuyama%2C+Y">Yoshiki Masuyama</a>, <a href="/search/eess?searchtype=author&query=Yan%2C+B">Brian Yan</a>, <a href="/search/eess?searchtype=author&query=Scheibler%2C+R">Robin Scheibler</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Z">Zhong-Qiu Wang</a>, <a href="/search/eess?searchtype=author&query=Tsao%2C+Y">Yu Tsao</a>, <a href="/search/eess?searchtype=author&query=Qian%2C+Y">Yanmin Qian</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a> </p> 
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2207.09514v1-abstract-short" style="display: inline;"> This paper presents recent progress on integrating speech separation and enhancement (SSE) into the ESPnet toolkit. Compared with the previous ESPnet-SE work, numerous features have been added, including recent state-of-the-art speech enhancement models with their respective training and evaluation recipes. Importantly, a new interface has been designed to flexibly combine speech enhancement front… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.09514v1-abstract-full').style.display = 'inline'; document.getElementById('2207.09514v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2207.09514v1-abstract-full" style="display: none;"> This paper presents recent progress on integrating speech separation and enhancement (SSE) into the ESPnet toolkit. Compared with the previous ESPnet-SE work, numerous features have been added, including recent state-of-the-art speech enhancement models with their respective training and evaluation recipes. Importantly, a new interface has been designed to flexibly combine speech enhancement front-ends with other tasks, including automatic speech recognition (ASR), speech translation (ST), and spoken language understanding (SLU). To showcase such integration, we performed experiments on carefully designed synthetic datasets for noisy-reverberant multi-channel ST and SLU tasks, which can be used as benchmark corpora for future research. In addition to these new tasks, we also use CHiME-4 and WSJ0-2Mix to benchmark multi- and single-channel SE approaches. Results show that the integration of SE front-ends with back-end tasks is a promising research direction even for tasks besides ASR, especially in the multi-channel scenario. The code is available online at https://github.com/ESPnet/ESPnet. The multi-channel ST and SLU datasets, which are another contribution of this work, are released on HuggingFace. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.09514v1-abstract-full').style.display = 'none'; document.getElementById('2207.09514v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 July, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2022. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear in Interspeech 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2207.00965">arXiv:2207.00965</a> <span> [<a href="https://arxiv.org/pdf/2207.00965">pdf</a>, <a href="https://arxiv.org/format/2207.00965">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Cycle-Interactive Generative Adversarial Network for Robust Unsupervised Low-Light Enhancement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Ni%2C+Z">Zhangkai Ni</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+W">Wenhan Yang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+H">Hanli Wang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+S">Shiqi Wang</a>, <a href="/search/eess?searchtype=author&query=Ma%2C+L">Lin Ma</a>, <a href="/search/eess?searchtype=author&query=Kwong%2C+S">Sam Kwong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2207.00965v1-abstract-short" style="display: inline;"> Getting rid of the fundamental limitations in fitting to the paired training data, recent unsupervised low-light enhancement methods excel in adjusting illumination and contrast of images. However, for unsupervised low light enhancement, the remaining noise suppression issue due to the lacking of supervision of detailed signal largely impedes the wide deployment of these methods in real-world appl… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.00965v1-abstract-full').style.display = 'inline'; document.getElementById('2207.00965v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2207.00965v1-abstract-full" style="display: none;"> Getting rid of the fundamental limitations in fitting to the paired training data, recent unsupervised low-light enhancement methods excel in adjusting illumination and contrast of images. However, for unsupervised low light enhancement, the remaining noise suppression issue due to the lacking of supervision of detailed signal largely impedes the wide deployment of these methods in real-world applications. Herein, we propose a novel Cycle-Interactive Generative Adversarial Network (CIGAN) for unsupervised low-light image enhancement, which is capable of not only better transferring illumination distributions between low/normal-light images but also manipulating detailed signals between two domains, e.g., suppressing/synthesizing realistic noise in the cyclic enhancement/degradation process. In particular, the proposed low-light guided transformation feed-forwards the features of low-light images from the generator of enhancement GAN (eGAN) into the generator of degradation GAN (dGAN). With the learned information of real low-light images, dGAN can synthesize more realistic diverse illumination and contrast in low-light images. 
Moreover, the feature randomized perturbation module in dGAN learns to increase the feature randomness to produce diverse feature distributions, persuading the synthesized low-light images to contain realistic noise. Extensive experiments demonstrate both the superiority of the proposed method and the effectiveness of each module in CIGAN. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.00965v1-abstract-full').style.display = 'none'; document.getElementById('2207.00965v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 July, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 7 figures, accepted to ACM MM 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2202.12298">arXiv:2202.12298</a> <span> [<a href="https://arxiv.org/pdf/2202.12298">pdf</a>, <a href="https://arxiv.org/format/2202.12298">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Towards Low-distortion Multi-channel Speech Enhancement: The ESPNet-SE Submission to The L3DAS22 Challenge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Lu%2C+Y">Yen-Ju Lu</a>, <a href="/search/eess?searchtype=author&query=Cornell%2C+S">Samuele Cornell</a>, <a href="/search/eess?searchtype=author&query=Chang%2C+X">Xuankai Chang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+W">Wangyou Zhang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+C">Chenda Li</a>, <a href="/search/eess?searchtype=author&query=Ni%2C+Z">Zhaoheng Ni</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Z">Zhong-Qiu Wang</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2202.12298v1-abstract-short" style="display: inline;"> This paper describes our submission to the L3DAS22 Challenge Task 1, which consists of speech enhancement with 3D Ambisonic microphones. The core of our approach combines Deep Neural Network (DNN) driven complex spectral mapping with linear beamformers such as the multi-frame multi-channel Wiener filter. Our proposed system has two DNNs and a linear beamformer in between. Both DNNs are trained to… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.12298v1-abstract-full').style.display = 'inline'; document.getElementById('2202.12298v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2202.12298v1-abstract-full" style="display: none;"> This paper describes our submission to the L3DAS22 Challenge Task 1, which consists of speech enhancement with 3D Ambisonic microphones. 
The core of our approach combines Deep Neural Network (DNN) driven complex spectral mapping with linear beamformers such as the multi-frame multi-channel Wiener filter. Our proposed system has two DNNs and a linear beamformer in between. Both DNNs are trained to perform complex spectral mapping, using a combination of waveform and magnitude spectrum losses. The estimated signal from the first DNN is used to drive a linear beamformer, and the beamforming result, together with this enhanced signal, are used as extra inputs for the second DNN which refines the estimation. Then, from this new estimated signal, the linear beamformer and second DNN are run iteratively. The proposed method was ranked first in the challenge, achieving, on the evaluation set, a ranking metric of 0.984, versus 0.833 of the challenge baseline. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.12298v1-abstract-full').style.display = 'none'; document.getElementById('2202.12298v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 February, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">to be published in IEEE ICASSP 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2201.11975">arXiv:2201.11975</a> <span> [<a href="https://arxiv.org/pdf/2201.11975">pdf</a>, <a href="https://arxiv.org/format/2201.11975">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Generalized Visual Quality Assessment of GAN-Generated Face Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Tian%2C+Y">Yu Tian</a>, <a href="/search/eess?searchtype=author&query=Ni%2C+Z">Zhangkai Ni</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+B">Baoliang Chen</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+S">Shiqi Wang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+H">Hanli Wang</a>, <a href="/search/eess?searchtype=author&query=Kwong%2C+S">Sam Kwong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2201.11975v1-abstract-short" style="display: inline;"> Recent years have witnessed the dramatically increased interest in face generation with generative adversarial networks (GANs). A number of successful GAN algorithms have been developed to produce vivid face images towards different application scenarios. 
However, little work has been dedicated to automatic quality assessment of such GAN-generated face images (GFIs), even less have been devoted to… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2201.11975v1-abstract-full').style.display = 'inline'; document.getElementById('2201.11975v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2201.11975v1-abstract-full" style="display: none;"> Recent years have witnessed the dramatically increased interest in face generation with generative adversarial networks (GANs). A number of successful GAN algorithms have been developed to produce vivid face images towards different application scenarios. However, little work has been dedicated to automatic quality assessment of such GAN-generated face images (GFIs), even less have been devoted to generalized and robust quality assessment of GFIs generated with unseen GAN model. Herein, we make the first attempt to study the subjective and objective quality towards generalized quality assessment of GFIs. More specifically, we establish a large-scale database consisting of GFIs from four GAN algorithms, the pseudo labels from image quality assessment (IQA) measures, as well as the human opinion scores via subjective testing. Subsequently, we develop a quality assessment model that is able to deliver accurate quality predictions for GFIs from both available and unseen GAN algorithms based on meta-learning. In particular, to learn shared knowledge from GFIs pairs that are born of limited GAN algorithms, we develop the convolutional block attention (CBA) and facial attributes-based analysis (ABA) modules, ensuring that the learned knowledge tends to be consistent with human visual perception. Extensive experiments exhibit that the proposed model achieves better performance compared with the state-of-the-art IQA models, and is capable of retaining the effectiveness when evaluating GFIs from the unseen GAN algorithms. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2201.11975v1-abstract-full').style.display = 'none'; document.getElementById('2201.11975v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 January, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2022. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 8 figures, journal paper</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2112.15299">arXiv:2112.15299</a> <span> [<a href="https://arxiv.org/pdf/2112.15299">pdf</a>, <a href="https://arxiv.org/format/2112.15299">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CSformer: Bridging Convolution and Transformer for Compressive Sensing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Ye%2C+D">Dongjie Ye</a>, <a href="/search/eess?searchtype=author&query=Ni%2C+Z">Zhangkai Ni</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+H">Hanli Wang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+J">Jian Zhang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+S">Shiqi Wang</a>, <a href="/search/eess?searchtype=author&query=Kwong%2C+S">Sam Kwong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2112.15299v1-abstract-short" style="display: inline;"> Convolution neural networks (CNNs) have succeeded in compressive image sensing. However, due to the inductive bias of locality and weight sharing, the convolution operations demonstrate the intrinsic limitations in modeling the long-range dependency. Transformer, designed initially as a sequence-to-sequence model, excels at capturing global contexts due to the self-attention-based architectures ev… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2112.15299v1-abstract-full').style.display = 'inline'; document.getElementById('2112.15299v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2112.15299v1-abstract-full" style="display: none;"> Convolution neural networks (CNNs) have succeeded in compressive image sensing. However, due to the inductive bias of locality and weight sharing, the convolution operations demonstrate the intrinsic limitations in modeling the long-range dependency. Transformer, designed initially as a sequence-to-sequence model, excels at capturing global contexts due to the self-attention-based architectures even though it may be equipped with limited localization abilities. This paper proposes CSformer, a hybrid framework that integrates the advantages of leveraging both detailed spatial information from CNN and the global context provided by transformer for enhanced representation learning. The proposed approach is an end-to-end compressive image sensing method, composed of adaptive sampling and recovery. In the sampling module, images are measured block-by-block by the learned sampling matrix. In the reconstruction stage, the measurement is projected into dual stems. One is the CNN stem for modeling the neighborhood relationships by convolution, and the other is the transformer stem for adopting global self-attention mechanism. 
The dual branches structure is concurrent, and the local features and global representations are fused under different resolutions to maximize the complementary of features. Furthermore, we explore a progressive strategy and window-based transformer block to reduce the parameter and computational complexity. The experimental results demonstrate the effectiveness of the dedicated transformer-based architecture for compressive sensing, which achieves superior performance compared to state-of-the-art methods on different datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2112.15299v1-abstract-full').style.display = 'none'; document.getElementById('2112.15299v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 December, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2111.07518">arXiv:2111.07518</a> <span> [<a href="https://arxiv.org/pdf/2111.07518">pdf</a>, <a href="https://arxiv.org/format/2111.07518">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Time-Frequency Attention for Monaural Speech Enhancement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qiquan Zhang</a>, <a href="/search/eess?searchtype=author&query=Song%2C+Q">Qi Song</a>, <a href="/search/eess?searchtype=author&query=Ni%2C+Z">Zhaoheng Ni</a>, <a href="/search/eess?searchtype=author&query=Nicolson%2C+A">Aaron Nicolson</a>, <a href="/search/eess?searchtype=author&query=Li%2C+H">Haizhou Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2111.07518v3-abstract-short" style="display: inline;"> Most studies on speech enhancement generally don't consider the energy distribution of speech in time-frequency (T-F) representation, which is important for accurate prediction of mask or spectra. In this paper, we present a simple yet effective T-F attention (TFA) module, where a 2-D attention map is produced to provide differentiated weights to the spectral components of T-F representation. To v… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.07518v3-abstract-full').style.display = 'inline'; document.getElementById('2111.07518v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2111.07518v3-abstract-full" style="display: none;"> Most studies on speech enhancement generally don't consider the energy distribution of speech in time-frequency (T-F) representation, which is important for accurate prediction of mask or spectra. In this paper, we present a simple yet effective T-F attention (TFA) module, where a 2-D attention map is produced to provide differentiated weights to the spectral components of T-F representation. 
To validate the effectiveness of our proposed TFA module, we use the residual temporal convolution network (ResTCN) as the backbone network and conduct extensive experiments on two commonly used training targets. Our experiments demonstrate that applying our TFA module significantly improves the performance in terms of five objective evaluation metrics with negligible parameter overhead. The evaluation results show that the proposed ResTCN with the TFA module (ResTCN+TFA) consistently outperforms other baselines by a large margin. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.07518v3-abstract-full').style.display = 'none'; document.getElementById('2111.07518v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 November, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 4 figures, Accepted and presented at ICASSP 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2110.15018">arXiv:2110.15018</a> <span> [<a href="https://arxiv.org/pdf/2110.15018">pdf</a>, <a href="https://arxiv.org/format/2110.15018">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> TorchAudio: Building Blocks for Audio and Speech Processing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Yang%2C+Y">Yao-Yuan Yang</a>, <a href="/search/eess?searchtype=author&query=Hira%2C+M">Moto Hira</a>, <a href="/search/eess?searchtype=author&query=Ni%2C+Z">Zhaoheng Ni</a>, <a href="/search/eess?searchtype=author&query=Chourdia%2C+A">Anjali Chourdia</a>, <a href="/search/eess?searchtype=author&query=Astafurov%2C+A">Artyom Astafurov</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+C">Caroline Chen</a>, <a href="/search/eess?searchtype=author&query=Yeh%2C+C">Ching-Feng Yeh</a>, <a href="/search/eess?searchtype=author&query=Puhrsch%2C+C">Christian Puhrsch</a>, <a href="/search/eess?searchtype=author&query=Pollack%2C+D">David Pollack</a>, <a href="/search/eess?searchtype=author&query=Genzel%2C+D">Dmitriy Genzel</a>, <a href="/search/eess?searchtype=author&query=Greenberg%2C+D">Donny Greenberg</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+E+Z">Edward Z. 
Yang</a>, <a href="/search/eess?searchtype=author&query=Lian%2C+J">Jason Lian</a>, <a href="/search/eess?searchtype=author&query=Mahadeokar%2C+J">Jay Mahadeokar</a>, <a href="/search/eess?searchtype=author&query=Hwang%2C+J">Jeff Hwang</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+J">Ji Chen</a>, <a href="/search/eess?searchtype=author&query=Goldsborough%2C+P">Peter Goldsborough</a>, <a href="/search/eess?searchtype=author&query=Roy%2C+P">Prabhat Roy</a>, <a href="/search/eess?searchtype=author&query=Narenthiran%2C+S">Sean Narenthiran</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a>, <a href="/search/eess?searchtype=author&query=Chintala%2C+S">Soumith Chintala</a>, <a href="/search/eess?searchtype=author&query=Quenneville-B%C3%A9lair%2C+V">Vincent Quenneville-Bélair</a>, <a href="/search/eess?searchtype=author&query=Shi%2C+Y">Yangyang Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2110.15018v2-abstract-short" style="display: inline;"> This document describes version 0.10 of TorchAudio: building blocks for machine learning applications in the audio and speech processing domain. The objective of TorchAudio is to accelerate the development and deployment of machine learning applications for researchers and engineers by providing off-the-shelf building blocks. The building blocks are designed to be GPU-compatible, automatically dif… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.15018v2-abstract-full').style.display = 'inline'; document.getElementById('2110.15018v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2110.15018v2-abstract-full" style="display: none;"> This document describes version 0.10 of TorchAudio: building blocks for machine learning applications in the audio and speech processing domain. The objective of TorchAudio is to accelerate the development and deployment of machine learning applications for researchers and engineers by providing off-the-shelf building blocks. The building blocks are designed to be GPU-compatible, automatically differentiable, and production-ready. TorchAudio can be easily installed from Python Package Index repository and the source code is publicly available under a BSD-2-Clause License (as of September 2021) at https://github.com/pytorch/audio. In this document, we provide an overview of the design principles, functionalities, and benchmarks of TorchAudio. We also benchmark our implementation of several audio and speech operations and models. We verify through the benchmarks that our implementations of various operations and models are valid and perform similarly to other publicly available implementations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.15018v2-abstract-full').style.display = 'none'; document.getElementById('2110.15018v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 February, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2021. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICASSP 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2110.10965">arXiv:2110.10965</a> <span> [<a href="https://arxiv.org/pdf/2110.10965">pdf</a>, <a href="https://arxiv.org/format/2110.10965">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> 2020 CATARACTS Semantic Segmentation Challenge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Luengo%2C+I">Imanol Luengo</a>, <a href="/search/eess?searchtype=author&query=Grammatikopoulou%2C+M">Maria Grammatikopoulou</a>, <a href="/search/eess?searchtype=author&query=Mohammadi%2C+R">Rahim Mohammadi</a>, <a href="/search/eess?searchtype=author&query=Walsh%2C+C">Chris Walsh</a>, <a href="/search/eess?searchtype=author&query=Nwoye%2C+C+I">Chinedu Innocent Nwoye</a>, <a href="/search/eess?searchtype=author&query=Alapatt%2C+D">Deepak Alapatt</a>, <a href="/search/eess?searchtype=author&query=Padoy%2C+N">Nicolas Padoy</a>, <a href="/search/eess?searchtype=author&query=Ni%2C+Z">Zhen-Liang Ni</a>, <a href="/search/eess?searchtype=author&query=Fan%2C+C">Chen-Chen Fan</a>, <a href="/search/eess?searchtype=author&query=Bian%2C+G">Gui-Bin Bian</a>, <a href="/search/eess?searchtype=author&query=Hou%2C+Z">Zeng-Guang Hou</a>, <a href="/search/eess?searchtype=author&query=Ha%2C+H">Heonjin Ha</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+J">Jiacheng Wang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+H">Haojie Wang</a>, <a href="/search/eess?searchtype=author&query=Guo%2C+D">Dong Guo</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+L">Lu Wang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+G">Guotai Wang</a>, <a href="/search/eess?searchtype=author&query=Islam%2C+M">Mobarakol Islam</a>, <a href="/search/eess?searchtype=author&query=Giddwani%2C+B">Bharat Giddwani</a>, <a href="/search/eess?searchtype=author&query=Hongliang%2C+R">Ren Hongliang</a>, <a href="/search/eess?searchtype=author&query=Pissas%2C+T">Theodoros Pissas</a>, <a href="/search/eess?searchtype=author&query=Ravasio%2C+C">Claudio Ravasio</a>, <a href="/search/eess?searchtype=author&query=Huber%2C+M">Martin Huber</a>, <a href="/search/eess?searchtype=author&query=Birch%2C+J">Jeremy Birch</a>, <a href="/search/eess?searchtype=author&query=Rio%2C+J+M+N+D">Joan M. Nunez Do Rio</a> , et al. (15 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2110.10965v2-abstract-short" style="display: inline;"> Surgical scene segmentation is essential for anatomy and instrument localization which can be further used to assess tissue-instrument interactions during a surgical procedure. In 2017, the Challenge on Automatic Tool Annotation for cataRACT Surgery (CATARACTS) released 50 cataract surgery videos accompanied by instrument usage annotations. 
arXiv:2110.10965 [eess.IV, cs.CV]
2020 CATARACTS Semantic Segmentation Challenge
Authors: Imanol Luengo, Maria Grammatikopoulou, Rahim Mohammadi, Chris Walsh, Chinedu Innocent Nwoye, Deepak Alapatt, Nicolas Padoy, Zhen-Liang Ni, Chen-Chen Fan, Gui-Bin Bian, Zeng-Guang Hou, Heonjin Ha, Jiacheng Wang, Haojie Wang, Dong Guo, Lu Wang, Guotai Wang, Mobarakol Islam, Bharat Giddwani, Ren Hongliang, Theodoros Pissas, Claudio Ravasio, Martin Huber, Jeremy Birch, Joan M. Nunez Do Rio, et al. (15 additional authors not shown)
Abstract: Surgical scene segmentation is essential for anatomy and instrument localization, which can be further used to assess tissue-instrument interactions during a surgical procedure. In 2017, the Challenge on Automatic Tool Annotation for cataRACT Surgery (CATARACTS) released 50 cataract surgery videos accompanied by instrument usage annotations. These annotations included frame-level instrument presence information. In 2020, we released pixel-wise semantic annotations for anatomy and instruments for 4670 images sampled from 25 videos of the CATARACTS training set. The 2020 CATARACTS Semantic Segmentation Challenge, which was a sub-challenge of the 2020 MICCAI Endoscopic Vision (EndoVis) Challenge, presented three sub-tasks to assess participating solutions on anatomical structure and instrument segmentation. Their performance was assessed on a hidden test set of 531 images from 10 videos of the CATARACTS test set.
Submitted 24 February, 2022; v1 submitted 21 October, 2021; originally announced October 2021.
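
As a rough illustration of how segmentation performance of this kind is typically assessed (this is not the official challenge evaluation code), a mean per-class intersection-over-union can be computed as follows; pred and target are assumed to be integer-labelled masks.

import numpy as np

def mean_iou(pred, target, num_classes):
    """Mean intersection-over-union over classes present in either map."""
    ious = []
    for c in range(num_classes):
        p, t = pred == c, target == c
        union = np.logical_or(p, t).sum()
        if union == 0:
            continue  # class absent from both prediction and ground truth
        ious.append(np.logical_and(p, t).sum() / union)
    return float(np.mean(ious))
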
arXiv:2105.06270 [cs.LG, cs.RO, eess.SP]
Group Feature Learning and Domain Adversarial Neural Network for aMCI Diagnosis System Based on EEG
Authors: Chen-Chen Fan, Haiqun Xie, Liang Peng, Hongjun Yang, Zhen-Liang Ni, Guan'an Wang, Yan-Jie Zhou, Sheng Chen, Zhijie Fang, Shuyun Huang, Zeng-Guang Hou
Abstract: Medical diagnostic robot systems have attracted increasing attention due to their objectivity and accuracy. The diagnosis of mild cognitive impairment (MCI) is considered an effective means to prevent Alzheimer's disease (AD). Doctors diagnose MCI based on various clinical examinations, which are expensive, and the diagnosis results depend on the doctors' expertise. Therefore, it is necessary to develop a robot diagnostic system to eliminate the influence of human factors and obtain higher accuracy. In this paper, we propose a novel Group Feature Domain Adversarial Neural Network (GF-DANN) for amnestic MCI (aMCI) diagnosis, which involves two important modules. A Group Feature Extraction (GFE) module is proposed to reduce individual differences by learning group-level features through adversarial learning.
A Dual Branch Domain Adaptation (DBDA) module is carefully designed to reduce the distribution difference between the source and target domains through domain adaptation. On three types of data sets, GF-DANN achieves the best accuracy compared with classic machine learning and deep learning methods. On the DMS data set, GF-DANN obtains an accuracy of 89.47%, with a sensitivity of 90% and a specificity of 89%. In addition, by comparing three EEG data collection paradigms, our results demonstrate that the DMS paradigm has the potential to support an aMCI diagnosis robot system.
Submitted 28 April, 2021; originally announced May 2021.
Comments: This paper has been accepted by the 2021 International Conference on Robotics and Automation (ICRA 2021).
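
The GFE and DBDA modules above are specific to the paper, but the underlying domain-adversarial idea is commonly implemented with a gradient reversal layer. A generic PyTorch sketch of that building block (an illustrative stand-in, not the paper's GF-DANN modules):

import torch
from torch import nn

class GradReverse(torch.autograd.Function):
    """Identity in the forward pass; reverses and scales gradients in the backward pass."""
    @staticmethod
    def forward(ctx, x, lambd):
        ctx.lambd = lambd
        return x.view_as(x)

    @staticmethod
    def backward(ctx, grad_output):
        return -ctx.lambd * grad_output, None

class DomainClassifier(nn.Module):
    """Predicts the domain label from features passed through gradient reversal,
    so the feature extractor is pushed toward domain-invariant features."""
    def __init__(self, feat_dim, n_domains=2, lambd=1.0):
        super().__init__()
        self.lambd = lambd
        self.net = nn.Sequential(nn.Linear(feat_dim, 64), nn.ReLU(), nn.Linear(64, n_domains))

    def forward(self, features):
        return self.net(GradReverse.apply(features, self.lambd))
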
arXiv:2012.15052 [eess.IV, cs.CV]
Unpaired Image Enhancement with Quality-Attention Generative Adversarial Network
Authors: Zhangkai Ni, Wenhan Yang, Shiqi Wang, Lin Ma, Sam Kwong
Abstract: In this work, we aim to learn an unpaired image enhancement model, which can enrich low-quality images with the characteristics of high-quality images provided by users. We propose a quality attention generative adversarial network (QAGAN) trained on unpaired data, based on a bidirectional Generative Adversarial Network (GAN) embedded with a quality attention module (QAM). The key novelty of the proposed QAGAN lies in the QAM injected into the generator, which learns domain-relevant quality attention directly from the two domains. More specifically, the proposed QAM allows the generator to effectively select semantic-related characteristics spatial-wise and to adaptively incorporate style-related attributes channel-wise. Therefore, in our proposed QAGAN, not only the discriminators but also the generator can directly access both domains, which significantly facilitates learning the mapping function. Extensive experimental results show that, compared with state-of-the-art methods based on unpaired learning, our proposed method achieves better performance in both objective and subjective evaluations.
Submitted 30 December, 2020; originally announced December 2020.
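
The paper's QAM is not specified here, but the general pattern of channel-wise and spatial-wise attention it refers to can be sketched generically in PyTorch (an illustrative stand-in, not QAGAN's actual module):

import torch
from torch import nn

class ChannelSpatialAttention(nn.Module):
    """Squeeze-and-excitation-style channel gating combined with a 1x1-conv spatial gate."""
    def __init__(self, channels, reduction=8):
        super().__init__()
        self.channel_gate = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(channels, channels // reduction, kernel_size=1), nn.ReLU(),
            nn.Conv2d(channels // reduction, channels, kernel_size=1), nn.Sigmoid())
        self.spatial_gate = nn.Sequential(nn.Conv2d(channels, 1, kernel_size=1), nn.Sigmoid())

    def forward(self, x):
        # Re-weight features along the channel axis, then along the spatial axes.
        return x * self.channel_gate(x) * self.spatial_gate(x)
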
arXiv:2012.03388 [cs.SD, cs.LG, eess.AS]
Combining Spatial Clustering with LSTM Speech Models for Multichannel Speech Enhancement
Authors: Felix Grezes, Zhaoheng Ni, Viet Anh Trinh, Michael Mandel
Abstract: Recurrent neural networks using the LSTM architecture can achieve significant single-channel noise reduction. It is not obvious, however, how to apply them to multi-channel inputs in a way that can generalize to new microphone configurations. In contrast, spatial clustering techniques can achieve such generalization, but lack a strong signal model. This paper combines the two approaches to attain both the spatial separation performance and generality of multichannel spatial clustering and the signal modeling performance of multiple parallel single-channel LSTM speech enhancers. The system is compared to several baselines on the CHiME-3 dataset in terms of speech quality predicted by the PESQ algorithm and the word error rate of a recognizer trained on mismatched conditions, in order to focus on generalization. Our experiments show that by combining the LSTM models with spatial clustering, we reduce the word error rate by 4.6% absolute (17.2% relative) on the development set and 11.2% absolute (25.5% relative) on the test set compared with the spatial clustering system, and by 10.75% (32.72% relative) on the development set and 6.12% absolute (15.76% relative) on the test set compared with the LSTM model.
Submitted 2 December, 2020; originally announced December 2020.
Comments: arXiv admin note: text overlap with arXiv:2012.01576, arXiv:2012.02191.

arXiv:2012.02191 [cs.SD, cs.LG, eess.AS]
Improved MVDR Beamforming Using LSTM Speech Models to Clean Spatial Clustering Masks
Authors: Zhaoheng Ni, Felix Grezes, Viet Anh Trinh, Michael I. Mandel
Abstract: Spatial clustering techniques can achieve significant multi-channel noise reduction across relatively arbitrary microphone configurations, but have difficulty incorporating a detailed speech/noise model. In contrast, LSTM neural networks have successfully been trained to recognize speech from noise on single-channel inputs, but have difficulty taking full advantage of the information in multi-channel recordings. This paper integrates these two approaches, training LSTM speech models to clean the masks generated by the Model-based EM Source Separation and Localization (MESSL) spatial clustering method. By doing so, it attains both the spatial separation performance and generality of multi-channel spatial clustering and the signal modeling performance of multiple parallel single-channel LSTM speech enhancers. Our experiments show that when our system is applied to the CHiME-3 dataset of noisy tablet recordings, it increases speech quality as measured by the Perceptual Evaluation of Speech Quality (PESQ) algorithm and reduces the word error rate of the baseline CHiME-3 speech recognizer, compared to the default BeamformIt beamformer.
Submitted 2 December, 2020; originally announced December 2020.
Comments: arXiv admin note: substantial text overlap with arXiv:2012.01576.
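
The recipe described above (estimate time-frequency masks, then beamform) is commonly realized with a mask-weighted covariance estimate followed by MVDR weights. A simplified per-frequency-bin sketch of that standard construction, not the paper's exact pipeline:

import numpy as np

def masked_covariance(stft_bin, mask):
    """stft_bin: (channels, frames) complex STFT at one frequency; mask: (frames,) in [0, 1]."""
    weighted = stft_bin * mask  # emphasize frames the mask attributes to this source
    return (weighted @ stft_bin.conj().T) / max(mask.sum(), 1e-8)

def mvdr_weights(noise_cov, steering):
    """MVDR beamformer: w = R_n^{-1} d / (d^H R_n^{-1} d)."""
    rn_inv_d = np.linalg.solve(noise_cov, steering)
    return rn_inv_d / (steering.conj() @ rn_inv_d)
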
arXiv:2012.01576 [cs.SD, cs.LG, eess.AS]
Enhancement of Spatial Clustering-Based Time-Frequency Masks using LSTM Neural Networks
Authors: Felix Grezes, Zhaoheng Ni, Viet Anh Trinh, Michael Mandel
Abstract: Recent works have shown that Deep Recurrent Neural Networks using the LSTM architecture can achieve strong single-channel speech enhancement by estimating time-frequency masks. However, these models do not naturally generalize to multi-channel inputs from varying microphone configurations. In contrast, spatial clustering techniques can achieve such generalization but lack a strong signal model. Our work proposes a combination of the two approaches. By using LSTMs to enhance spatial clustering-based time-frequency masks, we achieve both the signal modeling performance of multiple single-channel LSTM-DNN speech enhancers and the signal separation performance and generality of multi-channel spatial clustering. We compare our proposed system to several baselines on the CHiME-3 dataset. We evaluate the quality of the audio from each system using SDR from the BSS_eval toolkit and PESQ, and the intelligibility of the output of each system using the word error rate of a Kaldi automatic speech recognizer.
Submitted 2 December, 2020; originally announced December 2020.

arXiv:2011.10943 [eess.SP]
Waveform Optimization with Multiple Performance Metrics for Broadband Joint Communication and Radar Sensing
Authors: Zhitong Ni, J. Andrew Zhang, Kai Yang, Xiaojing Huang, Theodoros A. Tsiftsis
Abstract: Joint communication and radar sensing (JCAS) integrates communication and radar/radio sensing into one system, sharing one transmitted signal. In this paper, we investigate JCAS waveform optimization underlying communication signals, where a base station detects radar targets and communicates with mobile users simultaneously. We first develop individual novel waveform optimization problems for communications and sensing, respectively. For communications, we propose a novel lower bound on the sum rate that integrates multi-user interference and effective channel gain into one metric, simplifying the optimization of the sum rate.
For radar sensing, we consider optimizing one of two metrics, the mutual information or the Cramér-Rao bound. We then formulate the JCAS problem by optimizing the communication metric under different constraints on the radar metric, and obtain both closed-form and iterative solutions to the non-convex JCAS optimization problem. Numerical results are provided to verify the proposed optimization solutions.
Submitted 22 November, 2020; originally announced November 2020.
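
For context on the communication metric being bounded, the multi-user sum rate with inter-user interference is sum_k log2(1 + SINR_k). A small numpy illustration of that generic quantity (not the paper's specific lower bound or radar constraints):

import numpy as np

def sum_rate(H, W, noise_power=1.0):
    """H: (K, Nt) user channels; W: (Nt, K) precoding vectors; returns sum_k log2(1 + SINR_k)."""
    G = np.abs(H @ W) ** 2            # G[k, j] = |h_k w_j|^2 for row channel h_k
    signal = np.diag(G)
    interference = G.sum(axis=1) - signal
    return float(np.sum(np.log2(1.0 + signal / (interference + noise_power))))
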
arXiv:2011.09162 [eess.AS, cs.SD]
WPD++: An Improved Neural Beamformer for Simultaneous Speech Separation and Dereverberation
Authors: Zhaoheng Ni, Yong Xu, Meng Yu, Bo Wu, Shixiong Zhang, Dong Yu, Michael I Mandel
Abstract: This paper aims at eliminating the interfering speakers' speech, additive noise, and reverberation from noisy multi-talker speech mixtures, which benefits the automatic speech recognition (ASR) backend. While the recently proposed Weighted Power minimization Distortionless response (WPD) beamformer can perform separation and dereverberation simultaneously, its noise cancellation component still has room for improvement. We propose an improved neural WPD beamformer, "WPD++", with an enhanced beamforming module in the conventional WPD and a multi-objective loss function for joint training. The beamforming module is improved by utilizing the spatio-temporal correlation. A multi-objective loss, including the complex-spectral-domain scale-invariant signal-to-noise ratio (C-Si-SNR) and the magnitude-domain mean square error (Mag-MSE), is designed to place multiple constraints on the enhanced speech and the desired power of the dry clean signal. Joint training is conducted to optimize the complex-valued mask estimator and the WPD++ beamformer in an end-to-end way. The results show that the proposed WPD++ outperforms several state-of-the-art beamformers in enhanced speech quality and the word error rate (WER) of ASR.
Submitted 18 November, 2020; originally announced November 2020.
Comments: Accepted by SLT 2021.
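
The C-Si-SNR term in the multi-objective loss is a complex-spectral variant of the widely used scale-invariant SNR. For orientation only, here is a standard time-domain SI-SNR; the paper's loss additionally includes a magnitude-domain MSE term and operates on complex spectra:

import torch

def si_snr(est, ref, eps=1e-8):
    """Scale-invariant signal-to-noise ratio between 1-D estimated and reference waveforms, in dB."""
    est = est - est.mean()
    ref = ref - ref.mean()
    target = (torch.dot(est, ref) / (torch.dot(ref, ref) + eps)) * ref
    noise = est - target
    return 10.0 * torch.log10((target.pow(2).sum() + eps) / (noise.pow(2).sum() + eps))
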
arXiv:2011.09098 [eess.SP]
Uplink Sensing in Perceptive Mobile Networks with Asynchronous Transceivers
Authors: Zhitong Ni, J. Andrew Zhang, Xiaojing Huang, Kai Yang, Jinhong Yuan
Abstract: The perceptive mobile network (PMN) is a recently proposed next-generation network that integrates radar sensing into communication. One major challenge for realizing sensing in PMNs is how to deal with spatially separated, asynchronous transceivers. The asynchrony between the sensing receiver and transmitter causes both timing offsets (TOs) and carrier frequency offsets (CFOs) and leads to degraded sensing accuracy in both ranging and velocity measurements. In this paper, we propose an uplink sensing scheme for PMNs with asynchronous transceivers, aiming to resolve the sensing ambiguity and improve the sensing accuracy. We first adopt a cross-antenna cross-correlation (CACC) operation to remove the sensing ambiguity associated with both TOs and CFOs. Without sensing ambiguity, both the actual propagation delays and the actual Doppler frequencies of multiple targets can be obtained from the CACC outputs. To exploit the redundancy of the CACC outputs and reduce complexity, we then propose a novel mirrored-MUSIC algorithm, which halves the number of unknown parameters to be estimated, for obtaining the actual values of delays and Doppler frequencies. Finally, we propose a high-resolution angles-of-arrival (AoAs) estimation algorithm, which jointly processes all measurements from the spatial, temporal, and frequency domains, achieving significantly higher estimation accuracy than using samples from the spatial domain only. We also derive the theoretical mean square error of the proposed algorithms. Numerical results validate the effectiveness of the proposed scheme.
Submitted 18 November, 2020; originally announced November 2020.
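
The CACC idea can be illustrated with a toy sketch: assuming the timing and carrier frequency offsets contribute the same phase to every receive antenna, multiplying each antenna's channel measurements by the conjugate of a reference antenna cancels those common phase terms. This is a simplified illustration under that assumption, not the paper's full scheme:

import numpy as np

def cacc(csi, ref_antenna=0):
    """csi: (antennas, subcarriers, packets) complex channel measurements.
    Returns cross-correlations against the reference antenna; phase terms common
    to all antennas (TO/CFO) cancel in the product."""
    return csi * np.conj(csi[ref_antenna])[None, :, :]
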
arXiv:2004.09249 [cs.SD, cs.CL, eess.AS]
CHiME-6 Challenge: Tackling Multispeaker Speech Recognition for Unsegmented Recordings
Authors: Shinji Watanabe, Michael Mandel, Jon Barker, Emmanuel Vincent, Ashish Arora, Xuankai Chang, Sanjeev Khudanpur, Vimal Manohar, Daniel Povey, Desh Raj, David Snyder, Aswin Shanmugam Subramanian, Jan Trmal, Bar Ben Yair, Christoph Boeddeker, Zhaoheng Ni, Yusuke Fujita, Shota Horiguchi, Naoyuki Kanda, Takuya Yoshioka, Neville Ryant
Abstract: Following the success of the 1st, 2nd, 3rd, 4th and 5th CHiME challenges, we organize the 6th CHiME Speech Separation and Recognition Challenge (CHiME-6). The new challenge revisits the previous CHiME-5 challenge and further considers the problem of distant multi-microphone conversational speech diarization and recognition in everyday home environments.
The speech material is the same as the previous CHiME-5 recordings, except for accurate array synchronization. The material was elicited using a dinner-party scenario, with efforts taken to capture data representative of natural conversational speech. This paper provides a baseline description of the CHiME-6 challenge for both segmented multispeaker speech recognition (Track 1) and unsegmented multispeaker speech recognition (Track 2). Of note, Track 2 is the first challenge activity in the community to tackle an unsegmented multispeaker speech recognition scenario with a complete set of reproducible open-source baselines providing speech enhancement, speaker diarization, and speech recognition modules.
Submitted 2 May, 2020; v1 submitted 20 April, 2020; originally announced April 2020.
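
Both tracks are scored by word error rate; as a reminder of what that metric computes, here is a plain edit-distance implementation (not the challenge's Kaldi scoring scripts):

def word_error_rate(reference, hypothesis):
    """WER = (substitutions + insertions + deletions) / reference length, via edit distance."""
    r, h = reference.split(), hypothesis.split()
    d = [[0] * (len(h) + 1) for _ in range(len(r) + 1)]
    for i in range(len(r) + 1):
        d[i][0] = i
    for j in range(len(h) + 1):
        d[0][j] = j
    for i in range(1, len(r) + 1):
        for j in range(1, len(h) + 1):
            substitution = d[i - 1][j - 1] + (r[i - 1] != h[j - 1])
            d[i][j] = min(substitution, d[i - 1][j] + 1, d[i][j - 1] + 1)
    return d[len(r)][len(h)] / max(len(r), 1)
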
arXiv:2001.07093 [cs.CV, cs.LG, cs.RO, eess.IV]
BARNet: Bilinear Attention Network with Adaptive Receptive Fields for Surgical Instrument Segmentation
Authors: Zhen-Liang Ni, Gui-Bin Bian, Guan-An Wang, Xiao-Hu Zhou, Zeng-Guang Hou, Xiao-Liang Xie, Zhen Li, Yu-Han Wang
Abstract: Surgical instrument segmentation is extremely important for computer-assisted surgery. Unlike common object segmentation, it is more challenging due to the large illumination and scale variations caused by the special surgical scenes. In this paper, we propose a novel bilinear attention network with adaptive receptive fields to address these two challenges. For the illumination variation, the bilinear attention module captures second-order statistics to encode global contexts and semantic dependencies between local pixels. With them, semantic features in challenging areas can be inferred from their neighbors, and the distinction between various semantics can be boosted. For the scale variation, our adaptive receptive field module aggregates multi-scale features and automatically fuses them with different weights. Specifically, it encodes the semantic relationship between channels to emphasize feature maps with appropriate scales, changing the receptive field of subsequent convolutions.
The proposed network achieves the best performance, 97.47% mean IoU, on Cata7, and takes first place on EndoVis 2017, surpassing the second-ranking method by 10.10% IoU.
Submitted 21 May, 2020; v1 submitted 20 January, 2020; originally announced January 2020.
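
The "second-order statistics" mentioned for the bilinear attention module usually refer to channel-pairwise feature interactions. A generic bilinear (second-order) pooling sketch in PyTorch, as a stand-in illustration rather than BARNet's actual module:

import torch

def second_order_pool(features):
    """features: (batch, channels, height, width).
    Returns the (batch, channels, channels) matrix of channel-pairwise second-order statistics."""
    b, c, h, w = features.shape
    x = features.reshape(b, c, h * w)
    return torch.bmm(x, x.transpose(1, 2)) / (h * w)
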