Search | arXiv e-print repository

Showing 1–50 of 72 results for author: Wu, B

Searching in archive eess. Results are sorted by announcement date (newest first), 50 per page.
1. arXiv:2502.19873 [pdf, ps, other]  (eess.SP, cs.LG)
   Title: NeRFCom: Feature Transform Coding Meets Neural Radiance Field for Free-View 3D Scene Semantic Transmission
   Authors: Weijie Yue, Zhongwei Si, Bolin Wu, Sixian Wang, Xiaoqi Qin, Kai Niu, Jincheng Dai, Ping Zhang
   Abstract: We introduce NeRFCom, a novel communication system designed for end-to-end 3D scene transmission. Compared to traditional systems that rely on handcrafted NeRF semantic feature decomposition for compression and well-adaptive channel coding for transmission error correction, our NeRFCom employs a nonlinear transform and learned probabilistic models, enabling flexible variable-rate joint source-channel coding and efficient bandwidth allocation aligned with each NeRF semantic feature's contribution to 3D scene synthesis fidelity. Experimental results demonstrate that NeRFCom achieves efficient free-view 3D scene transmission while maintaining robustness under adverse channel conditions.
   Submitted 27 February, 2025; originally announced February 2025.
2. arXiv:2502.11946 [pdf, other]  (cs.CL, cs.AI, cs.HC, cs.SD, eess.AS)
   Title: Step-Audio: Unified Understanding and Generation in Intelligent Speech Interaction
   Authors: Ailin Huang, Boyong Wu, Bruce Wang, Chao Yan, Chen Hu, Chengli Feng, Fei Tian, Feiyu Shen, Jingbei Li, Mingrui Chen, Peng Liu, Ruihang Miao, Wang You, Xi Chen, Xuerui Yang, Yechang Huang, Yuxiang Zhang, Zheng Gong, Zixin Zhang, Hongyu Zhou, Jianjian Sun, Brian Li, Chengting Feng, Changyi Wan, Hanpeng Hu, et al. (120 additional authors not shown)
   Abstract: Real-time speech interaction, serving as a fundamental interface for human-machine collaboration, holds immense potential. However, current open-source models face limitations such as high costs in voice data collection, weakness in dynamic control, and limited intelligence. To address these challenges, this paper introduces Step-Audio, the first production-ready open-source solution. Key contributions include: 1) a 130B-parameter unified speech-text multi-modal model that achieves unified understanding and generation, with the Step-Audio-Chat version open-sourced; 2) a generative speech data engine that establishes an affordable voice cloning framework and produces the open-sourced lightweight Step-Audio-TTS-3B model through distillation; 3) an instruction-driven fine control system enabling dynamic adjustments across dialects, emotions, singing, and RAP; 4) an enhanced cognitive architecture augmented with tool calling and role-playing abilities to manage complex tasks effectively. Based on our new StepEval-Audio-360 evaluation benchmark, Step-Audio achieves state-of-the-art performance in human evaluations, especially in terms of instruction following. On open-source benchmarks such as LLaMA Question, it shows a 9.3% average performance improvement, demonstrating our commitment to advancing the development of open-source multi-modal language technologies. Our code and models are available at https://github.com/stepfun-ai/Step-Audio.
   Submitted 18 February, 2025; v1 submitted 17 February, 2025; originally announced February 2025.
3. arXiv:2411.15921 [pdf, other]  (cs.CV, eess.IV)
   Title: A Tunable Despeckling Neural Network Stabilized via Diffusion Equation
   Authors: Yi Ran, Zhichang Guo, Jia Li, Yao Li, Martin Burger, Boying Wu
   Abstract: The removal of multiplicative Gamma noise is a critical research area in the application of synthetic aperture radar (SAR) imaging, where neural networks serve as a potent tool. However, real-world data often diverges from theoretical models, exhibiting various disturbances, which makes the neural network less effective. Adversarial attacks can be used as a criterion for judging the adaptability of neural networks to real data, since adversarial attacks can find the most extreme perturbations that make neural networks ineffective. In this work, the diffusion equation is designed as a regularization block to provide sufficient regularity to the whole neural network, due to its spontaneous dissipative nature. We propose a tunable, regularized neural network framework that unrolls a shallow denoising neural network block and a diffusion regularity block into a single network for end-to-end training. The linear heat equation, known for its inherent smoothness and low-pass filtering properties, is adopted as the diffusion regularization block. In our model, a single time step hyperparameter governs the smoothness of the outputs and can be adjusted dynamically, significantly enhancing flexibility. The stability and convergence of our model are theoretically proven. Experimental results demonstrate that the proposed model effectively eliminates high-frequency oscillations induced by adversarial attacks. Finally, the proposed model is benchmarked against several state-of-the-art denoising methods on simulated images, adversarial samples, and real SAR images, achieving superior performance in both quantitative and visual evaluations.
   Submitted 23 December, 2024; v1 submitted 24 November, 2024; originally announced November 2024.
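A minimal numerical sketch of the kind of diffusion regularization block described in the abstract above: one explicit finite-difference step of the linear heat equation, in which a single time-step parameter (named `tau` here, an assumed name) controls how strongly high-frequency content is smoothed. This is a generic illustration, not the authors' unrolled network.

```python
import numpy as np

def heat_diffusion_step(u: np.ndarray, tau: float = 0.2) -> np.ndarray:
    """One explicit Euler step of the linear heat equation u_t = laplace(u).

    The single time-step parameter `tau` controls how strongly high-frequency
    content (e.g. speckle or adversarial oscillations) is damped; the explicit
    5-point scheme is stable for tau <= 0.25.
    """
    padded = np.pad(u, 1, mode="edge")               # replicate borders
    lap = (padded[:-2, 1:-1] + padded[2:, 1:-1] +
           padded[1:-1, :-2] + padded[1:-1, 2:] - 4.0 * u)
    return u + tau * lap

# Example: the same block with a mild and a stronger smoothing setting
noisy = np.random.rand(64, 64)
mild, strong = heat_diffusion_step(noisy, 0.05), heat_diffusion_step(noisy, 0.25)
```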
4. arXiv:2410.23279 [pdf, other]  (cs.SD, cs.AI, eess.AS)
   Title: A Transformer Model for Segmentation, Classification, and Caller Identification of Marmoset Vocalization
   Authors: Bin Wu, Shinnosuke Takamichi, Sakriani Sakti, Satoshi Nakamura
   Abstract: The marmoset, a highly vocal primate, has become a popular animal model for studying social-communicative behavior and its underlying mechanisms in comparison with human infant linguistic development. In the study of vocal communication, it is vital to know the caller identities, call contents, and vocal exchanges. Previous work using a CNN achieved a joint model for call segmentation, classification, and caller identification for marmoset vocalizations. However, the CNN has limitations in modeling long-range acoustic patterns; the Transformer architecture, which has been shown to outperform CNNs, uses a self-attention mechanism that efficiently processes information in parallel over long distances and captures the global structure of marmoset vocalizations. We propose using the Transformer to jointly segment and classify the marmoset calls and identify the callers for each vocalization.
   Submitted 21 November, 2024; v1 submitted 30 October, 2024; originally announced October 2024.
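A generic sketch of the joint-modeling idea described above: a shared Transformer encoder over spectrogram frames with one per-frame head each for segmentation (call vs. no call), call-type classification, and caller identification. The feature dimension and the numbers of call types and callers are placeholder assumptions, and this is not the authors' model.

```python
import torch
import torch.nn as nn

class JointVocalizationTagger(nn.Module):
    """Shared Transformer encoder with per-frame heads for three tasks:
    voice-activity segmentation, call-type classification, caller ID."""

    def __init__(self, n_mels=80, d_model=256, n_call_types=10, n_callers=8):
        super().__init__()
        self.proj = nn.Linear(n_mels, d_model)
        layer = nn.TransformerEncoderLayer(d_model, nhead=4, batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, num_layers=4)
        self.seg_head = nn.Linear(d_model, 2)             # call / no call
        self.type_head = nn.Linear(d_model, n_call_types)
        self.caller_head = nn.Linear(d_model, n_callers)

    def forward(self, feats):                 # feats: (batch, frames, n_mels)
        h = self.encoder(self.proj(feats))    # self-attention over all frames
        return self.seg_head(h), self.type_head(h), self.caller_head(h)

x = torch.randn(2, 300, 80)                   # two 3-second log-mel clips
seg, call_type, caller = JointVocalizationTagger()(x)
```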
5. arXiv:2408.07085 [pdf]  (physics.class-ph, eess.SP, physics.optics)
   Title: New Bounds on Spherical Antenna Bandwidth and Directivity: Updates to the Chu-Harrington Limits
   Authors: Carl Pfeiffer, Bae-Ian Wu
   Abstract: The Chu circuit model provides the basis for analyzing the minimum radiation quality factor, Q, of a given spherical mode. However, examples of electrically large spherical radiators readily demonstrate that this Q limit has limitations in predicting bandwidth. Spherical mode radiation is reexamined and an equivalent 1D transmission line model is derived that exactly models the fields. This model leads to a precise cutoff frequency of the spherical waveguide, which provides a clear boundary between propagating and evanescent fields. A new delineation of 'stored' and 'radiated' electromagnetic energy is postulated, which leads to a new definition of spherical mode Q. Next, attention is turned to the Harrington bound on the directivity-bandwidth tradeoff of an antenna with an arbitrary size. Harrington derived the maximum directivity for a specified number of spherical harmonics such that the Q is not 'large'. Here, the method of Lagrange multipliers is used to quantify the maximum directivity for a given bandwidth. It is shown that optimally exciting all spherical harmonics (including n > ka) enables both larger directivity and bandwidth than Harrington's previous limit. While Chu and Harrington's analyses are generally good approximations for most situations, the new self-consistent theory that defines fundamental antenna limits leads to updated results.
   Submitted 6 November, 2024; v1 submitted 8 August, 2024; originally announced August 2024.
   Comments: 18 pages, 15 figures
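For context, the classical limits this paper revisits are commonly quoted in the textbook forms below: the Chu lower bound on Q for the lowest-order spherical mode of a radiator enclosed in a sphere of radius a (wavenumber k), and Harrington's maximum directivity when spherical modes up to order N are excited. These are the standard expressions, not the updated bounds derived in the paper.

```latex
Q_{\mathrm{Chu}} = \frac{1}{(ka)^{3}} + \frac{1}{ka},
\qquad
D_{\max}(N) = N^{2} + 2N, \quad N \approx ka .
```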
class="abstract-short has-text-grey-dark mathjax" id="2407.11277v2-abstract-short" style="display: inline;"> Extracting the speech of participants in a conversation amidst interfering speakers and noise presents a challenging problem. In this paper, we introduce the novel task of target conversation extraction, where the goal is to extract the audio of a target conversation based on the speaker embedding of one of its participants. To accomplish this, we propose leveraging temporal patterns inherent in h&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.11277v2-abstract-full').style.display = 'inline'; document.getElementById('2407.11277v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.11277v2-abstract-full" style="display: none;"> Extracting the speech of participants in a conversation amidst interfering speakers and noise presents a challenging problem. In this paper, we introduce the novel task of target conversation extraction, where the goal is to extract the audio of a target conversation based on the speaker embedding of one of its participants. To accomplish this, we propose leveraging temporal patterns inherent in human conversations, particularly turn-taking dynamics, which uniquely characterize speakers engaged in conversation and distinguish them from interfering speakers and noise. Using neural networks, we show the feasibility of our approach on English and Mandarin conversation datasets. In the presence of interfering speakers, our results show an 8.19 dB improvement in signal-to-noise ratio for 2-speaker conversations and a 7.92 dB improvement for 2-4-speaker conversations. Code, dataset available at https://github.com/chentuochao/Target-Conversation-Extraction. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.11277v2-abstract-full').style.display = 'none'; document.getElementById('2407.11277v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by Interspeech 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.18373">arXiv:2406.18373</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.18373">pdf</a>, <a href="https://arxiv.org/format/2406.18373">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Dynamic Data Pruning for Automatic Speech Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Xiao%2C+Q">Qiao Xiao</a>, <a href="/search/eess?searchtype=author&amp;query=Ma%2C+P">Pingchuan Ma</a>, <a href="/search/eess?searchtype=author&amp;query=Fernandez-Lopez%2C+A">Adriana Fernandez-Lopez</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+B">Boqian Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Yin%2C+L">Lu Yin</a>, <a href="/search/eess?searchtype=author&amp;query=Petridis%2C+S">Stavros Petridis</a>, <a href="/search/eess?searchtype=author&amp;query=Pechenizkiy%2C+M">Mykola Pechenizkiy</a>, <a href="/search/eess?searchtype=author&amp;query=Pantic%2C+M">Maja Pantic</a>, <a href="/search/eess?searchtype=author&amp;query=Mocanu%2C+D+C">Decebal Constantin Mocanu</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+S">Shiwei Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.18373v1-abstract-short" style="display: inline;"> The recent success of Automatic Speech Recognition (ASR) is largely attributed to the ever-growing amount of training data. However, this trend has made model training prohibitively costly and imposed computational demands. While data pruning has been proposed to mitigate this issue by identifying a small subset of relevant data, its application in ASR has been barely explored, and existing works&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.18373v1-abstract-full').style.display = 'inline'; document.getElementById('2406.18373v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.18373v1-abstract-full" style="display: none;"> The recent success of Automatic Speech Recognition (ASR) is largely attributed to the ever-growing amount of training data. However, this trend has made model training prohibitively costly and imposed computational demands. While data pruning has been proposed to mitigate this issue by identifying a small subset of relevant data, its application in ASR has been barely explored, and existing works often entail significant overhead to achieve meaningful results. To fill this gap, this paper presents the first investigation of dynamic data pruning for ASR, finding that we can reach the full-data performance by dynamically selecting 70% of data. 
Furthermore, we introduce Dynamic Data Pruning for ASR (DDP-ASR), which offers several fine-grained pruning granularities specifically tailored for speech-related datasets, going beyond the conventional pruning of entire time sequences. Our intensive experiments show that DDP-ASR can save up to 1.6x training time with negligible performance loss. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.18373v1-abstract-full').style.display = 'none'; document.getElementById('2406.18373v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to Interspeech 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.13357">arXiv:2406.13357</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.13357">pdf</a>, <a href="https://arxiv.org/format/2406.13357">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Transferable speech-to-text large language model alignment module </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wu%2C+B">Boyong Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Yan%2C+C">Chao Yan</a>, <a href="/search/eess?searchtype=author&amp;query=Pu%2C+H">Haoran Pu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.13357v1-abstract-short" style="display: inline;"> By leveraging the power of Large Language Models(LLMs) and speech foundation models, state of the art speech-text bimodal works can achieve challenging tasks like spoken translation(ST) and question answering(SQA) altogether with much simpler architectures. In this paper, we utilize the capability of Whisper encoder and pre-trained Yi-6B. Empirical results reveal that modal alignment can be achiev&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.13357v1-abstract-full').style.display = 'inline'; document.getElementById('2406.13357v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.13357v1-abstract-full" style="display: none;"> By leveraging the power of Large Language Models(LLMs) and speech foundation models, state of the art speech-text bimodal works can achieve challenging tasks like spoken translation(ST) and question answering(SQA) altogether with much simpler architectures. In this paper, we utilize the capability of Whisper encoder and pre-trained Yi-6B. Empirical results reveal that modal alignment can be achieved with one layer module and hundred hours of speech-text multitask corpus. 
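A bare-bones sketch of the dynamic-pruning idea referenced above: at every epoch, re-score each training utterance (here simply by its most recent loss, an assumed criterion) and keep the top 70%. It only illustrates the "dynamically selecting 70% of data" setting, not the DDP-ASR algorithm or its speech-specific pruning granularities.

```python
import numpy as np

def select_dynamic_subset(scores: np.ndarray, keep_ratio: float = 0.7) -> np.ndarray:
    """Return indices of the `keep_ratio` fraction of samples with the highest
    current score; called once per epoch, so the retained subset changes as
    the model trains (dynamic rather than static pruning)."""
    n_keep = max(1, int(round(keep_ratio * len(scores))))
    return np.argsort(scores)[::-1][:n_keep]

# Toy usage: per-utterance losses gathered during the previous epoch
epoch_losses = np.random.rand(1000)
train_indices = select_dynamic_subset(epoch_losses, keep_ratio=0.7)
```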
8. arXiv:2406.13357 [pdf, other]  (cs.CL, cs.SD, eess.AS)
   Title: Transferable speech-to-text large language model alignment module
   Authors: Boyong Wu, Chao Yan, Haoran Pu
   Abstract: By leveraging the power of Large Language Models (LLMs) and speech foundation models, state-of-the-art speech-text bimodal works can achieve challenging tasks like spoken translation (ST) and question answering (SQA) altogether with much simpler architectures. In this paper, we utilize the capability of the Whisper encoder and the pre-trained Yi-6B. Empirical results reveal that modal alignment can be achieved with a one-layer module and a hundred hours of speech-text multitask corpus. We further swap Yi-6B with its human-preference-aligned version, Yi-6B-Chat, during inference, and discover that the alignment capability transfers as well. In addition, the alignment subspace revealed by singular value decomposition (SVD) implies that the linear alignment subspace is sparse, which leaves the possibility of concatenating other features, such as voice-print or video, to expand modality.
   Submitted 19 June, 2024; originally announced June 2024.
   Comments: Accepted by InterSpeech 2024; 5 pages, 2 figures
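The "one-layer module" above can be pictured as a single linear map from speech-encoder frames into the LLM's embedding space, and inspecting the singular values of that map is one way to probe whether the alignment subspace is effectively sparse. The dimensions below (1280 for a Whisper-style encoder, 4096 for the LLM) are illustrative assumptions; this is not the authors' implementation.

```python
import torch
import torch.nn as nn

class SpeechToLLMAdapter(nn.Module):
    """Single linear layer mapping speech-encoder features into the
    token-embedding space of a (frozen) large language model."""

    def __init__(self, speech_dim: int = 1280, llm_dim: int = 4096):
        super().__init__()
        self.proj = nn.Linear(speech_dim, llm_dim)

    def forward(self, speech_feats):           # (batch, frames, speech_dim)
        return self.proj(speech_feats)         # (batch, frames, llm_dim)

adapter = SpeechToLLMAdapter()
# Probe the learned map: a fast-decaying singular-value spectrum would
# suggest the linear alignment subspace is effectively low-rank/sparse.
singular_values = torch.linalg.svdvals(adapter.proj.weight)
print(singular_values[:10])
```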
9. arXiv:2405.20617 [pdf, other]  (eess.SP)
   Title: Large-scale Outdoor Cell-free mMIMO Channel Measurement in an Urban Scenario at 3.5 GHz
   Authors: Yuning Zhang, Thomas Choi, Zihang Cheng, Issei Kanno, Masaaki Ito, Jorge Gomez-Ponce, Hussein Hammoud, Bowei Wu, Ashwani Pradhan, Kelvin Arana, Pramod Krishna, Tianyi Yang, Tyler Chen, Ishita Vasishtha, Haoyu Xie, Linyu Sun, Andreas F. Molisch
   Abstract: The design of cell-free massive MIMO (CF-mMIMO) systems requires accurate, measurement-based channel models. This paper provides the first results from what is by far the most extensive outdoor measurement campaign for CF-mMIMO channels in an urban environment. We measured impulse responses between over 20,000 potential access point (AP) locations and 80 user equipments (UEs) at 3.5 GHz with 350 MHz bandwidth (BW). Measurements use a "virtual array" approach at the AP and a hybrid switched/virtual approach at the UE. This paper describes the sounder design, measurement environment, data processing, and sample results, particularly the evolution of the power-delay profiles (PDPs) as a function of the AP locations and its relation to the propagation environment.
   Submitted 6 June, 2024; v1 submitted 31 May, 2024; originally announced May 2024.
   Comments: Submitted to: VTC 2024-Fall
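A power-delay profile of the kind analyzed above is simply the delay-resolved power of the measured channel impulse response averaged over repeated snapshots. The sketch below shows that computation on a synthetic impulse response; it is generic post-processing, not the authors' sounder pipeline.

```python
import numpy as np

def power_delay_profile(cir: np.ndarray) -> np.ndarray:
    """Average power-delay profile (PDP) from complex channel impulse responses.

    cir: complex array of shape (n_snapshots, n_delay_bins).
    Returns the PDP in dB, normalized to its strongest tap.
    """
    pdp = np.mean(np.abs(cir) ** 2, axis=0)            # average over snapshots
    return 10.0 * np.log10(pdp / pdp.max() + 1e-12)

# Toy example: 100 snapshots of a sparse 3-tap channel plus noise
rng = np.random.default_rng(0)
cir = 0.01 * (rng.standard_normal((100, 256)) + 1j * rng.standard_normal((100, 256)))
cir[:, [5, 20, 60]] += [1.0, 0.5, 0.2]
print(power_delay_profile(cir)[:8])
```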
10. arXiv:2404.16484 [pdf, other]  (cs.CV, eess.IV)
   Title: Real-Time 4K Super-Resolution of Compressed AVIF Images. AIS 2024 Challenge Survey
   Authors: Marcos V. Conde, Zhijun Lei, Wen Li, Cosmin Stejerean, Ioannis Katsavounidis, Radu Timofte, Kihwan Yoon, Ganzorig Gankhuyag, Jiangtao Lv, Long Sun, Jinshan Pan, Jiangxin Dong, Jinhui Tang, Zhiyuan Li, Hao Wei, Chenyang Ge, Dongyang Zhang, Tianle Liu, Huaian Chen, Yi Jin, Menghan Zhou, Yiqiang Yan, Si Gao, Biao Wu, Shaoli Liu, et al. (50 additional authors not shown)
   Abstract: This paper introduces a novel benchmark as part of the AIS 2024 Real-Time Image Super-Resolution (RTSR) Challenge, which aims to upscale compressed images from 540p to 4K resolution (4x factor) in real-time on commercial GPUs. For this, we use a diverse test set containing a variety of 4K images ranging from digital art to gaming and photography. The images are compressed using the modern AVIF codec, instead of JPEG. All the proposed methods improve PSNR fidelity over Lanczos interpolation, and process images under 10ms. Out of the 160 participants, 25 teams submitted their code and models. The solutions present novel designs tailored for memory-efficiency and runtime on edge devices. This survey describes the best solutions for real-time SR of compressed high-resolution images.
   Submitted 25 April, 2024; originally announced April 2024.
   Comments: CVPR 2024, AI for Streaming (AIS) Workshop
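PSNR, the fidelity metric cited above relative to Lanczos interpolation, is computed as in the minimal sketch below (8-bit images assumed); this is not the challenge's official evaluation code.

```python
import numpy as np

def psnr(reference: np.ndarray, estimate: np.ndarray, max_val: float = 255.0) -> float:
    """Peak signal-to-noise ratio in dB between two images of equal shape."""
    mse = np.mean((reference.astype(np.float64) - estimate.astype(np.float64)) ** 2)
    return float("inf") if mse == 0 else 10.0 * np.log10(max_val ** 2 / mse)

# Toy usage: compare a super-resolved image against its ground truth
gt = np.random.randint(0, 256, (256, 256, 3), dtype=np.uint8)
sr = np.clip(gt.astype(np.int16) + np.random.randint(-3, 4, gt.shape), 0, 255).astype(np.uint8)
print(f"PSNR: {psnr(gt, sr):.2f} dB")
```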
To address this issue, an effective box-supervised technique with texture features is provided for ore image segmentation that can identify complete and independent ores. Firstly, a ghost feature pyramid network (Ghost-FPN) is proposed to process the features obtained from the backbone to reduce redundant semantic information and computation generated by complex networks. Then, an optimized detection head is proposed to obtain the feature to maintain accuracy. Finally, Lab color space (Lab) and local binary patterns (LBP) texture features are combined to form a fusion feature similarity-based loss function to improve accuracy while incurring no loss. Experiments on MS COCO have shown that the proposed fusion features are also worth studying on other types of datasets. Extensive experimental results demonstrate the effectiveness of the proposed method, which achieves over 50 frames per second with a small model size of 21.6 MB. Meanwhile, the method maintains a high level of accuracy compared with the state-of-the-art approaches on ore image dataset. The source code is available at \url{https://github.com/MVME-HBUT/OREINST}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.05929v1-abstract-full').style.display = 'none'; document.getElementById('2311.05929v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 8 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.02894">arXiv:2306.02894</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2306.02894">pdf</a>, <a href="https://arxiv.org/ps/2306.02894">ps</a>, <a href="https://arxiv.org/format/2306.02894">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Recyclable Semi-supervised Method Based on Multi-model Ensemble for Video Scene Parsing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wu%2C+B">Biao Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+S">Shaoli Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+D">Diankai Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Zheng%2C+C">Chengjian Zheng</a>, <a href="/search/eess?searchtype=author&amp;query=Gao%2C+S">Si Gao</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+X">Xiaofeng Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+N">Ning Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.02894v1-abstract-short" style="display: inline;"> Pixel-level Scene Understanding is one of the fundamental problems in computer vision, which aims at recognizing object classes, masks and semantics of each pixel in the given image. 
Since the real-world is actually video-based rather than a static state, learning to perform video semantic segmentation is more reasonable and practical for realistic applications. In this paper, we adopt Mask2Former&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.02894v1-abstract-full').style.display = 'inline'; document.getElementById('2306.02894v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.02894v1-abstract-full" style="display: none;"> Pixel-level Scene Understanding is one of the fundamental problems in computer vision, which aims at recognizing object classes, masks and semantics of each pixel in the given image. Since the real-world is actually video-based rather than a static state, learning to perform video semantic segmentation is more reasonable and practical for realistic applications. In this paper, we adopt Mask2Former as architecture and ViT-Adapter as backbone. Then, we propose a recyclable semi-supervised training method based on multi-model ensemble. Our method achieves the mIoU scores of 62.97% and 65.83% on Development test and final test respectively. Finally, we obtain the 2nd place in the Video Scene Parsing in the Wild Challenge at CVPR 2023. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.02894v1-abstract-full').style.display = 'none'; document.getElementById('2306.02894v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.01183">arXiv:2305.01183</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.01183">pdf</a>, <a href="https://arxiv.org/format/2305.01183">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Faster OreFSDet : A Lightweight and Effective Few-shot Object Detector for Ore Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+Y">Yang Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Cheng%2C+L">Le Cheng</a>, <a href="/search/eess?searchtype=author&amp;query=Peng%2C+Y">Yuting Peng</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+C">Chengming Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Fu%2C+Y">Yanwei Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+B">Bo Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Sun%2C+G">Guodong Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.01183v1-abstract-short" style="display: inline;"> For the ore particle size detection, obtaining a sizable amount of high-quality ore labeled data is time-consuming and expensive. General object detection methods often suffer from severe over-fitting with scarce labeled data. 
Despite their ability to eliminate over-fitting, existing few-shot object detectors encounter drawbacks such as slow detection speed and high memory requirements, making the&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.01183v1-abstract-full').style.display = 'inline'; document.getElementById('2305.01183v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.01183v1-abstract-full" style="display: none;"> For the ore particle size detection, obtaining a sizable amount of high-quality ore labeled data is time-consuming and expensive. General object detection methods often suffer from severe over-fitting with scarce labeled data. Despite their ability to eliminate over-fitting, existing few-shot object detectors encounter drawbacks such as slow detection speed and high memory requirements, making them difficult to implement in a real-world deployment scenario. To this end, we propose a lightweight and effective few-shot detector to achieve competitive performance with general object detection with only a few samples for ore images. First, the proposed support feature mining block characterizes the importance of location information in support features. Next, the relationship guidance block makes full use of support features to guide the generation of accurate candidate proposals. Finally, the dual-scale semantic aggregation module retrieves detailed features at different resolutions to contribute with the prediction process. Experimental results show that our method consistently exceeds the few-shot detectors with an excellent performance gap on all metrics. Moreover, our method achieves the smallest model size of 19MB as well as being competitive at 50 FPS detection speed compared with general object detectors. The source code is available at https://github.com/MVME-HBUT/Faster-OreFSDet. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.01183v1-abstract-full').style.display = 'none'; document.getElementById('2305.01183v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 11 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2212.13523">arXiv:2212.13523</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2212.13523">pdf</a>, <a href="https://arxiv.org/format/2212.13523">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TGRS.2023.3268554">10.1109/TGRS.2023.3268554 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> S2S-WTV: Seismic Data Noise Attenuation Using Weighted Total Variation Regularized Self-Supervised Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Xu%2C+Z">Zitai Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Luo%2C+Y">Yisi Luo</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+B">Bangyu Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Meng%2C+D">Deyu Meng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2212.13523v1-abstract-short" style="display: inline;"> Seismic data often undergoes severe noise due to environmental factors, which seriously affects subsequent applications. Traditional hand-crafted denoisers such as filters and regularizations utilize interpretable domain knowledge to design generalizable denoising techniques, while their representation capacities may be inferior to deep learning denoisers, which can learn complex and representativ&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.13523v1-abstract-full').style.display = 'inline'; document.getElementById('2212.13523v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2212.13523v1-abstract-full" style="display: none;"> Seismic data often undergoes severe noise due to environmental factors, which seriously affects subsequent applications. Traditional hand-crafted denoisers such as filters and regularizations utilize interpretable domain knowledge to design generalizable denoising techniques, while their representation capacities may be inferior to deep learning denoisers, which can learn complex and representative denoising mappings from abundant training pairs. However, due to the scarcity of high-quality training pairs, deep learning denoisers may sustain some generalization issues over various scenarios. In this work, we propose a self-supervised method that combines the capacities of deep denoiser and the generalization abilities of hand-crafted regularization for seismic data random noise attenuation. Specifically, we leverage the Self2Self (S2S) learning framework with a trace-wise masking strategy for seismic data denoising by solely using the observed noisy data. 
Parallelly, we suggest the weighted total variation (WTV) to further capture the horizontal local smooth structure of seismic data. Our method, dubbed as S2S-WTV, enjoys both high representation abilities brought from the self-supervised deep network and good generalization abilities of the hand-crafted WTV regularizer and the self-supervised nature. Therefore, our method can more effectively and stably remove the random noise and preserve the details and edges of the clean signal. To tackle the S2S-WTV optimization model, we introduce an alternating direction multiplier method (ADMM)-based algorithm. Extensive experiments on synthetic and field noisy seismic data demonstrate the effectiveness of our method as compared with state-of-the-art traditional and deep learning-based seismic data denoising methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.13523v1-abstract-full').style.display = 'none'; document.getElementById('2212.13523v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> TGRS 2023 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.14522">arXiv:2211.14522</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2211.14522">pdf</a>, <a href="https://arxiv.org/format/2211.14522">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Visual Fault Detection of Multi-scale Key Components in Freight Trains </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+Y">Yang Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+Y">Yang Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+H">Huilin Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+B">Bo Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Sun%2C+G">Guodong Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.14522v1-abstract-short" style="display: inline;"> Fault detection for key components in the braking system of freight trains is critical for ensuring railway transportation safety. Despite the frequently employed methods based on deep learning, these fault detectors are highly reliant on hardware resources and are complex to implement. In addition, no train fault detectors consider the drop in accuracy induced by scale variation of fault parts. 
T&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.14522v1-abstract-full').style.display = 'inline'; document.getElementById('2211.14522v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.14522v1-abstract-full" style="display: none;"> Fault detection for key components in the braking system of freight trains is critical for ensuring railway transportation safety. Despite the frequently employed methods based on deep learning, these fault detectors are highly reliant on hardware resources and are complex to implement. In addition, no train fault detectors consider the drop in accuracy induced by scale variation of fault parts. This paper proposes a lightweight anchor-free framework to solve the above problems. Specifically, to reduce the amount of computation and model size, we introduce a lightweight backbone and adopt an anchor-free method for localization and regression. To improve detection accuracy for multi-scale parts, we design a feature pyramid network to generate rectangular layers of different sizes to map parts with similar aspect ratios. Experiments on four fault datasets show that our framework achieves 98.44% accuracy while the model size is only 22.5 MB, outperforming state-of-the-art detectors. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.14522v1-abstract-full').style.display = 'none'; document.getElementById('2211.14522v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.05256">arXiv:2211.05256</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2211.05256">pdf</a>, <a href="https://arxiv.org/format/2211.05256">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Power Efficient Video Super-Resolution on Mobile NPUs with Deep Learning, Mobile AI &amp; AIM 2022 challenge: Report </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Ignatov%2C+A">Andrey Ignatov</a>, <a href="/search/eess?searchtype=author&amp;query=Timofte%2C+R">Radu Timofte</a>, <a href="/search/eess?searchtype=author&amp;query=Chiang%2C+C">Cheng-Ming Chiang</a>, <a href="/search/eess?searchtype=author&amp;query=Kuo%2C+H">Hsien-Kai Kuo</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+Y">Yu-Syuan Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Lee%2C+M">Man-Yu Lee</a>, <a href="/search/eess?searchtype=author&amp;query=Lu%2C+A">Allen Lu</a>, <a href="/search/eess?searchtype=author&amp;query=Cheng%2C+C">Chia-Ming Cheng</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+C">Chih-Cheng Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Yong%2C+J">Jia-Ying Yong</a>, <a href="/search/eess?searchtype=author&amp;query=Shuai%2C+H">Hong-Han Shuai</a>, <a href="/search/eess?searchtype=author&amp;query=Cheng%2C+W">Wen-Huang Cheng</a>, <a href="/search/eess?searchtype=author&amp;query=Jia%2C+Z">Zhuang Jia</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+T">Tianyu Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+Y">Yijian Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Bao%2C+L">Long Bao</a>, <a href="/search/eess?searchtype=author&amp;query=Sun%2C+H">Heng Sun</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+D">Diankai Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Gao%2C+S">Si Gao</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+S">Shaoli Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+B">Biao Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+X">Xiaofeng Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Zheng%2C+C">Chengjian Zheng</a>, <a href="/search/eess?searchtype=author&amp;query=Lu%2C+K">Kaidi Lu</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+N">Ning Wang</a> , et al. (29 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.05256v1-abstract-short" style="display: inline;"> Video super-resolution is one of the most popular tasks on mobile devices, being widely used for an automatic improvement of low-bitrate and low-resolution video streams. 
While numerous solutions have been proposed for this problem, they are usually quite computationally demanding, demonstrating low FPS rates and power efficiency on mobile devices. In this Mobile AI challenge, we address this prob&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.05256v1-abstract-full').style.display = 'inline'; document.getElementById('2211.05256v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.05256v1-abstract-full" style="display: none;"> Video super-resolution is one of the most popular tasks on mobile devices, being widely used for an automatic improvement of low-bitrate and low-resolution video streams. While numerous solutions have been proposed for this problem, they are usually quite computationally demanding, demonstrating low FPS rates and power efficiency on mobile devices. In this Mobile AI challenge, we address this problem and propose the participants to design an end-to-end real-time video super-resolution solution for mobile NPUs optimized for low energy consumption. The participants were provided with the REDS training dataset containing video sequences for a 4X video upscaling task. The runtime and power efficiency of all models was evaluated on the powerful MediaTek Dimensity 9000 platform with a dedicated AI processing unit capable of accelerating floating-point and quantized neural networks. All proposed solutions are fully compatible with the above NPU, demonstrating an up to 500 FPS rate and 0.2 [Watt / 30 FPS] power consumption. A detailed description of all models developed in the challenge is provided in this paper. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.05256v1-abstract-full').style.display = 'none'; document.getElementById('2211.05256v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: text overlap with arXiv:2105.08826, arXiv:2105.07809, arXiv:2211.04470, arXiv:2211.03885</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2205.12458">arXiv:2205.12458</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2205.12458">pdf</a>, <a href="https://arxiv.org/format/2205.12458">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TIM.2022.3176901">10.1109/TIM.2022.3176901 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> A Lightweight NMS-free Framework for Real-time Visual Fault Detection System of Freight Trains </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Sun%2C+G">Guodong Sun</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+Y">Yang Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+H">Huilin Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+B">Bo Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Hu%2C+Y">Ye Hu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+Y">Yang Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2205.12458v1-abstract-short" style="display: inline;"> Real-time vision-based system of fault detection (RVBS-FD) for freight trains is an essential part of ensuring railway transportation safety. Most existing vision-based methods still have high computational costs based on convolutional neural networks. The computational cost is mainly reflected in the backbone, neck, and post-processing, i.e., non-maximum suppression (NMS). In this paper, we propo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.12458v1-abstract-full').style.display = 'inline'; document.getElementById('2205.12458v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2205.12458v1-abstract-full" style="display: none;"> Real-time vision-based system of fault detection (RVBS-FD) for freight trains is an essential part of ensuring railway transportation safety. Most existing vision-based methods still have high computational costs based on convolutional neural networks. The computational cost is mainly reflected in the backbone, neck, and post-processing, i.e., non-maximum suppression (NMS). In this paper, we propose a lightweight NMS-free framework to achieve real-time detection and high accuracy simultaneously. First, we use a lightweight backbone for feature extraction and design a fault detection pyramid to process features. 
This fault detection pyramid includes three novel individual modules using attention mechanism, bottleneck, and dilated convolution for feature enhancement and computation reduction. Instead of using NMS, we calculate different loss functions, including classification and location costs in the detection head, to further reduce computation. Experimental results show that our framework achieves over 83 frames per second speed with a smaller model size and higher accuracy than the state-of-the-art detectors. Meanwhile, the hardware resource requirements of our method are low during the training and testing process. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.12458v1-abstract-full').style.display = 'none'; document.getElementById('2205.12458v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 5 figures, accepted by IEEE Transactions on Instrumentation and Measurement</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2203.16090">arXiv:2203.16090</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2203.16090">pdf</a>, <a href="https://arxiv.org/format/2203.16090">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/LCSYS.2022.3186236">10.1109/LCSYS.2022.3186236 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> A simple suboptimal moving horizon estimation scheme with guaranteed robust stability </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Schiller%2C+J+D">Julian D. Schiller</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+B">Boyang Wu</a>, <a href="/search/eess?searchtype=author&amp;query=M%C3%BCller%2C+M+A">Matthias A. Müller</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2203.16090v2-abstract-short" style="display: inline;"> We propose a suboptimal moving horizon estimation (MHE) scheme for a general class of nonlinear systems. To this end, we consider an MHE formulation that optimizes over the trajectory of a robustly stable observer. Assuming that the observer admits a Lyapunov function, we show that this function is an M-step Lyapunov function for suboptimal MHE.
The presented sufficient conditions can be easily ve&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.16090v2-abstract-full').style.display = 'inline'; document.getElementById('2203.16090v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2203.16090v2-abstract-full" style="display: none;"> We propose a suboptimal moving horizon estimation (MHE) scheme for a general class of nonlinear systems. To this end, we consider an MHE formulation that optimizes over the trajectory of a robustly stable observer. Assuming that the observer admits a Lyapunov function, we show that this function is an M-step Lyapunov function for suboptimal MHE. The presented sufficient conditions can be easily verified in practice. We illustrate the practicability of the proposed suboptimal MHE scheme with a standard nonlinear benchmark example. Here, performing a single iteration is sufficient to significantly improve the observer&#39;s estimation results under valid theoretical guarantees. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.16090v2-abstract-full').style.display = 'none'; document.getElementById('2203.16090v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 July, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> IEEE Control Systems Letters, vol. 7, pp. 19-24, 2023 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2110.12224">arXiv:2110.12224</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2110.12224">pdf</a>, <a href="https://arxiv.org/format/2110.12224">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Generalized Polarization Transform: A Novel Coded Transmission Paradigm </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wu%2C+B">Bolin Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Dai%2C+J">Jincheng Dai</a>, <a href="/search/eess?searchtype=author&amp;query=Niu%2C+K">Kai Niu</a>, <a href="/search/eess?searchtype=author&amp;query=Si%2C+Z">Zhongwei Si</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+P">Ping Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+S">Sen Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Yuan%2C+Y">Yifei Yuan</a>, <a href="/search/eess?searchtype=author&amp;query=I%2C+C">Chih-Lin I</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2110.12224v2-abstract-short" style="display: inline;"> For the upcoming 6G wireless networks, a new wave of applications and services will demand ultra-high data rates and reliability. 
To this end, future wireless systems are expected to pave the way for entirely new fundamental air interface technologies to attain a breakthrough in spectrum efficiency (SE). This article discusses a new paradigm, named generalized polarization transform (GPT), to achi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.12224v2-abstract-full').style.display = 'inline'; document.getElementById('2110.12224v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2110.12224v2-abstract-full" style="display: none;"> For the upcoming 6G wireless networks, a new wave of applications and services will demand ultra-high data rates and reliability. To this end, future wireless systems are expected to pave the way for entirely new fundamental air interface technologies to attain a breakthrough in spectrum efficiency (SE). This article discusses a new paradigm, named generalized polarization transform (GPT), to achieve an integrated design of coding, modulation, multi-antenna, multiple access, etc., in a real sense. The GPT enabled air interface develops far-reaching insights that the joint optimization of critical air interface ingredients can achieve remarkable gains on SE compared with the state-of-the-art module-stacking design. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.12224v2-abstract-full').style.display = 'none'; document.getElementById('2110.12224v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 April, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2021. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2109.06715">arXiv:2109.06715</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2109.06715">pdf</a>, <a href="https://arxiv.org/format/2109.06715">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/MNET.001.2100266">10.1109/MNET.001.2100266 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> IGNNITION: Bridging the Gap Between Graph Neural Networks and Networking Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Pujol-Perich%2C+D">David Pujol-Perich</a>, <a href="/search/eess?searchtype=author&amp;query=Su%C3%A1rez-Varela%2C+J">José Suárez-Varela</a>, <a href="/search/eess?searchtype=author&amp;query=Ferriol%2C+M">Miquel Ferriol</a>, <a href="/search/eess?searchtype=author&amp;query=Xiao%2C+S">Shihan Xiao</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+B">Bo Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Cabellos-Aparicio%2C+A">Albert Cabellos-Aparicio</a>, <a href="/search/eess?searchtype=author&amp;query=Barlet-Ros%2C+P">Pere Barlet-Ros</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2109.06715v2-abstract-short" style="display: inline;"> Recent years have seen the vast potential of Graph Neural Networks (GNN) in many fields where data is structured as graphs (e.g., chemistry, recommender systems). In particular, GNNs are becoming increasingly popular in the field of networking, as graphs are intrinsically present at many levels (e.g., topology, routing). The main novelty of GNNs is their ability to generalize to other networks uns&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2109.06715v2-abstract-full').style.display = 'inline'; document.getElementById('2109.06715v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2109.06715v2-abstract-full" style="display: none;"> Recent years have seen the vast potential of Graph Neural Networks (GNN) in many fields where data is structured as graphs (e.g., chemistry, recommender systems). In particular, GNNs are becoming increasingly popular in the field of networking, as graphs are intrinsically present at many levels (e.g., topology, routing). The main novelty of GNNs is their ability to generalize to other networks unseen during training, which is an essential feature for developing practical Machine Learning (ML) solutions for networking.
However, implementing a functional GNN prototype is currently a cumbersome task that requires strong skills in neural network programming. This poses an important barrier to network engineers that often do not have the necessary ML expertise. In this article, we present IGNNITION, a novel open-source framework that enables fast prototyping of GNNs for networking systems. IGNNITION is based on an intuitive high-level abstraction that hides the complexity behind GNNs, while still offering great flexibility to build custom GNN architectures. To showcase the versatility and performance of this framework, we implement two state-of-the-art GNN models applied to different networking use cases. Our results show that the GNN models produced by IGNNITION are equivalent in terms of accuracy and performance to their native implementations in TensorFlow. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2109.06715v2-abstract-full').style.display = 'none'; document.getElementById('2109.06715v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 February, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 September, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> IEEE Network, vol. 35, no. 6, pp. 171-177, 2021 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2107.14406">arXiv:2107.14406</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2107.14406">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1061/JTEPBS.TEENG-7699">10.1061/JTEPBS.TEENG-7699 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Optimal Variable Speed Limit Control Strategy on Freeway Segments under Fog Conditions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhai%2C+B">Ben Zhai</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Y">Yanli Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+W">Wenxuan Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+B">Bing Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2107.14406v1-abstract-short" style="display: inline;"> Fog is a critical external factor that threatens traffic safety on freeways. Variable speed limit (VSL) control can effectively harmonize vehicle speed and improve safety. However, most existing weather-related VSL controllers are limited to adapt to the dynamic traffic environment. 
This study developed an optimal VSL control strategy under fog conditions with full consideration of factors that affe&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2107.14406v1-abstract-full').style.display = 'inline'; document.getElementById('2107.14406v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2107.14406v1-abstract-full" style="display: none;"> Fog is a critical external factor that threatens traffic safety on freeways. Variable speed limit (VSL) control can effectively harmonize vehicle speed and improve safety. However, most existing weather-related VSL controllers are limited to adapt to the dynamic traffic environment. This study developed an optimal VSL control strategy under fog conditions with full consideration of factors that affect traffic safety risks. The crash risk under fog conditions was estimated using a crash risk prediction model based on Bayesian logistic regression. The traffic flow with VSL control was simulated by a modified cell transmission model (MCTM). The optimal factors of VSL control were obtained by solving an optimization problem that coordinated safety and mobility with the help of the genetic algorithm. An example of I-405 in California, USA was designed to simulate and evaluate the effects of the proposed VSL control strategy. The optimal VSL control factors under fog conditions were compared with sunny conditions, and different placements of VSL signs were evaluated. Results showed that the optimal VSL control strategy under fog conditions changed the speed limit more cautiously. The VSL control under fog conditions in this study effectively reduced crash risks without significantly increasing travel time, achieving up to a 37.15% reduction in crash risk with only a 0.48% increase in total travel time. The proposed VSL control strategy is expected to be of great use in the development of VSL systems to enhance freeway safety under fog conditions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2107.14406v1-abstract-full').style.display = 'none'; document.getElementById('2107.14406v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 July, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2021.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2107.14137">arXiv:2107.14137</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2107.14137">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Radio Frequency Interference Management with Free-Space Optical Communication and Photonic Signal Processing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Qi%2C+Y">Yang Qi</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+B">Ben Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2107.14137v1-abstract-short" style="display: inline;"> We design and experimentally demonstrate a radio frequency interference management system with free-space optical communication and photonic signal processing. The system provides real-time interference cancellation over a 6 GHz bandwidth. </span> <span class="abstract-full has-text-grey-dark mathjax" id="2107.14137v1-abstract-full" style="display: none;"> We design and experimentally demonstrate a radio frequency interference management system with free-space optical communication and photonic signal processing. The system provides real-time interference cancellation over a 6 GHz bandwidth. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2107.14137v1-abstract-full').style.display = 'none'; document.getElementById('2107.14137v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 July, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Frontier in Optics 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2107.14134">arXiv:2107.14134</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2107.14134">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Photonic Interference Cancellation with Hybrid Free Space Optical Communication and MIMO Receiver </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Shi%2C+T">Taichu Shi</a>, <a href="/search/eess?searchtype=author&amp;query=Qi%2C+Y">Yang Qi</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+B">Ben Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2107.14134v1-abstract-short" style="display: inline;"> We proposed and demonstrated a hybrid blind source separation system that can switch between multiple-input multiple-output (MIMO) mode and free-space optical communication mode, depending on the situation, to obtain the best conditions for separation.
</span> <span class="abstract-full has-text-grey-dark mathjax" id="2107.14134v1-abstract-full" style="display: none;"> We proposed and demonstrated a hybrid blind source separation system that can switch between multiple-input multiple-output (MIMO) mode and free-space optical communication mode, depending on the situation, to obtain the best conditions for separation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2107.14134v1-abstract-full').style.display = 'none'; document.getElementById('2107.14134v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 July, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Frontier in Optics 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2107.14133">arXiv:2107.14133</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2107.14133">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1364/OE.435282">10.1364/OE.435282 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Sub-Nyquist Sampling with Optical Pulses for Photonic Blind Source Separation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Shi%2C+T">Taichu Shi</a>, <a href="/search/eess?searchtype=author&amp;query=Qi%2C+Y">Yang Qi</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+W">Weipeng Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Prucnal%2C+P">Paul Prucnal</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+B">Ben Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2107.14133v1-abstract-short" style="display: inline;"> We proposed and demonstrated an optical pulse sampling method for photonic blind source separation. It can separate mixed signals of large bandwidth at a small sampling frequency, which can reduce the workload of digital signal processing. </span> <span class="abstract-full has-text-grey-dark mathjax" id="2107.14133v1-abstract-full" style="display: none;"> We proposed and demonstrated an optical pulse sampling method for photonic blind source separation. It can separate mixed signals of large bandwidth at a small sampling frequency, which can reduce the workload of digital signal processing.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2107.14133v1-abstract-full').style.display = 'none'; document.getElementById('2107.14133v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 July, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Frontier in Optics</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2107.10415">arXiv:2107.10415</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2107.10415">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Wideband photonic interference cancellation based on free space optical communication </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Qi%2C+Y">Yang Qi</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+B">Ben Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2107.10415v2-abstract-short" style="display: inline;"> We propose and experimentally demonstrate an interference management system that removes wideband wireless interference by using photonic signal processing and free space optical communication. The receiver separates radio frequency interferences by upconverting the mixed signals to optical frequencies and processing the signals with the photonic circuits. Signals with GHz bandwidth are processed&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2107.10415v2-abstract-full').style.display = 'inline'; document.getElementById('2107.10415v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2107.10415v2-abstract-full" style="display: none;"> We propose and experimentally demonstrate an interference management system that removes wideband wireless interference by using photonic signal processing and free space optical communication. The receiver separates radio frequency interferences by upconverting the mixed signals to optical frequencies and processing the signals with the photonic circuits. Signals with GHz bandwidth are processed and separated in real-time. The reference signals for interference cancellation are transmitted in a free space optical communication link, which provides large bandwidth for multi-band operation and accelerates the mixed signal separation process by reducing the dimensions of the unknown mixing matrix. Experimental results show that the system achieves 30 dB real-time cancellation depth with over 6 GHz bandwidth. Multiple radio frequency bands can be processed at the same time with a single system.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2107.10415v2-abstract-full').style.display = 'none'; document.getElementById('2107.10415v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 November, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 July, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2107.10357">arXiv:2107.10357</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2107.10357">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optics">physics.optics</span> </div> </div> <p class="title is-5 mathjax"> Wideband photonic blind source separation with optical pulse sampling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Shi%2C+T">Taichu Shi</a>, <a href="/search/eess?searchtype=author&amp;query=Qi%2C+Y">Yang Qi</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+W">Weipeng Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Prucnal%2C+P+R">Paul R. Prucnal</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+J">Jie Li</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+B">Ben Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2107.10357v1-abstract-short" style="display: inline;"> We propose and experimentally demonstrate an optical pulse sampling method for photonic blind source separation. The photonic system processes and separates wideband signals based on the statistical information of the mixed signals and thus the sampling frequency can be orders of magnitude lower than the bandwidth of the signals. The ultra-fast optical pulse functions as a tweezer that collects sa&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2107.10357v1-abstract-full').style.display = 'inline'; document.getElementById('2107.10357v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2107.10357v1-abstract-full" style="display: none;"> We propose and experimentally demonstrate an optical pulse sampling method for photonic blind source separation. The photonic system processes and separates wideband signals based on the statistical information of the mixed signals and thus the sampling frequency can be orders of magnitude lower than the bandwidth of the signals. The ultra-fast optical pulse functions as a tweezer that collects samples of the signals at very low sampling rates, and each sample is short enough to maintain the statistical properties of the signals. The low sampling frequency reduces the workloads of the analog to digital conversion and digital signal processing systems. In the meantime, the short pulse sampling maintains the accuracy of the sampled signals, so the statistical properties of the undersampling signals are the same as the statistical properties of the original signals. 
With the optical pulses generated from a mode-locked laser, the optical pulse sampling system is able to process and separate mixed signals with bandwidth over 100GHz and achieves a dynamic range of 30dB. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2107.10357v1-abstract-full').style.display = 'none'; document.getElementById('2107.10357v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 July, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2103.16849">arXiv:2103.16849</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2103.16849">pdf</a>, <a href="https://arxiv.org/format/2103.16849">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> TeCANet: Temporal-Contextual Attention Network for Environment-Aware Speech Dereverberation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wang%2C+H">Helin Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+B">Bo Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+L">Lianwu Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+M">Meng Yu</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+J">Jianwei Yu</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+Y">Yong Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+S">Shi-Xiong Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Weng%2C+C">Chao Weng</a>, <a href="/search/eess?searchtype=author&amp;query=Su%2C+D">Dan Su</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+D">Dong Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2103.16849v2-abstract-short" style="display: inline;"> In this paper, we exploit the effective way to leverage contextual information to improve the speech dereverberation performance in real-world reverberant environments. We propose a temporal-contextual attention approach on the deep neural network (DNN) for environment-aware speech dereverberation, which can adaptively attend to the contextual information. More specifically, a FullBand based Tempo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2103.16849v2-abstract-full').style.display = 'inline'; document.getElementById('2103.16849v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2103.16849v2-abstract-full" style="display: none;"> In this paper, we exploit the effective way to leverage contextual information to improve the speech dereverberation performance in real-world reverberant environments. We propose a temporal-contextual attention approach on the deep neural network (DNN) for environment-aware speech dereverberation, which can adaptively attend to the contextual information. 
More specifically, a FullBand based Temporal Attention approach (FTA) is proposed, which models the correlations between the fullband information of the context frames. In addition, considering the difference between the attenuation of high frequency bands and low frequency bands (high frequency bands attenuate faster than low frequency bands) in the room impulse response (RIR), we also propose a SubBand based Temporal Attention approach (STA). In order to guide the network to be more aware of the reverberant environments, we jointly optimize the dereverberation network and the reverberation time (RT60) estimator in a multi-task manner. Our experimental results indicate that the proposed method outperforms our previously proposed reverberation-time-aware DNN and the learned attention weights are fully physically consistent. We also report a preliminary yet promising dereverberation and recognition experiment on real test data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2103.16849v2-abstract-full').style.display = 'none'; document.getElementById('2103.16849v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 August, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 March, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to Interspeech 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2012.10580">arXiv:2012.10580</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2012.10580">pdf</a>, <a href="https://arxiv.org/format/2012.10580">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Identifying Invariant Texture Violation for Robust Deepfake Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Sun%2C+X">Xinwei Sun</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+B">Botong Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+W">Wei Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2012.10580v1-abstract-short" style="display: inline;"> Existing deepfake detection methods have reported promising in-distribution results, by accessing a published large-scale dataset. However, due to the non-smooth synthesis method, the fake samples in this dataset may expose obvious artifacts (e.g., stark visual contrast, non-smooth boundary), which were heavily relied on by most of the frame-level detection methods above. 
As these artifacts do not c&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2012.10580v1-abstract-full').style.display = 'inline'; document.getElementById('2012.10580v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2012.10580v1-abstract-full" style="display: none;"> Existing deepfake detection methods have reported promising in-distribution results, by accessing a published large-scale dataset. However, due to the non-smooth synthesis method, the fake samples in this dataset may expose obvious artifacts (e.g., stark visual contrast, non-smooth boundary), which were heavily relied on by most of the frame-level detection methods above. As these artifacts do not come up in real media forgeries, the above methods can suffer from a large degradation when applied to fake images that are close to reality. To improve the robustness for high-realism fake data, we propose the Invariant Texture Learning (InTeLe) framework, which only accesses the published dataset with low visual quality. Our method is based on the prior that the microscopic facial texture of the source face is inevitably violated by the texture transferred from the target person, which can hence be regarded as the invariant characterization shared among all fake images. To learn such an invariance for deepfake detection, our InTeLe introduces an auto-encoder framework with different decoders for pristine and fake images, which are further appended with a shallow classifier in order to separate out the obvious artifact-effect. Equipped with such a separation, the extracted embedding by encoder can capture the texture violation in fake images, followed by the classifier for the final pristine/fake prediction. As a theoretical guarantee, we prove the identifiability of such an invariant texture violation, i.e., to be precisely inferred from observational data. The effectiveness and utility of our method are demonstrated by promising generalization ability from low-quality images with obvious artifacts to fake images with high realism. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2012.10580v1-abstract-full').style.display = 'none'; document.getElementById('2012.10580v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 December, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2020. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2011.12985">arXiv:2011.12985</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2011.12985">pdf</a>, <a href="https://arxiv.org/format/2011.12985">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> FBWave: Efficient and Scalable Neural Vocoders for Streaming Text-To-Speech on the Edge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wu%2C+B">Bichen Wu</a>, <a href="/search/eess?searchtype=author&amp;query=He%2C+Q">Qing He</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+P">Peizhao Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Koehler%2C+T">Thilo Koehler</a>, <a href="/search/eess?searchtype=author&amp;query=Keutzer%2C+K">Kurt Keutzer</a>, <a href="/search/eess?searchtype=author&amp;query=Vajda%2C+P">Peter Vajda</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2011.12985v1-abstract-short" style="display: inline;"> Nowadays more and more applications can benefit from edge-based text-to-speech (TTS). However, most existing TTS models are too computationally expensive and are not flexible enough to be deployed on the diverse variety of edge devices with their equally diverse computational capacities. To address this, we propose FBWave, a family of efficient and scalable neural vocoders that can achieve optimal&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2011.12985v1-abstract-full').style.display = 'inline'; document.getElementById('2011.12985v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2011.12985v1-abstract-full" style="display: none;"> Nowadays more and more applications can benefit from edge-based text-to-speech (TTS). However, most existing TTS models are too computationally expensive and are not flexible enough to be deployed on the diverse variety of edge devices with their equally diverse computational capacities. To address this, we propose FBWave, a family of efficient and scalable neural vocoders that can achieve optimal performance-efficiency trade-offs for different edge devices. FBWave is a hybrid flow-based generative model that combines the advantages of autoregressive and non-autoregressive models. It produces high quality audio and supports streaming during inference while remaining highly computationally efficient. Our experiments show that FBWave can achieve similar audio quality to WaveRNN while reducing MACs by 40x. More efficient variants of FBWave can achieve up to 109x fewer MACs while still delivering acceptable audio quality. Audio demos are available at https://bichenwu09.github.io/vocoder_demos. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2011.12985v1-abstract-full').style.display = 'none'; document.getElementById('2011.12985v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2020. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2011.09162">arXiv:2011.09162</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2011.09162">pdf</a>, <a href="https://arxiv.org/format/2011.09162">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> WPD++: An Improved Neural Beamformer for Simultaneous Speech Separation and Dereverberation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Ni%2C+Z">Zhaoheng Ni</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+Y">Yong Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+M">Meng Yu</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+B">Bo Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+S">Shixiong Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+D">Dong Yu</a>, <a href="/search/eess?searchtype=author&amp;query=Mandel%2C+M+I">Michael I Mandel</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2011.09162v1-abstract-short" style="display: inline;"> This paper aims at eliminating the interfering speakers&#39; speech, additive noise, and reverberation from the noisy multi-talker speech mixture that benefits automatic speech recognition (ASR) backend. While the recently proposed Weighted Power minimization Distortionless response (WPD) beamformer can perform separation and dereverberation simultaneously, the noise cancellation component still has t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2011.09162v1-abstract-full').style.display = 'inline'; document.getElementById('2011.09162v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2011.09162v1-abstract-full" style="display: none;"> This paper aims at eliminating the interfering speakers&#39; speech, additive noise, and reverberation from the noisy multi-talker speech mixture that benefits automatic speech recognition (ASR) backend. While the recently proposed Weighted Power minimization Distortionless response (WPD) beamformer can perform separation and dereverberation simultaneously, the noise cancellation component still has the potential to progress. We propose an improved neural WPD beamformer called &#34;WPD++&#34; by an enhanced beamforming module in the conventional WPD and a multi-objective loss function for the joint training. The beamforming module is improved by utilizing the spatio-temporal correlation. 
A multi-objective loss, including the complex spectra domain scale-invariant signal-to-noise ratio (C-Si-SNR) and the magnitude domain mean square error (Mag-MSE), is properly designed to make multiple constraints on the enhanced speech and the desired power of the dry clean signal. Joint training is conducted to optimize the complex-valued mask estimator and the WPD++ beamformer in an end-to-end way. The results show that the proposed WPD++ outperforms several state-of-the-art beamformers on the enhanced speech quality and word error rate (WER) of ASR. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2011.09162v1-abstract-full').style.display = 'none'; document.getElementById('2011.09162v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted by SLT 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2011.07755">arXiv:2011.07755</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2011.07755">pdf</a>, <a href="https://arxiv.org/format/2011.07755">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Audio-visual Multi-channel Integration and Recognition of Overlapped Speech </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Yu%2C+J">Jianwei Yu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+S">Shi-Xiong Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+B">Bo Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+S">Shansong Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Hu%2C+S">Shoukang Hu</a>, <a href="/search/eess?searchtype=author&amp;query=Geng%2C+M">Mengzhe Geng</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+X">Xunying Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Meng%2C+H">Helen Meng</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+D">Dong Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2011.07755v2-abstract-short" style="display: inline;"> Automatic speech recognition (ASR) technologies have been significantly advanced in the past few decades. However, recognition of overlapped speech remains a highly challenging task to date. To this end, multi-channel microphone array data are widely used in current ASR systems. 
Motivated by the invariance of visual modality to acoustic signal corruption and the additional cues they provide to sep&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2011.07755v2-abstract-full').style.display = 'inline'; document.getElementById('2011.07755v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2011.07755v2-abstract-full" style="display: none;"> Automatic speech recognition (ASR) technologies have been significantly advanced in the past few decades. However, recognition of overlapped speech remains a highly challenging task to date. To this end, multi-channel microphone array data are widely used in current ASR systems. Motivated by the invariance of visual modality to acoustic signal corruption and the additional cues they provide to separate the target speaker from the interfering sound sources, this paper presents an audio-visual multi-channel based recognition system for overlapped speech. It benefits from a tight integration between a speech separation front-end and recognition back-end, both of which incorporate additional video input. A series of audio-visual multi-channel speech separation front-end components based on TF masking, Filter&amp;Sum and mask-based MVDR neural channel integration approaches are developed. To reduce the error cost mismatch between the separation and recognition components, the entire system is jointly fine-tuned using a multi-task criterion interpolation of the scale-invariant signal to noise ratio (Si-SNR) with either the connectionist temporal classification (CTC), or lattice-free maximum mutual information (LF-MMI) loss function. Experiments suggest that: the proposed audio-visual multi-channel recognition system outperforms the baseline audio-only multi-channel ASR system by up to 8.04% (31.68% relative) and 22.86% (58.51% relative) absolute WER reduction on overlapped speech constructed using either simulation or replaying of the LRS2 dataset respectively. Consistent performance improvements are also obtained using the proposed audio-visual multi-channel recognition system when using occluded video input with the face region randomly covered up to 60%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2011.07755v2-abstract-full').style.display = 'none'; document.getElementById('2011.07755v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 August, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 November, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2020. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">TASLP 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2010.11607">arXiv:2010.11607</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2010.11607">pdf</a>, <a href="https://arxiv.org/format/2010.11607">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Backdoor Attack against Speaker Verification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhai%2C+T">Tongqing Zhai</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Y">Yiming Li</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+Z">Ziqi Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+B">Baoyuan Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Jiang%2C+Y">Yong Jiang</a>, <a href="/search/eess?searchtype=author&amp;query=Xia%2C+S">Shu-Tao Xia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2010.11607v3-abstract-short" style="display: inline;"> Speaker verification has been widely and successfully adopted in many mission-critical areas for user identification. The training of speaker verification requires a large amount of data, therefore users usually need to adopt third-party data ($e.g.$, data from the Internet or third-party data company). This raises the question of whether adopting untrusted third-party data can pose a security thr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2010.11607v3-abstract-full').style.display = 'inline'; document.getElementById('2010.11607v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2010.11607v3-abstract-full" style="display: none;"> Speaker verification has been widely and successfully adopted in many mission-critical areas for user identification. The training of speaker verification requires a large amount of data, therefore users usually need to adopt third-party data ($e.g.$, data from the Internet or third-party data company). This raises the question of whether adopting untrusted third-party data can pose a security threat. In this paper, we demonstrate that it is possible to inject the hidden backdoor for infecting speaker verification models by poisoning the training data. Specifically, we design a clustering-based attack scheme where poisoned samples from different clusters will contain different triggers ($i.e.$, pre-defined utterances), based on our understanding of verification tasks. The infected models behave normally on benign samples, while attacker-specified unenrolled triggers will successfully pass the verification even if the attacker has no information about the enrolled speaker. 
We also demonstrate that existing backdoor attacks cannot be directly adopted in attacking speaker verification. Our approach not only provides a new perspective for designing novel attacks, but also serves as a strong baseline for improving the robustness of verification methods. The code for reproducing main results is available at \url{https://github.com/zhaitongqing233/Backdoor-attack-against-speaker-verification}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2010.11607v3-abstract-full').style.display = 'none'; document.getElementById('2010.11607v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 February, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 October, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by the ICASSP 2021. The first two authors contributed equally to this work</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2009.00155">arXiv:2009.00155</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2009.00155">pdf</a>, <a href="https://arxiv.org/format/2009.00155">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> A Review of Single-Source Deep Unsupervised Visual Domain Adaptation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+S">Sicheng Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=Yue%2C+X">Xiangyu Yue</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+S">Shanghang Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+B">Bo Li</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+H">Han Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+B">Bichen Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Krishna%2C+R">Ravi Krishna</a>, <a href="/search/eess?searchtype=author&amp;query=Gonzalez%2C+J+E">Joseph E. Gonzalez</a>, <a href="/search/eess?searchtype=author&amp;query=Sangiovanni-Vincentelli%2C+A+L">Alberto L. Sangiovanni-Vincentelli</a>, <a href="/search/eess?searchtype=author&amp;query=Seshia%2C+S+A">Sanjit A. Seshia</a>, <a href="/search/eess?searchtype=author&amp;query=Keutzer%2C+K">Kurt Keutzer</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2009.00155v3-abstract-short" style="display: inline;"> Large-scale labeled training datasets have enabled deep neural networks to excel across a wide range of benchmark vision tasks. However, in many applications, it is prohibitively expensive and time-consuming to obtain large quantities of labeled data. 
To cope with limited labeled training data, many have attempted to directly apply models trained on a large-scale labeled source domain to another s&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2009.00155v3-abstract-full').style.display = 'inline'; document.getElementById('2009.00155v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2009.00155v3-abstract-full" style="display: none;"> Large-scale labeled training datasets have enabled deep neural networks to excel across a wide range of benchmark vision tasks. However, in many applications, it is prohibitively expensive and time-consuming to obtain large quantities of labeled data. To cope with limited labeled training data, many have attempted to directly apply models trained on a large-scale labeled source domain to another sparsely labeled or unlabeled target domain. Unfortunately, direct transfer across domains often performs poorly due to the presence of domain shift or dataset bias. Domain adaptation is a machine learning paradigm that aims to learn a model from a source domain that can perform well on a different (but related) target domain. In this paper, we review the latest single-source deep unsupervised domain adaptation methods focused on visual tasks and discuss new perspectives for future research. We begin with the definitions of different domain adaptation strategies and the descriptions of existing benchmark datasets. We then summarize and compare different categories of single-source unsupervised domain adaptation methods, including discrepancy-based methods, adversarial discriminative methods, adversarial generative methods, and self-supervision-based methods. Finally, we discuss future research directions with challenges and possible solutions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2009.00155v3-abstract-full').style.display = 'none'; document.getElementById('2009.00155v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 September, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 August, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2020. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2008.04768">arXiv:2008.04768</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2008.04768">pdf</a>, <a href="https://arxiv.org/format/2008.04768">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Constrained Active Classification Using Partially Observable Markov Decision Processes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wu%2C+B">Bo Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Lauffer%2C+N">Niklas Lauffer</a>, <a href="/search/eess?searchtype=author&amp;query=Ahmadi%2C+M">Mohamadreza Ahmadi</a>, <a href="/search/eess?searchtype=author&amp;query=Bharadwaj%2C+S">Suda Bharadwaj</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+Z">Zhe Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Topcu%2C+U">Ufuk Topcu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2008.04768v2-abstract-short" style="display: inline;"> In this work, we study the problem of actively classifying the attributes of dynamical systems characterized as a finite set of Markov decision process (MDP) models. We are interested in finding strategies that actively interact with the dynamical system and observe its reactions so that the attribute of interest is classified efficiently with high confidence. We present a decision-theoretic frame&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2008.04768v2-abstract-full').style.display = 'inline'; document.getElementById('2008.04768v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2008.04768v2-abstract-full" style="display: none;"> In this work, we study the problem of actively classifying the attributes of dynamical systems characterized as a finite set of Markov decision process (MDP) models. We are interested in finding strategies that actively interact with the dynamical system and observe its reactions so that the attribute of interest is classified efficiently with high confidence. We present a decision-theoretic framework based on partially observable Markov decision processes (POMDPs). The proposed framework relies on assigning a classification belief (a probability distribution) to the attributes of interest. Given an initial belief, a confidence level over which a classification decision can be made, a cost bound, safe belief sets, and a finite time horizon, we compute POMDP strategies leading to classification decisions. We present three different algorithms to compute such strategies. The first algorithm computes the optimal strategy exactly by value iteration. To overcome the computational complexity of computing the exact solutions, we propose a second algorithm based on adaptive sampling and a third based on a Monte Carlo tree search to approximate the optimal probability of reaching a classification decision. We illustrate the proposed methodology using examples from medical diagnosis, security surveillance, and wildlife classification. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2008.04768v2-abstract-full').style.display = 'none'; document.getElementById('2008.04768v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 January, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 August, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: substantial text overlap with arXiv:1810.00097</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2008.00164">arXiv:2008.00164</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2008.00164">pdf</a>, <a href="https://arxiv.org/format/2008.00164">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Byzantine-Resilient Distributed Hypothesis Testing With Time-Varying Network Topology </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wu%2C+B">Bo Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Carr%2C+S">Steven Carr</a>, <a href="/search/eess?searchtype=author&amp;query=Bharadwaj%2C+S">Suda Bharadwaj</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+Z">Zhe Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Topcu%2C+U">Ufuk Topcu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2008.00164v2-abstract-short" style="display: inline;"> We study the problem of distributed hypothesis testing over a network of mobile agents with limited communication and sensing ranges to infer the true hypothesis collaboratively. In particular, we consider a scenario where there is an unknown subset of compromised agents that may deliberately share altered information to undermine the team objective. We propose two distributed algorithms where eac&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2008.00164v2-abstract-full').style.display = 'inline'; document.getElementById('2008.00164v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2008.00164v2-abstract-full" style="display: none;"> We study the problem of distributed hypothesis testing over a network of mobile agents with limited communication and sensing ranges to infer the true hypothesis collaboratively. In particular, we consider a scenario where there is an unknown subset of compromised agents that may deliberately share altered information to undermine the team objective. We propose two distributed algorithms where each agent maintains and updates two sets of beliefs (i.e., probability distributions over the hypotheses), namely local and actual beliefs (LB and AB respectively for brevity). In both algorithms, at every time step, each agent shares its AB with other agents within its communication range and makes a local observation to update its LB. 
Then both algorithms can use the shared information to update ABs under certain conditions. One requires receiving a certain number of shared ABs at each time instant; the other accumulates shared ABs over time and updates after the number of shared ABs exceeds a prescribed threshold. Otherwise, both algorithms rely on the agent&#39;s current LB and AB to update the new AB. We prove under mild assumptions that the AB for every non-compromised agent converges almost surely to the true hypothesis, without requiring connectivity in the underlying time-varying network topology. Using a simulation of a team of unmanned aerial vehicles aiming to classify adversarial agents among themselves, we illustrate and compare the proposed algorithms. Finally, we show experimentally that the second algorithm consistently outperforms the first algorithm in terms of the speed of convergence. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2008.00164v2-abstract-full').style.display = 'none'; document.getElementById('2008.00164v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 July, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 July, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2020. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2007.15874">arXiv:2007.15874</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2007.15874">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Residual-CycleGAN based Camera Adaptation for Robust Diabetic Retinopathy Screening </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Yang%2C+D">Dalu Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+Y">Yehui Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+T">Tiantian Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+B">Binghong Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+L">Lei Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+Y">Yanwu Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2007.15874v1-abstract-short" style="display: inline;"> There is extensive research focusing on automated diabetic retinopathy (DR) detection from fundus images. However, the accuracy drop is observed when applying these models in real-world DR screening, where the fundus camera brands are different from the ones used to capture the training images. 
How can we train a classification model on labeled fundus images acquired from only one camera b&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2007.15874v1-abstract-full').style.display = 'inline'; document.getElementById('2007.15874v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2007.15874v1-abstract-full" style="display: none;"> There is extensive research focusing on automated diabetic retinopathy (DR) detection from fundus images. However, the accuracy drop is observed when applying these models in real-world DR screening, where the fundus camera brands are different from the ones used to capture the training images. How can we train a classification model on labeled fundus images acquired from only one camera brand, yet still achieve good performance on images taken by other brands of cameras? In this paper, we quantitatively verify the impact of fundus camera brand-related domain shift on the performance of DR classification models, from an experimental perspective. Further, we propose camera-oriented residual-CycleGAN to mitigate the camera brand difference by domain adaptation and achieve increased classification performance on target camera images. Extensive ablation experiments on both the EyePACS dataset and a private dataset show that the camera brand difference can significantly impact the classification performance and prove that our proposed method can effectively improve the model performance on the target domain. We have inferred and labeled the camera brand for each image in the EyePACS dataset and will publicize the camera brand labels for further research on domain adaptation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2007.15874v1-abstract-full').style.display = 'none'; document.getElementById('2007.15874v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 July, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2020. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2007.15114">arXiv:2007.15114</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2007.15114">pdf</a>, <a href="https://arxiv.org/format/2007.15114">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Populations and Evolution">q-bio.PE</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1371/journal.pone.0247660">10.1371/journal.pone.0247660 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Control Strategies for COVID-19 Epidemic with Vaccination, Shield Immunity and Quarantine: A Metric Temporal Logic Approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Xu%2C+Z">Zhe Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+B">Bo Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Topcu%2C+U">Ufuk Topcu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2007.15114v2-abstract-short" style="display: inline;"> Ever since the outbreak of the COVID-19 epidemic, various public health control strategies have been proposed and tested against the coronavirus SARS-CoV-2. We study three specific COVID-19 epidemic control models: the susceptible, exposed, infectious, recovered (SEIR) model with vaccination control; the SEIR model with shield immunity control; and the susceptible, un-quarantined infected, quarant&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2007.15114v2-abstract-full').style.display = 'inline'; document.getElementById('2007.15114v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2007.15114v2-abstract-full" style="display: none;"> Ever since the outbreak of the COVID-19 epidemic, various public health control strategies have been proposed and tested against the coronavirus SARS-CoV-2. We study three specific COVID-19 epidemic control models: the susceptible, exposed, infectious, recovered (SEIR) model with vaccination control; the SEIR model with shield immunity control; and the susceptible, un-quarantined infected, quarantined infected, confirmed infected (SUQC) model with quarantine control. We express the control requirement in metric temporal logic (MTL) formulas (a type of formal specification languages) which can specify the expected control outcomes such as &#34;the deaths from the infection should never exceed one thousand per day within the next three months&#34; or &#34;the population immune from the disease should eventually exceed 200 thousand within the next 100 to 120 days&#34;. We then develop methods for synthesizing control strategies with MTL specifications. 
To the best of our knowledge, this is the first paper to systematically synthesize control strategies based on the COVID-19 epidemic models with formal specifications. We provide simulation results in three different case studies: vaccination control for the COVID-19 epidemic with model parameters estimated from data in Lombardy, Italy; shield immunity control for the COVID-19 epidemic with model parameters estimated from data in Lombardy, Italy; and quarantine control for the COVID-19 epidemic with model parameters estimated from data in Wuhan, China. The results show that the proposed synthesis approach can generate control inputs such that the time-varying numbers of individuals in each category (e.g., infectious, immune) satisfy the MTL specifications. The results also show that early intervention is essential in mitigating the spread of COVID-19, and more control effort is needed for more stringent MTL specifications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2007.15114v2-abstract-full').style.display = 'none'; document.getElementById('2007.15114v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 August, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 July, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2020. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2007.01566">arXiv:2007.01566</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2007.01566">pdf</a>, <a href="https://arxiv.org/format/2007.01566">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Distortionless Multi-Channel Target Speech Enhancement for Overlapped Speech Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wu%2C+B">Bo Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+M">Meng Yu</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+L">Lianwu Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+Y">Yong Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Weng%2C+C">Chao Weng</a>, <a href="/search/eess?searchtype=author&amp;query=Su%2C+D">Dan Su</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+D">Dong Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2007.01566v1-abstract-short" style="display: inline;"> Speech enhancement techniques based on deep learning have brought significant improvement on speech quality and intelligibility. Nevertheless, a large gain in speech quality measured by objective metrics, such as perceptual evaluation of speech quality (PESQ), does not necessarily lead to improved speech recognition performance due to speech distortion in the enhancement stage. 
In this paper, a mu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2007.01566v1-abstract-full').style.display = 'inline'; document.getElementById('2007.01566v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2007.01566v1-abstract-full" style="display: none;"> Speech enhancement techniques based on deep learning have brought significant improvement on speech quality and intelligibility. Nevertheless, a large gain in speech quality measured by objective metrics, such as perceptual evaluation of speech quality (PESQ), does not necessarily lead to improved speech recognition performance due to speech distortion in the enhancement stage. In this paper, a multi-channel dilated convolutional network based frequency domain modeling is presented to enhance target speaker in the far-field, noisy and multi-talker conditions. We study three approaches towards distortionless waveforms for overlapped speech recognition: estimating complex ideal ratio mask with an infinite range, incorporating the fbank loss in a multi-objective learning and finetuning the enhancement model by an acoustic model. Experimental results proved the effectiveness of all three approaches on reducing speech distortions and improving recognition accuracy. Particularly, the jointly tuned enhancement model works very well with other standalone acoustic model on real test data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2007.01566v1-abstract-full').style.display = 'none'; document.getElementById('2007.01566v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 July, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2020. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2006.08357">arXiv:2006.08357</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2006.08357">pdf</a>, <a href="https://arxiv.org/format/2006.08357">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> CoDeNet: Efficient Deployment of Input-Adaptive Object Detection on Embedded FPGAs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Dong%2C+Z">Zhen Dong</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+D">Dequan Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+Q">Qijing Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Gao%2C+Y">Yizhao Gao</a>, <a href="/search/eess?searchtype=author&amp;query=Cai%2C+Y">Yaohui Cai</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+T">Tian Li</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+B">Bichen Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Keutzer%2C+K">Kurt Keutzer</a>, <a href="/search/eess?searchtype=author&amp;query=Wawrzynek%2C+J">John Wawrzynek</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2006.08357v2-abstract-short" style="display: inline;"> Deploying deep learning models on embedded systems has been challenging due to limited computing resources. The majority of existing work focuses on accelerating image classification, while other fundamental vision problems, such as object detection, have not been adequately addressed. Compared with image classification, detection problems are more sensitive to the spatial variance of objects, and&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2006.08357v2-abstract-full').style.display = 'inline'; document.getElementById('2006.08357v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2006.08357v2-abstract-full" style="display: none;"> Deploying deep learning models on embedded systems has been challenging due to limited computing resources. The majority of existing work focuses on accelerating image classification, while other fundamental vision problems, such as object detection, have not been adequately addressed. Compared with image classification, detection problems are more sensitive to the spatial variance of objects, and therefore, require specialized convolutions to aggregate spatial information. To address this need, recent work introduces dynamic deformable convolution to augment regular convolutions. However, this will lead to inefficient memory accesses of inputs with existing hardware. In this work, we harness the flexibility of FPGAs to develop a novel object detection pipeline with deformable convolutions. We show the speed-accuracy tradeoffs for a set of algorithm modifications including irregular-access versus limited-range and fixed-shape. We then Co-Design a Network CoDeNet with the modified deformable convolution and quantize it to 4-bit weights and 8-bit activations. 
With our high-efficiency implementation, our solution reaches 26.9 frames per second with a tiny model size of 0.76 MB while achieving 61.7 AP50 on the standard object detection dataset, Pascal VOC. With our higher accuracy implementation, our model gets to 67.1 AP50 on Pascal VOC with only 2.9 MB of parameters-20.9x smaller but 10% more accurate than Tiny-YOLO. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2006.08357v2-abstract-full').style.display = 'none'; document.getElementById('2006.08357v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 January, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 June, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Github repo: https://github.com/DequanWang/CoDeNet arXiv:2002.08357 is the preliminary version of this paper</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> FPGA 2021 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2006.03677">arXiv:2006.03677</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2006.03677">pdf</a>, <a href="https://arxiv.org/format/2006.03677">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Visual Transformers: Token-based Image Representation and Processing for Computer Vision </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wu%2C+B">Bichen Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+C">Chenfeng Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Dai%2C+X">Xiaoliang Dai</a>, <a href="/search/eess?searchtype=author&amp;query=Wan%2C+A">Alvin Wan</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+P">Peizhao Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Yan%2C+Z">Zhicheng Yan</a>, <a href="/search/eess?searchtype=author&amp;query=Tomizuka%2C+M">Masayoshi Tomizuka</a>, <a href="/search/eess?searchtype=author&amp;query=Gonzalez%2C+J">Joseph Gonzalez</a>, <a href="/search/eess?searchtype=author&amp;query=Keutzer%2C+K">Kurt Keutzer</a>, <a href="/search/eess?searchtype=author&amp;query=Vajda%2C+P">Peter Vajda</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2006.03677v4-abstract-short" style="display: inline;"> Computer vision has achieved remarkable success by (a) representing images as uniformly-arranged pixel arrays and (b) convolving highly-localized features. 
However, convolutions treat all image pixels equally regardless of importance; explicitly model all concepts across all images, regardless of content; and struggle to relate spatially-distant concepts. In this work, we challenge this paradigm b&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2006.03677v4-abstract-full').style.display = 'inline'; document.getElementById('2006.03677v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2006.03677v4-abstract-full" style="display: none;"> Computer vision has achieved remarkable success by (a) representing images as uniformly-arranged pixel arrays and (b) convolving highly-localized features. However, convolutions treat all image pixels equally regardless of importance; explicitly model all concepts across all images, regardless of content; and struggle to relate spatially-distant concepts. In this work, we challenge this paradigm by (a) representing images as semantic visual tokens and (b) running transformers to densely model token relationships. Critically, our Visual Transformer operates in a semantic token space, judiciously attending to different image parts based on context. This is in sharp contrast to pixel-space transformers that require orders-of-magnitude more compute. Using an advanced training recipe, our VTs significantly outperform their convolutional counterparts, raising ResNet accuracy on ImageNet top-1 by 4.6 to 7 points while using fewer FLOPs and parameters. For semantic segmentation on LIP and COCO-stuff, VT-based feature pyramid networks (FPN) achieve 0.35 points higher mIoU while reducing the FPN module&#39;s FLOPs by 6.5x. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2006.03677v4-abstract-full').style.display = 'none'; document.getElementById('2006.03677v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 June, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2020. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2005.10386">arXiv:2005.10386</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2005.10386">pdf</a>, <a href="https://arxiv.org/format/2005.10386">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> End-to-End Multi-Look Keyword Spotting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Yu%2C+M">Meng Yu</a>, <a href="/search/eess?searchtype=author&amp;query=Ji%2C+X">Xuan Ji</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+B">Bo Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Su%2C+D">Dan Su</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+D">Dong Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2005.10386v1-abstract-short" style="display: inline;"> The performance of keyword spotting (KWS), measured in false alarms and false rejects, degrades significantly under the far field and noisy conditions. In this paper, we propose a multi-look neural network modeling for speech enhancement which simultaneously steers to listen to multiple sampled look directions. The multi-look enhancement is then jointly trained with KWS to form an end-to-end KWS m&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2005.10386v1-abstract-full').style.display = 'inline'; document.getElementById('2005.10386v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2005.10386v1-abstract-full" style="display: none;"> The performance of keyword spotting (KWS), measured in false alarms and false rejects, degrades significantly under the far field and noisy conditions. In this paper, we propose a multi-look neural network modeling for speech enhancement which simultaneously steers to listen to multiple sampled look directions. The multi-look enhancement is then jointly trained with KWS to form an end-to-end KWS model which integrates the enhanced signals from multiple look directions and leverages an attention mechanism to dynamically tune the model&#39;s attention to the reliable sources. We demonstrate, on our large noisy and far-field evaluation sets, that the proposed approach significantly improves the KWS performance against the baseline KWS system and a recent beamformer based multi-beam KWS system. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2005.10386v1-abstract-full').style.display = 'none'; document.getElementById('2005.10386v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 May, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2020. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to Interspeech2020</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2005.08571">arXiv:2005.08571</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2005.08571">pdf</a>, <a href="https://arxiv.org/format/2005.08571">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Audio-visual Multi-channel Recognition of Overlapped Speech </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Yu%2C+J">Jianwei Yu</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+B">Bo Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Gu%2C+R">Rongzhi Gu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+S">Shi-Xiong Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+L">Lianwu Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+Y+X+M">Yong Xu. Meng Yu</a>, <a href="/search/eess?searchtype=author&amp;query=Su%2C+D">Dan Su</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+D">Dong Yu</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+X">Xunying Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Meng%2C+H">Helen Meng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2005.08571v2-abstract-short" style="display: inline;"> Automatic speech recognition (ASR) of overlapped speech remains a highly challenging task to date. To this end, multi-channel microphone array data are widely used in state-of-the-art ASR systems. Motivated by the invariance of visual modality to acoustic signal corruption, this paper presents an audio-visual multi-channel overlapped speech recognition system featuring tightly integrated separatio&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2005.08571v2-abstract-full').style.display = 'inline'; document.getElementById('2005.08571v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2005.08571v2-abstract-full" style="display: none;"> Automatic speech recognition (ASR) of overlapped speech remains a highly challenging task to date. To this end, multi-channel microphone array data are widely used in state-of-the-art ASR systems. Motivated by the invariance of visual modality to acoustic signal corruption, this paper presents an audio-visual multi-channel overlapped speech recognition system featuring tightly integrated separation front-end and recognition back-end. A series of audio-visual multi-channel speech separation front-end components based on \textit{TF masking}, \textit{filter\&amp;sum} and \textit{mask-based MVDR} beamforming approaches were developed. 
To reduce the error cost mismatch between the separation and recognition components, they were jointly fine-tuned using the connectionist temporal classification (CTC) loss function, or a multi-task criterion interpolation with scale-invariant signal to noise ratio (Si-SNR) error cost. Experiments suggest that the proposed multi-channel AVSR system outperforms the baseline audio-only ASR system by up to 6.81\% (26.83\% relative) and 22.22\% (56.87\% relative) absolute word error rate (WER) reduction on overlapped speech constructed using either simulation or replaying of the lipreading sentence 2 (LRS2) dataset respectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2005.08571v2-abstract-full').style.display = 'none'; document.getElementById('2005.08571v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 May, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">submitted to Interspeech 2020</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2002.08357">arXiv:2002.08357</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2002.08357">pdf</a>, <a href="https://arxiv.org/format/2002.08357">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Algorithm-hardware Co-design for Deformable Convolution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Huang%2C+Q">Qijing Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+D">Dequan Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Gao%2C+Y">Yizhao Gao</a>, <a href="/search/eess?searchtype=author&amp;query=Cai%2C+Y">Yaohui Cai</a>, <a href="/search/eess?searchtype=author&amp;query=Dong%2C+Z">Zhen Dong</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+B">Bichen Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Keutzer%2C+K">Kurt Keutzer</a>, <a href="/search/eess?searchtype=author&amp;query=Wawrzynek%2C+J">John Wawrzynek</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2002.08357v1-abstract-short" style="display: inline;"> FPGAs provide a flexible and efficient platform to accelerate rapidly-changing algorithms for computer vision. The majority of existing work focuses on accelerating image classification, while other fundamental vision problems, including object detection and instance segmentation, have not been adequately addressed. 
Compared with image classification, detection problems are more sensitive to the s&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2002.08357v1-abstract-full').style.display = 'inline'; document.getElementById('2002.08357v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2002.08357v1-abstract-full" style="display: none;"> FPGAs provide a flexible and efficient platform to accelerate rapidly-changing algorithms for computer vision. The majority of existing work focuses on accelerating image classification, while other fundamental vision problems, including object detection and instance segmentation, have not been adequately addressed. Compared with image classification, detection problems are more sensitive to the spatial variance of objects, and therefore, require specialized convolutions to aggregate spatial information. To address this, recent work proposes dynamic deformable convolution to augment regular convolutions. Regular convolutions process a fixed grid of pixels across all the spatial locations in an image, while dynamic deformable convolutions may access arbitrary pixels in the image and the access pattern is input-dependent and varies per spatial location. These properties lead to inefficient memory accesses of inputs with existing hardware. In this work, we first investigate the overhead of the deformable convolution on embedded FPGA SoCs, and then show the accuracy-latency tradeoffs for a set of algorithm modifications including full versus depthwise, fixed-shape, and limited-range. These modifications benefit the energy efficiency for embedded devices in general as they reduce the compute complexity. We then build an efficient object detection network with modified deformable convolutions and quantize the network using state-of-the-art quantization methods. We implement a unified hardware engine on FPGA to support all the operations in the network. Preliminary experiments show that little accuracy is compromised and speedup can be achieved with our co-design optimization for the deformable convolution. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2002.08357v1-abstract-full').style.display = 'none'; document.getElementById('2002.08357v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2020. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> NeurIPS EMC2 2019 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2001.11539">arXiv:2001.11539</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2001.11539">pdf</a>, <a href="https://arxiv.org/format/2001.11539">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Adversarial Code Learning for Image Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Yuan%2C+J">Jiangbo Yuan</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+B">Bing Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Ding%2C+W">Wanying Ding</a>, <a href="/search/eess?searchtype=author&amp;query=Ping%2C+Q">Qing Ping</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+Z">Zhendong Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2001.11539v1-abstract-short" style="display: inline;"> We introduce the &#34;adversarial code learning&#34; (ACL) module that improves overall image generation performance to several types of deep models. Instead of performing a posterior distribution modeling in the pixel spaces of generators, ACLs aim to jointly learn a latent code with another image encoder/inference net, with a prior noise as its input. We conduct the learning in an adversarial learning p&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2001.11539v1-abstract-full').style.display = 'inline'; document.getElementById('2001.11539v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2001.11539v1-abstract-full" style="display: none;"> We introduce the &#34;adversarial code learning&#34; (ACL) module that improves overall image generation performance to several types of deep models. Instead of performing a posterior distribution modeling in the pixel spaces of generators, ACLs aim to jointly learn a latent code with another image encoder/inference net, with a prior noise as its input. We conduct the learning in an adversarial learning process, which bears a close resemblance to the original GAN but again shifts the learning from image spaces to prior and latent code spaces. ACL is a portable module that brings up much more flexibility and possibilities in generative model designs. First, it allows flexibility to convert non-generative models like Autoencoders and standard classification models to decent generative models. Second, it enhances existing GANs&#39; performance by generating meaningful codes and images from any part of the prior. We have incorporated our ACL module with the aforementioned frameworks and have performed experiments on synthetic, MNIST, CIFAR-10, and CelebA datasets. Our models have achieved significant improvements which demonstrated the generality for image generation tasks. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2001.11539v1-abstract-full').style.display = 'none'; document.getElementById('2001.11539v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 January, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2020. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2001.09227">arXiv:2001.09227</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2001.09227">pdf</a>, <a href="https://arxiv.org/format/2001.09227">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/CDC42340.2020.9304190">10.1109/CDC42340.2020.9304190 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Active Task-Inference-Guided Deep Inverse Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Memarian%2C+F">Farzan Memarian</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+Z">Zhe Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+B">Bo Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Wen%2C+M">Min Wen</a>, <a href="/search/eess?searchtype=author&amp;query=Topcu%2C+U">Ufuk Topcu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2001.09227v3-abstract-short" style="display: inline;"> We consider the problem of reward learning for temporally extended tasks. For reward learning, inverse reinforcement learning (IRL) is a widely used paradigm. Given a Markov decision process (MDP) and a set of demonstrations for a task, IRL learns a reward function that assigns a real-valued reward to each state of the MDP. However, for temporally extended tasks, the underlying reward function may&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2001.09227v3-abstract-full').style.display = 'inline'; document.getElementById('2001.09227v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2001.09227v3-abstract-full" style="display: none;"> We consider the problem of reward learning for temporally extended tasks. For reward learning, inverse reinforcement learning (IRL) is a widely used paradigm. Given a Markov decision process (MDP) and a set of demonstrations for a task, IRL learns a reward function that assigns a real-valued reward to each state of the MDP. However, for temporally extended tasks, the underlying reward function may not be expressible as a function of individual states of the MDP. Instead, the history of visited states may need to be considered to determine the reward at the current state. 
To address this issue, we propose an iterative algorithm to learn a reward function for temporally extended tasks. At each iteration, the algorithm alternates between two modules, a task inference module that infers the underlying task structure and a reward learning module that uses the inferred task structure to learn a reward function. The task inference module produces a series of queries, where each query is a sequence of subgoals. The demonstrator provides a binary response to each query by attempting to execute it in the environment and observing the environment&#39;s feedback. After the queries are answered, the task inference module returns an automaton encoding its current hypothesis of the task structure. The reward learning module augments the state space of the MDP with the states of the automaton. The module then proceeds to learn a reward function over the augmented state space using a novel deep maximum entropy IRL algorithm. This iterative process continues until it learns a reward function with satisfactory performance. The experiments show that the proposed algorithm significantly outperforms several IRL baselines on temporally extended tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2001.09227v3-abstract-full').style.display = 'none'; document.getElementById('2001.09227v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 September, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 January, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To be published in IEEE Conference on Decision and Control (CDC) 2020</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2001.05685">arXiv:2001.05685</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2001.05685">pdf</a>, <a href="https://arxiv.org/format/2001.05685">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> SqueezeWave: Extremely Lightweight Vocoders for On-device Speech Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhai%2C+B">Bohan Zhai</a>, <a href="/search/eess?searchtype=author&amp;query=Gao%2C+T">Tianren Gao</a>, <a href="/search/eess?searchtype=author&amp;query=Xue%2C+F">Flora Xue</a>, <a href="/search/eess?searchtype=author&amp;query=Rothchild%2C+D">Daniel Rothchild</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+B">Bichen Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Gonzalez%2C+J+E">Joseph E. 
Gonzalez</a>, <a href="/search/eess?searchtype=author&amp;query=Keutzer%2C+K">Kurt Keutzer</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2001.05685v1-abstract-short" style="display: inline;"> Automatic speech synthesis is a challenging task that is becoming increasingly important as edge devices begin to interact with users through speech. Typical text-to-speech pipelines include a vocoder, which translates intermediate audio representations into an audio waveform. Most existing vocoders are difficult to parallelize since each generated sample is conditioned on previous samples. WaveGl&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2001.05685v1-abstract-full').style.display = 'inline'; document.getElementById('2001.05685v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2001.05685v1-abstract-full" style="display: none;"> Automatic speech synthesis is a challenging task that is becoming increasingly important as edge devices begin to interact with users through speech. Typical text-to-speech pipelines include a vocoder, which translates intermediate audio representations into an audio waveform. Most existing vocoders are difficult to parallelize since each generated sample is conditioned on previous samples. WaveGlow is a flow-based feed-forward alternative to these auto-regressive models (Prenger et al., 2019). However, while WaveGlow can be easily parallelized, the model is too expensive for real-time speech synthesis on the edge. This paper presents SqueezeWave, a family of lightweight vocoders based on WaveGlow that can generate audio of similar quality to WaveGlow with 61x - 214x fewer MACs. Code, trained models, and generated audio are publicly available at https://github.com/tianrengao/SqueezeWave. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2001.05685v1-abstract-full').style.display = 'none'; document.getElementById('2001.05685v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 January, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2020. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2001.01656">arXiv:2001.01656</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2001.01656">pdf</a>, <a href="https://arxiv.org/format/2001.01656">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Audio-visual Recognition of Overlapped speech for the LRS2 dataset </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Yu%2C+J">Jianwei Yu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+S">Shi-Xiong Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+J">Jian Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Ghorbani%2C+S">Shahram Ghorbani</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+B">Bo Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Kang%2C+S">Shiyin Kang</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+S">Shansong Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+X">Xunying Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Meng%2C+H">Helen Meng</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+D">Dong Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2001.01656v1-abstract-short" style="display: inline;"> Automatic recognition of overlapped speech remains a highly challenging task to date. Motivated by the bimodal nature of human speech perception, this paper investigates the use of audio-visual technologies for overlapped speech recognition. Three issues associated with the construction of audio-visual speech recognition (AVSR) systems are addressed. First, the basic architecture designs i.e. end-&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2001.01656v1-abstract-full').style.display = 'inline'; document.getElementById('2001.01656v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2001.01656v1-abstract-full" style="display: none;"> Automatic recognition of overlapped speech remains a highly challenging task to date. Motivated by the bimodal nature of human speech perception, this paper investigates the use of audio-visual technologies for overlapped speech recognition. Three issues associated with the construction of audio-visual speech recognition (AVSR) systems are addressed. First, the basic architecture designs i.e. end-to-end and hybrid of AVSR systems are investigated. Second, purposefully designed modality fusion gates are used to robustly integrate the audio and visual features. Third, in contrast to a traditional pipelined architecture containing explicit speech separation and recognition components, a streamlined and integrated AVSR system optimized consistently using the lattice-free MMI (LF-MMI) discriminative criterion is also proposed. The proposed LF-MMI time-delay neural network (TDNN) system establishes the state-of-the-art for the LRS2 dataset. 
Experiments on overlapped speech simulated from the LRS2 dataset suggest the proposed AVSR system outperformed the audio only baseline LF-MMI DNN system by up to 29.98\% absolute in word error rate (WER) reduction, and produced recognition performance comparable to a more complex pipelined system. Consistent performance improvements of 4.89\% absolute in WER reduction over the baseline AVSR system using feature fusion are also obtained. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2001.01656v1-abstract-full').style.display = 'none'; document.getElementById('2001.01656v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 January, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 5 figures, submitted to icassp2019</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2001.00835">arXiv:2001.00835</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2001.00835">pdf</a>, <a href="https://arxiv.org/format/2001.00835">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Policy Synthesis for Switched Linear Systems with Markov Decision Process Switching </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wu%2C+B">Bo Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Cubuktepe%2C+M">Murat Cubuktepe</a>, <a href="/search/eess?searchtype=author&amp;query=Djeumou%2C+F">Franck Djeumou</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+Z">Zhe Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Topcu%2C+U">Ufuk Topcu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2001.00835v1-abstract-short" style="display: inline;"> We study the synthesis of mode switching protocols for a class of discrete-time switched linear systems in which the mode jumps are governed by Markov decision processes (MDPs). We call such systems MDP-JLS for brevity. Each state of the MDP corresponds to a mode in the switched system. The probabilistic state transitions in the MDP represent the mode transitions. We focus on finding a policy that&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2001.00835v1-abstract-full').style.display = 'inline'; document.getElementById('2001.00835v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2001.00835v1-abstract-full" style="display: none;"> We study the synthesis of mode switching protocols for a class of discrete-time switched linear systems in which the mode jumps are governed by Markov decision processes (MDPs). We call such systems MDP-JLS for brevity. Each state of the MDP corresponds to a mode in the switched system. The probabilistic state transitions in the MDP represent the mode transitions. 
We focus on finding a policy that selects the switching actions at each mode such that the switched system that follows these actions is guaranteed to be stable. Given a policy in the MDP, the considered MDP-JLS reduces to a Markov jump linear system (MJLS). {We consider both mean-square stability and stability with probability one. For mean-square stability, we leverage existing stability conditions for MJLSs and propose efficient semidefinite programming formulations to find a stabilizing policy in the MDP. For stability with probability one, we derive new sufficient conditions and compute a stabilizing policy using linear programming. We also extend the policy synthesis results to MDP-JLS with uncertain mode transition probabilities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2001.00835v1-abstract-full').style.display = 'none'; document.getElementById('2001.00835v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 January, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: text overlap with arXiv:1904.11456</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1912.06986">arXiv:1912.06986</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1912.06986">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TNANO.2020.2999751">10.1109/TNANO.2020.2999751 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Erase-hidden and Drivability-improved Magnetic Non-Volatile Flip-Flops with NAND-SPIN Devices </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Z">Ziyi Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Z">Zhaohao Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+Y">Yansong Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+B">Bi Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+W">Weisheng Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1912.06986v2-abstract-short" style="display: inline;"> Non-volatile flip-flops (NVFFs) using power gating techniques promise to overcome the soaring leakage power consumption issue with the scaling of CMOS technology. Magnetic tunnel junction (MTJ) is a good candidate for constructing the NVFF thanks to its low power, high speed, good CMOS compatibility, etc. 
In this paper, we propose a novel magnetic NVFF based on an emerging memory device called NAN&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1912.06986v2-abstract-full').style.display = 'inline'; document.getElementById('1912.06986v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1912.06986v2-abstract-full" style="display: none;"> Non-volatile flip-flops (NVFFs) using power gating techniques promise to overcome the soaring leakage power consumption issue with the scaling of CMOS technology. Magnetic tunnel junction (MTJ) is a good candidate for constructing the NVFF thanks to its low power, high speed, good CMOS compatibility, etc. In this paper, we propose a novel magnetic NVFF based on an emerging memory device called NAND-SPIN. The data writing of NAND-SPIN is achieved by successively applying two unidirectional currents, which respectively generate the spin orbit torque (SOT) and spin transfer torque (STT) for erase and programming operations. This characteristic allows us to design an erase-hidden and drivability-improved magnetic NVFF. Furthermore, more design flexibility could be obtained since the backup operation of the proposed NVFF is not limited by the inherent slave latch. Simulation results show that our proposed NVFF achieves performance improvement in terms of power, delay and area, compared with conventional slave-latch-driven SOT-NVFF designs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1912.06986v2-abstract-full').style.display = 'none'; document.getElementById('1912.06986v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 June, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 December, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2019. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This article has been accepted in a future issue of IEEE Transactions on Nanotechnology: Regular Papers</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1910.13825">arXiv:1910.13825</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1910.13825">pdf</a>, <a href="https://arxiv.org/ps/1910.13825">ps</a>, <a href="https://arxiv.org/format/1910.13825">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Overlapped speech recognition from a jointly learned multi-channel neural speech extraction and representation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wu%2C+B">Bo Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+M">Meng Yu</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+L">Lianwu Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Weng%2C+C">Chao Weng</a>, <a href="/search/eess?searchtype=author&amp;query=Su%2C+D">Dan Su</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+D">Dong Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1910.13825v1-abstract-short" style="display: inline;"> We propose an end-to-end joint optimization framework of a multi-channel neural speech extraction and deep acoustic model without mel-filterbank (FBANK) extraction for overlapped speech recognition. First, based on a multi-channel convolutional TasNet with STFT kernel, we unify the multi-channel target speech enhancement front-end network and a convolutional, long short-term memory and fully conne&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1910.13825v1-abstract-full').style.display = 'inline'; document.getElementById('1910.13825v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1910.13825v1-abstract-full" style="display: none;"> We propose an end-to-end joint optimization framework of a multi-channel neural speech extraction and deep acoustic model without mel-filterbank (FBANK) extraction for overlapped speech recognition. First, based on a multi-channel convolutional TasNet with STFT kernel, we unify the multi-channel target speech enhancement front-end network and a convolutional, long short-term memory and fully connected deep neural network (CLDNN) based acoustic model (AM) with the FBANK extraction layer to build a hybrid neural network, which is thus jointly updated only by the recognition loss. The proposed framework achieves 28% word error rate reduction (WERR) over a separately optimized system on AISHELL-1 and shows consistent robustness to signal to interference ratio (SIR) and angle difference between overlapping speakers. Next, a further exploration shows that the speech recognition is improved with a simplified structure by replacing the FBANK extraction layer in the joint model with a learnable feature projection. 
Finally, we also perform the objective measurement of speech quality on the reconstructed waveform from the enhancement network in the joint model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1910.13825v1-abstract-full').style.display = 'none'; document.getElementById('1910.13825v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2019. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Wu%2C+B&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Wu%2C+B&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Wu%2C+B&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a 
href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>
