CINXE.COM

Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 115 results for author: <span class="mathjax">Pantic, M</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Pantic%2C+M">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Pantic, M"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Pantic%2C+M&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Pantic, M"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Pantic%2C+M&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Pantic%2C+M&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Pantic%2C+M&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Pantic%2C+M&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02256">arXiv:2411.02256</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.02256">pdf</a>, <a href="https://arxiv.org/format/2411.02256">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Unified Speech Recognition: A Single Model for Auditory, Visual, and Audiovisual Inputs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Haliassos%2C+A">Alexandros Haliassos</a>, <a href="/search/cs?searchtype=author&amp;query=Mira%2C+R">Rodrigo Mira</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Honglie Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Landgraf%2C+Z">Zoe Landgraf</a>, <a href="/search/cs?searchtype=author&amp;query=Petridis%2C+S">Stavros Petridis</a>, <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Maja Pantic</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02256v1-abstract-short" style="display: inline;"> Research in auditory, visual, and audiovisual speech recognition (ASR, VSR, and AVSR, respectively) has traditionally been conducted independently. Even recent self-supervised studies addressing two or all three tasks simultaneously tend to yield separate models, leading to disjoint inference pipelines with increased memory requirements and redundancies. This paper proposes unified training strate&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02256v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02256v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02256v1-abstract-full" style="display: none;"> Research in auditory, visual, and audiovisual speech recognition (ASR, VSR, and AVSR, respectively) has traditionally been conducted independently. Even recent self-supervised studies addressing two or all three tasks simultaneously tend to yield separate models, leading to disjoint inference pipelines with increased memory requirements and redundancies. This paper proposes unified training strategies for these systems. We demonstrate that training a single model for all three tasks enhances VSR and AVSR performance, overcoming typical optimisation challenges when training from scratch. Moreover, we introduce a greedy pseudo-labelling approach to more effectively leverage unlabelled samples, addressing shortcomings in related self-supervised methods. Finally, we develop a self-supervised pre-training method within our framework, proving its effectiveness alongside our semi-supervised approach. Despite using a single model for all tasks, our unified approach achieves state-of-the-art performance compared to recent methods on LRS3 and LRS2 for ASR, VSR, and AVSR, as well as on the newly released WildVSR dataset. Code and models are available at https://github.com/ahaliassos/usr. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02256v1-abstract-full').style.display = 'none'; document.getElementById('2411.02256v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024. Code: https://github.com/ahaliassos/usr</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.07771">arXiv:2410.07771</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.07771">pdf</a>, <a href="https://arxiv.org/format/2410.07771">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Full-Rank No More: Low-Rank Weight Training for Modern Speech Recognition Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Fernandez-Lopez%2C+A">Adriana Fernandez-Lopez</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+S">Shiwei Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Yin%2C+L">Lu Yin</a>, <a href="/search/cs?searchtype=author&amp;query=Petridis%2C+S">Stavros Petridis</a>, <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Maja Pantic</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.07771v1-abstract-short" style="display: inline;"> This paper investigates the under-explored area of low-rank weight training for large-scale Conformer-based speech recognition models from scratch. Our study demonstrates the viability of this training paradigm for such models, yielding several notable findings. Firstly, we discover that applying a low-rank structure exclusively to the attention modules can unexpectedly enhance performance, even w&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07771v1-abstract-full').style.display = 'inline'; document.getElementById('2410.07771v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.07771v1-abstract-full" style="display: none;"> This paper investigates the under-explored area of low-rank weight training for large-scale Conformer-based speech recognition models from scratch. Our study demonstrates the viability of this training paradigm for such models, yielding several notable findings. Firstly, we discover that applying a low-rank structure exclusively to the attention modules can unexpectedly enhance performance, even with a significant rank reduction of 12%. In contrast, feed-forward layers present greater challenges, as they begin to exhibit performance degradation with a moderate 50% rank reduction. Furthermore, we find that both initialization and layer-wise rank assignment play critical roles in successful low-rank training. Specifically, employing SVD initialization and linear layer-wise rank mapping significantly boosts the efficacy of low-rank weight training. Building on these insights, we introduce the Low-Rank Speech Model from Scratch (LR-SMS), an approach that achieves performance parity with full-rank training while delivering substantial reductions in parameters count (by at least 2x), and training time speedups (by 1.3x for ASR and 1.15x for AVSR). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07771v1-abstract-full').style.display = 'none'; document.getElementById('2410.07771v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to ICASSP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.00736">arXiv:2410.00736</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.00736">pdf</a>, <a href="https://arxiv.org/format/2410.00736">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Radar Meets Vision: Robustifying Monocular Metric Depth Prediction for Mobile Robotics </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Job%2C+M">Marco Job</a>, <a href="/search/cs?searchtype=author&amp;query=Stastny%2C+T">Thomas Stastny</a>, <a href="/search/cs?searchtype=author&amp;query=Kazik%2C+T">Tim Kazik</a>, <a href="/search/cs?searchtype=author&amp;query=Siegwart%2C+R">Roland Siegwart</a>, <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Michael Pantic</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.00736v1-abstract-short" style="display: inline;"> Mobile robots require accurate and robust depth measurements to understand and interact with the environment. While existing sensing modalities address this problem to some extent, recent research on monocular depth estimation has leveraged the information richness, yet low cost and simplicity of monocular cameras. These works have shown significant generalization capabilities, mainly in automotiv&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.00736v1-abstract-full').style.display = 'inline'; document.getElementById('2410.00736v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.00736v1-abstract-full" style="display: none;"> Mobile robots require accurate and robust depth measurements to understand and interact with the environment. While existing sensing modalities address this problem to some extent, recent research on monocular depth estimation has leveraged the information richness, yet low cost and simplicity of monocular cameras. These works have shown significant generalization capabilities, mainly in automotive and indoor settings. However, robots often operate in environments with limited scale cues, self-similar appearances, and low texture. In this work, we encode measurements from a low-cost mmWave radar into the input space of a state-of-the-art monocular depth estimation model. Despite the radar&#39;s extreme point cloud sparsity, our method demonstrates generalization and robustness across industrial and outdoor experiments. Our approach reduces the absolute relative error of depth predictions by 9-64% across a range of unseen, real-world validation datasets. Importantly, we maintain consistency of all performance metrics across all experiments and scene depths where current vision-only approaches fail. We further address the present deficit of training data in mobile robotics environments by introducing a novel methodology for synthesizing rendered, realistic learning datasets based on photogrammetric data that simulate the radar sensor observations for training. Our code, datasets, and pre-trained networks are made available at https://github.com/ethz-asl/radarmeetsvision. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.00736v1-abstract-full').style.display = 'none'; document.getElementById('2410.00736v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to ICRA 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.12319">arXiv:2409.12319</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.12319">pdf</a>, <a href="https://arxiv.org/format/2409.12319">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Large Language Models Are Strong Audio-Visual Speech Recognition Learners </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Cappellazzo%2C+U">Umberto Cappellazzo</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+M">Minsu Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Honglie Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+P">Pingchuan Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Petridis%2C+S">Stavros Petridis</a>, <a href="/search/cs?searchtype=author&amp;query=Falavigna%2C+D">Daniele Falavigna</a>, <a href="/search/cs?searchtype=author&amp;query=Brutti%2C+A">Alessio Brutti</a>, <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Maja Pantic</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.12319v1-abstract-short" style="display: inline;"> Multimodal large language models (MLLMs) have recently become a focal point of research due to their formidable multimodal understanding capabilities. For example, in the audio and speech domains, an LLM can be equipped with (automatic) speech recognition (ASR) abilities by just concatenating the audio tokens, computed with an audio encoder, and the text tokens to achieve state-of-the-art results.&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12319v1-abstract-full').style.display = 'inline'; document.getElementById('2409.12319v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.12319v1-abstract-full" style="display: none;"> Multimodal large language models (MLLMs) have recently become a focal point of research due to their formidable multimodal understanding capabilities. For example, in the audio and speech domains, an LLM can be equipped with (automatic) speech recognition (ASR) abilities by just concatenating the audio tokens, computed with an audio encoder, and the text tokens to achieve state-of-the-art results. On the contrary, tasks like visual and audio-visual speech recognition (VSR/AVSR), which also exploit noise-invariant lip movement information, have received little or no attention. To bridge this gap, we propose Llama-AVSR, a new MLLM with strong audio-visual speech recognition capabilities. It leverages pre-trained audio and video encoders to produce modality-specific tokens which, together with the text tokens, are processed by a pre-trained LLM (e.g., Llama3.1-8B) to yield the resulting response in an auto-regressive fashion. Llama-AVSR requires a small number of trainable parameters as only modality-specific projectors and LoRA modules are trained whereas the multi-modal encoders and LLM are kept frozen. We evaluate our proposed approach on LRS3, the largest public AVSR benchmark, and we achieve new state-of-the-art results for the tasks of ASR and AVSR with a WER of 0.81% and 0.77%, respectively. To bolster our results, we investigate the key factors that underpin the effectiveness of Llama-AVSR: the choice of the pre-trained encoders and LLM, the efficient integration of LoRA modules, and the optimal performance-efficiency trade-off obtained via modality-aware compression rates. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12319v1-abstract-full').style.display = 'none'; document.getElementById('2409.12319v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The code will be made available at this link: https://github.com/umbertocappellazzo/AVSR-LLMs</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.13530">arXiv:2407.13530</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.13530">pdf</a>, <a href="https://arxiv.org/format/2407.13530">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Pushing the Limits of Reactive Planning: Learning to Escape Local Minima </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Meijer%2C+I">Isar Meijer</a>, <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Michael Pantic</a>, <a href="/search/cs?searchtype=author&amp;query=Oleynikova%2C+H">Helen Oleynikova</a>, <a href="/search/cs?searchtype=author&amp;query=Siegwart%2C+R">Roland Siegwart</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.13530v1-abstract-short" style="display: inline;"> When does a robot planner need a map? Reactive methods that use only the robot&#39;s current sensor data and local information are fast and flexible, but prone to getting stuck in local minima. Is there a middle-ground between fully reactive methods and map-based path planners? In this paper, we investigate feed forward and recurrent networks to augment a purely reactive sensor-based planner, which sh&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.13530v1-abstract-full').style.display = 'inline'; document.getElementById('2407.13530v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.13530v1-abstract-full" style="display: none;"> When does a robot planner need a map? Reactive methods that use only the robot&#39;s current sensor data and local information are fast and flexible, but prone to getting stuck in local minima. Is there a middle-ground between fully reactive methods and map-based path planners? In this paper, we investigate feed forward and recurrent networks to augment a purely reactive sensor-based planner, which should give the robot geometric intuition about how to escape local minima. We train on a large number of extremely cluttered worlds auto-generated from primitive shapes, and show that our system zero-shot transfers to real 3D man-made environments, and can handle up to 30% sensor noise without degeneration of performance. We also offer a discussion of what role network memory plays in our final system, and what insights can be drawn about the nature of reactive vs. map-based navigation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.13530v1-abstract-full').style.display = 'none'; document.getElementById('2407.13530v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.07825">arXiv:2407.07825</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.07825">pdf</a>, <a href="https://arxiv.org/format/2407.07825">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> RT-LA-VocE: Real-Time Low-SNR Audio-Visual Speech Enhancement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Honglie Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Mira%2C+R">Rodrigo Mira</a>, <a href="/search/cs?searchtype=author&amp;query=Petridis%2C+S">Stavros Petridis</a>, <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Maja Pantic</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.07825v1-abstract-short" style="display: inline;"> In this paper, we aim to generate clean speech frame by frame from a live video stream and a noisy audio stream without relying on future inputs. To this end, we propose RT-LA-VocE, which completely re-designs every component of LA-VocE, a state-of-the-art non-causal audio-visual speech enhancement model, to perform causal real-time inference with a 40ms input frame. We do so by devising new visua&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.07825v1-abstract-full').style.display = 'inline'; document.getElementById('2407.07825v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.07825v1-abstract-full" style="display: none;"> In this paper, we aim to generate clean speech frame by frame from a live video stream and a noisy audio stream without relying on future inputs. To this end, we propose RT-LA-VocE, which completely re-designs every component of LA-VocE, a state-of-the-art non-causal audio-visual speech enhancement model, to perform causal real-time inference with a 40ms input frame. We do so by devising new visual and audio encoders that rely solely on past frames, replacing the Transformer encoder with the Emformer, and designing a new causal neural vocoder C-HiFi-GAN. On the popular AVSpeech dataset, we show that our algorithm achieves state-of-the-art results in all real-time scenarios. More importantly, each component is carefully tuned to minimize the algorithm latency to the theoretical minimum (40ms) while maintaining a low end-to-end processing latency of 28.15ms per frame, enabling real-time frame-by-frame enhancement with minimal delay. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.07825v1-abstract-full').style.display = 'none'; document.getElementById('2407.07825v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Interspeech 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.18373">arXiv:2406.18373</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.18373">pdf</a>, <a href="https://arxiv.org/format/2406.18373">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Dynamic Data Pruning for Automatic Speech Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+Q">Qiao Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+P">Pingchuan Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Fernandez-Lopez%2C+A">Adriana Fernandez-Lopez</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+B">Boqian Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Yin%2C+L">Lu Yin</a>, <a href="/search/cs?searchtype=author&amp;query=Petridis%2C+S">Stavros Petridis</a>, <a href="/search/cs?searchtype=author&amp;query=Pechenizkiy%2C+M">Mykola Pechenizkiy</a>, <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Maja Pantic</a>, <a href="/search/cs?searchtype=author&amp;query=Mocanu%2C+D+C">Decebal Constantin Mocanu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+S">Shiwei Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.18373v1-abstract-short" style="display: inline;"> The recent success of Automatic Speech Recognition (ASR) is largely attributed to the ever-growing amount of training data. However, this trend has made model training prohibitively costly and imposed computational demands. While data pruning has been proposed to mitigate this issue by identifying a small subset of relevant data, its application in ASR has been barely explored, and existing works&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.18373v1-abstract-full').style.display = 'inline'; document.getElementById('2406.18373v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.18373v1-abstract-full" style="display: none;"> The recent success of Automatic Speech Recognition (ASR) is largely attributed to the ever-growing amount of training data. However, this trend has made model training prohibitively costly and imposed computational demands. While data pruning has been proposed to mitigate this issue by identifying a small subset of relevant data, its application in ASR has been barely explored, and existing works often entail significant overhead to achieve meaningful results. To fill this gap, this paper presents the first investigation of dynamic data pruning for ASR, finding that we can reach the full-data performance by dynamically selecting 70% of data. Furthermore, we introduce Dynamic Data Pruning for ASR (DDP-ASR), which offers several fine-grained pruning granularities specifically tailored for speech-related datasets, going beyond the conventional pruning of entire time sequences. Our intensive experiments show that DDP-ASR can save up to 1.6x training time with negligible performance loss. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.18373v1-abstract-full').style.display = 'none'; document.getElementById('2406.18373v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to Interspeech 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.17614">arXiv:2406.17614</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.17614">pdf</a>, <a href="https://arxiv.org/format/2406.17614">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> MSRS: Training Multimodal Speech Recognition Models from Scratch with Sparse Mask Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Fernandez-Lopez%2C+A">Adriana Fernandez-Lopez</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Honglie Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+P">Pingchuan Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Yin%2C+L">Lu Yin</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+Q">Qiao Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Petridis%2C+S">Stavros Petridis</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+S">Shiwei Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Maja Pantic</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.17614v1-abstract-short" style="display: inline;"> Pre-trained models have been a foundational approach in speech recognition, albeit with associated additional costs. In this study, we propose a regularization technique that facilitates the training of visual and audio-visual speech recognition models (VSR and AVSR) from scratch. This approach, abbreviated as \textbf{MSRS} (Multimodal Speech Recognition from Scratch), introduces a sparse regulari&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.17614v1-abstract-full').style.display = 'inline'; document.getElementById('2406.17614v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.17614v1-abstract-full" style="display: none;"> Pre-trained models have been a foundational approach in speech recognition, albeit with associated additional costs. In this study, we propose a regularization technique that facilitates the training of visual and audio-visual speech recognition models (VSR and AVSR) from scratch. This approach, abbreviated as \textbf{MSRS} (Multimodal Speech Recognition from Scratch), introduces a sparse regularization that rapidly learns sparse structures within the dense model at the very beginning of training, which receives healthier gradient flow than the dense equivalent. Once the sparse mask stabilizes, our method allows transitioning to a dense model or keeping a sparse model by updating non-zero values. MSRS achieves competitive results in VSR and AVSR with 21.1% and 0.9% WER on the LRS3 benchmark, while reducing training time by at least 2x. We explore other sparse approaches and show that only MSRS enables training from scratch by implicitly masking the weights affected by vanishing gradients. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.17614v1-abstract-full').style.display = 'none'; document.getElementById('2406.17614v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at Interspeech 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.17333">arXiv:2406.17333</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.17333">pdf</a>, <a href="https://arxiv.org/format/2406.17333">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Task Adaptation in Industrial Human-Robot Interaction: Leveraging Riemannian Motion Policies </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Allenspach%2C+M">Mike Allenspach</a>, <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Michael Pantic</a>, <a href="/search/cs?searchtype=author&amp;query=Girod%2C+R">Rik Girod</a>, <a href="/search/cs?searchtype=author&amp;query=Ott%2C+L">Lionel Ott</a>, <a href="/search/cs?searchtype=author&amp;query=Siegwart%2C+R">Roland Siegwart</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.17333v1-abstract-short" style="display: inline;"> In real-world industrial environments, modern robots often rely on human operators for crucial decision-making and mission synthesis from individual tasks. Effective and safe collaboration between humans and robots requires systems that can adjust their motion based on human intentions, enabling dynamic task planning and adaptation. Addressing the needs of industrial applications, we propose a mot&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.17333v1-abstract-full').style.display = 'inline'; document.getElementById('2406.17333v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.17333v1-abstract-full" style="display: none;"> In real-world industrial environments, modern robots often rely on human operators for crucial decision-making and mission synthesis from individual tasks. Effective and safe collaboration between humans and robots requires systems that can adjust their motion based on human intentions, enabling dynamic task planning and adaptation. Addressing the needs of industrial applications, we propose a motion control framework that (i) removes the need for manual control of the robot&#39;s movement; (ii) facilitates the formulation and combination of complex tasks; and (iii) allows the seamless integration of human intent recognition and robot motion planning. For this purpose, we leverage a modular and purely reactive approach for task parametrization and motion generation, embodied by Riemannian Motion Policies. The effectiveness of our method is demonstrated, evaluated, and compared to \remove{state-of-the-art approaches}\add{a representative state-of-the-art approach} in experimental scenarios inspired by realistic industrial Human-Robot Interaction settings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.17333v1-abstract-full').style.display = 'none'; document.getElementById('2406.17333v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages; Robotics, Science and Systems (RSS) 2024</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Robotics, Science and Systems (RSS) 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.13617">arXiv:2405.13617</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.13617">pdf</a>, <a href="https://arxiv.org/format/2405.13617">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Waverider: Leveraging Hierarchical, Multi-Resolution Maps for Efficient and Reactive Obstacle Avoidance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Reijgwart%2C+V">Victor Reijgwart</a>, <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Michael Pantic</a>, <a href="/search/cs?searchtype=author&amp;query=Siegwart%2C+R">Roland Siegwart</a>, <a href="/search/cs?searchtype=author&amp;query=Ott%2C+L">Lionel Ott</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.13617v1-abstract-short" style="display: inline;"> Fast and reliable obstacle avoidance is an important task for mobile robots. In this work, we propose an efficient reactive system that provides high-quality obstacle avoidance while running at hundreds of hertz with minimal resource usage. Our approach combines wavemap, a hierarchical volumetric map representation, with a novel hierarchical and parallelizable obstacle avoidance algorithm formulat&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.13617v1-abstract-full').style.display = 'inline'; document.getElementById('2405.13617v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.13617v1-abstract-full" style="display: none;"> Fast and reliable obstacle avoidance is an important task for mobile robots. In this work, we propose an efficient reactive system that provides high-quality obstacle avoidance while running at hundreds of hertz with minimal resource usage. Our approach combines wavemap, a hierarchical volumetric map representation, with a novel hierarchical and parallelizable obstacle avoidance algorithm formulated through Riemannian Motion Policies (RMP). Leveraging multi-resolution obstacle avoidance policies, the proposed navigation system facilitates precise, low-latency (36ms), and extremely efficient obstacle avoidance with a very large perceptive radius (30m). We perform extensive statistical evaluations on indoor and outdoor maps, verifying that the proposed system compares favorably to fixed-resolution RMP variants and CHOMP. Finally, the RMP formulation allows the seamless fusion of obstacle avoidance with additional objectives, such as goal-seeking, to obtain a fully-fledged navigation system that is versatile and robust. We deploy the system on a Micro Aerial Vehicle and show how it navigates through an indoor obstacle course. Our complete implementation, called waverider, is made available as open source. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.13617v1-abstract-full').style.display = 'none'; document.getElementById('2405.13617v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 12 figures, accepted to ICRA 2024, code is open-source: https://github.com/ethz-asl/waverider</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.19110">arXiv:2404.19110</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.19110">pdf</a>, <a href="https://arxiv.org/format/2404.19110">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> EMOPortraits: Emotion-enhanced Multimodal One-shot Head Avatars </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Drobyshev%2C+N">Nikita Drobyshev</a>, <a href="/search/cs?searchtype=author&amp;query=Casademunt%2C+A+B">Antoni Bigata Casademunt</a>, <a href="/search/cs?searchtype=author&amp;query=Vougioukas%2C+K">Konstantinos Vougioukas</a>, <a href="/search/cs?searchtype=author&amp;query=Landgraf%2C+Z">Zoe Landgraf</a>, <a href="/search/cs?searchtype=author&amp;query=Petridis%2C+S">Stavros Petridis</a>, <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Maja Pantic</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.19110v1-abstract-short" style="display: inline;"> Head avatars animated by visual signals have gained popularity, particularly in cross-driving synthesis where the driver differs from the animated character, a challenging but highly practical approach. The recently presented MegaPortraits model has demonstrated state-of-the-art results in this domain. We conduct a deep examination and evaluation of this model, with a particular focus on its laten&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.19110v1-abstract-full').style.display = 'inline'; document.getElementById('2404.19110v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.19110v1-abstract-full" style="display: none;"> Head avatars animated by visual signals have gained popularity, particularly in cross-driving synthesis where the driver differs from the animated character, a challenging but highly practical approach. The recently presented MegaPortraits model has demonstrated state-of-the-art results in this domain. We conduct a deep examination and evaluation of this model, with a particular focus on its latent space for facial expression descriptors, and uncover several limitations with its ability to express intense face motions. To address these limitations, we propose substantial changes in both training pipeline and model architecture, to introduce our EMOPortraits model, where we: Enhance the model&#39;s capability to faithfully support intense, asymmetric face expressions, setting a new state-of-the-art result in the emotion transfer task, surpassing previous methods in both metrics and quality. Incorporate speech-driven mode to our model, achieving top-tier performance in audio-driven facial animation, making it possible to drive source identity through diverse modalities, including visual signal, audio, or a blend of both. We propose a novel multi-view video dataset featuring a wide range of intense and asymmetric facial expressions, filling the gap with absence of such data in existing datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.19110v1-abstract-full').style.display = 'none'; document.getElementById('2404.19110v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.02098">arXiv:2404.02098</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.02098">pdf</a>, <a href="https://arxiv.org/format/2404.02098">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> BRAVEn: Improving Self-Supervised Pre-training for Visual and Auditory Speech Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Haliassos%2C+A">Alexandros Haliassos</a>, <a href="/search/cs?searchtype=author&amp;query=Zinonos%2C+A">Andreas Zinonos</a>, <a href="/search/cs?searchtype=author&amp;query=Mira%2C+R">Rodrigo Mira</a>, <a href="/search/cs?searchtype=author&amp;query=Petridis%2C+S">Stavros Petridis</a>, <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Maja Pantic</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.02098v1-abstract-short" style="display: inline;"> Self-supervision has recently shown great promise for learning visual and auditory speech representations from unlabelled data. In this work, we propose BRAVEn, an extension to the recent RAVEn method, which learns speech representations entirely from raw audio-visual data. Our modifications to RAVEn enable BRAVEn to achieve state-of-the-art results among self-supervised methods in various setting&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.02098v1-abstract-full').style.display = 'inline'; document.getElementById('2404.02098v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.02098v1-abstract-full" style="display: none;"> Self-supervision has recently shown great promise for learning visual and auditory speech representations from unlabelled data. In this work, we propose BRAVEn, an extension to the recent RAVEn method, which learns speech representations entirely from raw audio-visual data. Our modifications to RAVEn enable BRAVEn to achieve state-of-the-art results among self-supervised methods in various settings. Moreover, we observe favourable scaling behaviour by increasing the amount of unlabelled data well beyond other self-supervised works. In particular, we achieve 20.0% / 1.7% word error rate for VSR / ASR on the LRS3 test set, with only 30 hours of labelled data and no external ASR models. Our results suggest that readily available unlabelled audio-visual data can largely replace costly transcribed data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.02098v1-abstract-full').style.display = 'none'; document.getElementById('2404.02098v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICASSP 2024. Code: https://github.com/ahaliassos/raven</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.17434">arXiv:2402.17434</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.17434">pdf</a>, <a href="https://arxiv.org/format/2402.17434">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Passive Aligning Physical Interaction of Fully-Actuated Aerial Vehicles for Pushing Tasks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hui%2C+T">Tong Hui</a>, <a href="/search/cs?searchtype=author&amp;query=Cuniato%2C+E">Eugenio Cuniato</a>, <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Michael Pantic</a>, <a href="/search/cs?searchtype=author&amp;query=Tognon%2C+M">Marco Tognon</a>, <a href="/search/cs?searchtype=author&amp;query=Fumagalli%2C+M">Matteo Fumagalli</a>, <a href="/search/cs?searchtype=author&amp;query=Siegwart%2C+R">Roland Siegwart</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.17434v1-abstract-short" style="display: inline;"> Recently, the utilization of aerial manipulators for performing pushing tasks in non-destructive testing (NDT) applications has seen significant growth. Such operations entail physical interactions between the aerial robotic system and the environment. End-effectors with multiple contact points are often used for placing NDT sensors in contact with a surface to be inspected. Aligning the NDT senso&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.17434v1-abstract-full').style.display = 'inline'; document.getElementById('2402.17434v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.17434v1-abstract-full" style="display: none;"> Recently, the utilization of aerial manipulators for performing pushing tasks in non-destructive testing (NDT) applications has seen significant growth. Such operations entail physical interactions between the aerial robotic system and the environment. End-effectors with multiple contact points are often used for placing NDT sensors in contact with a surface to be inspected. Aligning the NDT sensor and the work surface while preserving contact, requires that all available contact points at the end-effector tip are in contact with the work surface. With a standard full-pose controller, attitude errors often occur due to perturbations caused by modeling uncertainties, sensor noise, and environmental uncertainties. Even small attitude errors can cause a loss of contact points between the end-effector tip and the work surface. To preserve full alignment amidst these uncertainties, we propose a control strategy which selectively deactivates angular motion control and enables direct force control in specific directions. In particular, we derive two essential conditions to be met, such that the robot can passively align with flat work surfaces achieving full alignment through the rotation along non-actively controlled axes. Additionally, these conditions serve as hardware design and control guidelines for effectively integrating the proposed control method for practical usage. Real world experiments are conducted to validate both the control design and the guidelines. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.17434v1-abstract-full').style.display = 'none'; document.getElementById('2402.17434v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to the 2024 IEEE International Conference on Robotics and Automation (ICRA2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.14730">arXiv:2312.14730</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2312.14730">pdf</a>, <a href="https://arxiv.org/format/2312.14730">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> To Fuse or Not to Fuse: Measuring Consistency in Multi-Sensor Fusion for Aerial Robots </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lanegger%2C+C">Christian Lanegger</a>, <a href="/search/cs?searchtype=author&amp;query=Oleynikova%2C+H">Helen Oleynikova</a>, <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Michael Pantic</a>, <a href="/search/cs?searchtype=author&amp;query=Ott%2C+L">Lionel Ott</a>, <a href="/search/cs?searchtype=author&amp;query=Siegwart%2C+R">Roland Siegwart</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.14730v1-abstract-short" style="display: inline;"> Aerial vehicles are no longer limited to flying in open space: recent work has focused on aerial manipulation and up-close inspection. Such applications place stringent requirements on state estimation: the robot must combine state information from many sources, including onboard odometry and global positioning sensors. However, flying close to or in contact with structures is a degenerate case fo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.14730v1-abstract-full').style.display = 'inline'; document.getElementById('2312.14730v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.14730v1-abstract-full" style="display: none;"> Aerial vehicles are no longer limited to flying in open space: recent work has focused on aerial manipulation and up-close inspection. Such applications place stringent requirements on state estimation: the robot must combine state information from many sources, including onboard odometry and global positioning sensors. However, flying close to or in contact with structures is a degenerate case for many sensing modalities, and the robot&#39;s state estimation framework must intelligently choose which sensors are currently trustworthy. We evaluate a number of metrics to judge the reliability of sensing modalities in a multi-sensor fusion framework, then introduce a consensus-finding scheme that uses this metric to choose which sensors to fuse or not to fuse. Finally, we show that such a fusion framework is more robust and accurate than fusing all sensors all the time and demonstrate how such metrics can be informative in real-world experiments in indoor-outdoor flight and bridge inspection. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.14730v1-abstract-full').style.display = 'none'; document.getElementById('2312.14730v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted and presented at the 18th International Symposium on Experimental Robotics (ISER 2023)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.05125">arXiv:2312.05125</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2312.05125">pdf</a>, <a href="https://arxiv.org/format/2312.05125">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Learning to Fly Omnidirectional Micro Aerial Vehicles with an End-To-End Control Network </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Cuniato%2C+E">Eugenio Cuniato</a>, <a href="/search/cs?searchtype=author&amp;query=Andersson%2C+O">Olov Andersson</a>, <a href="/search/cs?searchtype=author&amp;query=Oleynikova%2C+H">Helen Oleynikova</a>, <a href="/search/cs?searchtype=author&amp;query=Siegwart%2C+R">Roland Siegwart</a>, <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Michael Pantic</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.05125v1-abstract-short" style="display: inline;"> Overactuated tilt-rotor platforms offer many advantages over traditional fixed-arm drones, allowing the decoupling of the applied force from the attitude of the robot. This expands their application areas to aerial interaction and manipulation, and allows them to overcome disturbances such as from ground or wall effects by exploiting the additional degrees of freedom available to their controllers&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.05125v1-abstract-full').style.display = 'inline'; document.getElementById('2312.05125v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.05125v1-abstract-full" style="display: none;"> Overactuated tilt-rotor platforms offer many advantages over traditional fixed-arm drones, allowing the decoupling of the applied force from the attitude of the robot. This expands their application areas to aerial interaction and manipulation, and allows them to overcome disturbances such as from ground or wall effects by exploiting the additional degrees of freedom available to their controllers. However, the overactuation also complicates the control problem, especially if the motors that tilt the arms have slower dynamics than those spinning the propellers. Instead of building a complex model-based controller that takes all of these subtleties into account, we attempt to learn an end-to-end pose controller using reinforcement learning, and show its superior behavior in the presence of inertial and force disturbances compared to a state-of-the-art traditional controller. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.05125v1-abstract-full').style.display = 'none'; document.getElementById('2312.05125v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted and presented at the 18th International Symposium on Experimental Robotics (ISER 2023)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.05110">arXiv:2312.05110</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2312.05110">pdf</a>, <a href="https://arxiv.org/format/2312.05110">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Soliro -- a hybrid dynamic tilt-wing aerial manipulator with minimal actuators </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Michael Pantic</a>, <a href="/search/cs?searchtype=author&amp;query=Hampp%2C+E">Elias Hampp</a>, <a href="/search/cs?searchtype=author&amp;query=Flammer%2C+R">Ramon Flammer</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+W">Weixuan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Stastny%2C+T">Thomas Stastny</a>, <a href="/search/cs?searchtype=author&amp;query=Ott%2C+L">Lionel Ott</a>, <a href="/search/cs?searchtype=author&amp;query=Siegwart%2C+R">Roland Siegwart</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.05110v1-abstract-short" style="display: inline;"> The ability to enter in contact with and manipulate physical objects with a flying robot enables many novel applications, such as contact inspection, painting, drilling, and sample collection. Generally, these aerial robots need more degrees of freedom than a standard quadrotor. While there is active research of over-actuated, omnidirectional MAVs and aerial manipulators as well as VTOL and hybrid&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.05110v1-abstract-full').style.display = 'inline'; document.getElementById('2312.05110v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.05110v1-abstract-full" style="display: none;"> The ability to enter in contact with and manipulate physical objects with a flying robot enables many novel applications, such as contact inspection, painting, drilling, and sample collection. Generally, these aerial robots need more degrees of freedom than a standard quadrotor. While there is active research of over-actuated, omnidirectional MAVs and aerial manipulators as well as VTOL and hybrid platforms, the two concepts have not been combined. We address the problem of conceptualization, characterization, control, and testing of a 5DOF rotary-/fixed-wing hybrid, tilt-rotor, split tilt-wing, nearly omnidirectional aerial robot. We present an elegant solution with a minimal set of actuators and that does not need any classical control surfaces or flaps. The concept is validated in a wind tunnel study and in multiple flights with forward and backward transitions. Fixed-wing flight speeds up to 10 m/s were reached, with a power reduction of 30% as compared to rotary wing flight. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.05110v1-abstract-full').style.display = 'none'; document.getElementById('2312.05110v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted and presented at the 18th International Symposium on Experimental Robotics (ISER 2023)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.16584">arXiv:2307.16584</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2307.16584">pdf</a>, <a href="https://arxiv.org/format/2307.16584">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Audio-visual video-to-speech synthesis with synthesized input audio </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kefalas%2C+T">Triantafyllos Kefalas</a>, <a href="/search/cs?searchtype=author&amp;query=Panagakis%2C+Y">Yannis Panagakis</a>, <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Maja Pantic</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.16584v1-abstract-short" style="display: inline;"> Video-to-speech synthesis involves reconstructing the speech signal of a speaker from a silent video. The implicit assumption of this task is that the sound signal is either missing or contains a high amount of noise/corruption such that it is not useful for processing. Previous works in the literature either use video inputs only or employ both video and audio inputs during training, and discard&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.16584v1-abstract-full').style.display = 'inline'; document.getElementById('2307.16584v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.16584v1-abstract-full" style="display: none;"> Video-to-speech synthesis involves reconstructing the speech signal of a speaker from a silent video. The implicit assumption of this task is that the sound signal is either missing or contains a high amount of noise/corruption such that it is not useful for processing. Previous works in the literature either use video inputs only or employ both video and audio inputs during training, and discard the input audio pathway during inference. In this work we investigate the effect of using video and audio inputs for video-to-speech synthesis during both training and inference. In particular, we use pre-trained video-to-speech models to synthesize the missing speech signals and then train an audio-visual-to-speech synthesis model, using both the silent video and the synthesized speech as inputs, to predict the final reconstructed speech. Our experiments demonstrate that this approach is successful with both raw waveforms and mel spectrograms as target outputs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.16584v1-abstract-full').style.display = 'none'; document.getElementById('2307.16584v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This work has been submitted to the IEEE for possible publication</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.04552">arXiv:2307.04552</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2307.04552">pdf</a>, <a href="https://arxiv.org/format/2307.04552">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SparseVSR: Lightweight and Noise Robust Visual Speech Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Fernandez-Lopez%2C+A">Adriana Fernandez-Lopez</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Honglie Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+P">Pingchuan Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Haliassos%2C+A">Alexandros Haliassos</a>, <a href="/search/cs?searchtype=author&amp;query=Petridis%2C+S">Stavros Petridis</a>, <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Maja Pantic</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.04552v1-abstract-short" style="display: inline;"> Recent advances in deep neural networks have achieved unprecedented success in visual speech recognition. However, there remains substantial disparity between current methods and their deployment in resource-constrained devices. In this work, we explore different magnitude-based pruning techniques to generate a lightweight model that achieves higher performance than its dense model equivalent, esp&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.04552v1-abstract-full').style.display = 'inline'; document.getElementById('2307.04552v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.04552v1-abstract-full" style="display: none;"> Recent advances in deep neural networks have achieved unprecedented success in visual speech recognition. However, there remains substantial disparity between current methods and their deployment in resource-constrained devices. In this work, we explore different magnitude-based pruning techniques to generate a lightweight model that achieves higher performance than its dense model equivalent, especially under the presence of visual noise. Our sparse models achieve state-of-the-art results at 10% sparsity on the LRS3 dataset and outperform the dense equivalent up to 70% sparsity. We evaluate our 50% sparse model on 7 different visual noise types and achieve an overall absolute improvement of more than 2% WER compared to the dense equivalent. Our results confirm that sparse networks are more resistant to noise than dense networks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.04552v1-abstract-full').style.display = 'none'; document.getElementById('2307.04552v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to Interspeech 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.15464">arXiv:2306.15464</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2306.15464">pdf</a>, <a href="https://arxiv.org/format/2306.15464">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Large-scale unsupervised audio pre-training for video-to-speech synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kefalas%2C+T">Triantafyllos Kefalas</a>, <a href="/search/cs?searchtype=author&amp;query=Panagakis%2C+Y">Yannis Panagakis</a>, <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Maja Pantic</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.15464v2-abstract-short" style="display: inline;"> Video-to-speech synthesis is the task of reconstructing the speech signal from a silent video of a speaker. Most established approaches to date involve a two-step process, whereby an intermediate representation from the video, such as a spectrogram, is extracted first and then passed to a vocoder to produce the raw audio. Some recent work has focused on end-to-end synthesis, whereby the generation&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.15464v2-abstract-full').style.display = 'inline'; document.getElementById('2306.15464v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.15464v2-abstract-full" style="display: none;"> Video-to-speech synthesis is the task of reconstructing the speech signal from a silent video of a speaker. Most established approaches to date involve a two-step process, whereby an intermediate representation from the video, such as a spectrogram, is extracted first and then passed to a vocoder to produce the raw audio. Some recent work has focused on end-to-end synthesis, whereby the generation of raw audio and any intermediate representations is performed jointly. All such approaches involve training on data from almost exclusively audio-visual datasets, i.e. every audio sample has a corresponding video sample. This precludes the use of abundant audio-only datasets which may not have a corresponding visual modality (e.g. audiobooks, radio podcasts, speech recognition datasets etc.), as well as audio-only architectures that have been developed by the audio machine learning community over the years. In this paper we propose to train encoder-decoder models on more than 3,500 hours of audio data at 24kHz, and then use the pre-trained decoders to initialize the audio decoders for the video-to-speech synthesis task. The pre-training step uses audio samples only and does not require labels or corresponding samples from other modalities (visual, text). We demonstrate that this pre-training step improves the reconstructed speech and that it is an unexplored way to improve the quality of the generator in a cross-modal task while only requiring samples from one of the modalities. We conduct experiments using both raw audio and mel spectrograms as target outputs and benchmark our models with existing work. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.15464v2-abstract-full').style.display = 'none'; document.getElementById('2306.15464v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Corrected typos. This work has been submitted to the IEEE for possible publication</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.08854">arXiv:2305.08854</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.08854">pdf</a>, <a href="https://arxiv.org/format/2305.08854">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Laughing Matters: Introducing Laughing-Face Generation using Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Casademunt%2C+A+B">Antoni Bigata Casademunt</a>, <a href="/search/cs?searchtype=author&amp;query=Mira%2C+R">Rodrigo Mira</a>, <a href="/search/cs?searchtype=author&amp;query=Drobyshev%2C+N">Nikita Drobyshev</a>, <a href="/search/cs?searchtype=author&amp;query=Vougioukas%2C+K">Konstantinos Vougioukas</a>, <a href="/search/cs?searchtype=author&amp;query=Petridis%2C+S">Stavros Petridis</a>, <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Maja Pantic</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.08854v2-abstract-short" style="display: inline;"> Speech-driven animation has gained significant traction in recent years, with current methods achieving near-photorealistic results. However, the field remains underexplored regarding non-verbal communication despite evidence demonstrating its importance in human interaction. In particular, generating laughter sequences presents a unique challenge due to the intricacy and nuances of this behaviour&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.08854v2-abstract-full').style.display = 'inline'; document.getElementById('2305.08854v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.08854v2-abstract-full" style="display: none;"> Speech-driven animation has gained significant traction in recent years, with current methods achieving near-photorealistic results. However, the field remains underexplored regarding non-verbal communication despite evidence demonstrating its importance in human interaction. In particular, generating laughter sequences presents a unique challenge due to the intricacy and nuances of this behaviour. This paper aims to bridge this gap by proposing a novel model capable of generating realistic laughter sequences, given a still portrait and an audio clip containing laughter. We highlight the failure cases of traditional facial animation methods and leverage recent advances in diffusion models to produce convincing laughter videos. We train our model on a diverse set of laughter datasets and introduce an evaluation metric specifically designed for laughter. When compared with previous speech-driven approaches, our model achieves state-of-the-art performance across all metrics, even when these are re-trained for laughter generation. Our code and project are publicly available <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.08854v2-abstract-full').style.display = 'none'; document.getElementById('2305.08854v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.17200">arXiv:2303.17200</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2303.17200">pdf</a>, <a href="https://arxiv.org/format/2303.17200">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> SynthVSR: Scaling Up Visual Speech Recognition With Synthetic Supervision </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xubo Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Lakomkin%2C+E">Egor Lakomkin</a>, <a href="/search/cs?searchtype=author&amp;query=Vougioukas%2C+K">Konstantinos Vougioukas</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+P">Pingchuan Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Honglie Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+R">Ruiming Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Doulaty%2C+M">Morrie Doulaty</a>, <a href="/search/cs?searchtype=author&amp;query=Moritz%2C+N">Niko Moritz</a>, <a href="/search/cs?searchtype=author&amp;query=Kol%C3%A1%C5%99%2C+J">J谩chym Kol谩艡</a>, <a href="/search/cs?searchtype=author&amp;query=Petridis%2C+S">Stavros Petridis</a>, <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Maja Pantic</a>, <a href="/search/cs?searchtype=author&amp;query=Fuegen%2C+C">Christian Fuegen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.17200v2-abstract-short" style="display: inline;"> Recently reported state-of-the-art results in visual speech recognition (VSR) often rely on increasingly large amounts of video data, while the publicly available transcribed video datasets are limited in size. In this paper, for the first time, we study the potential of leveraging synthetic visual data for VSR. Our method, termed SynthVSR, substantially improves the performance of VSR systems wit&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.17200v2-abstract-full').style.display = 'inline'; document.getElementById('2303.17200v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.17200v2-abstract-full" style="display: none;"> Recently reported state-of-the-art results in visual speech recognition (VSR) often rely on increasingly large amounts of video data, while the publicly available transcribed video datasets are limited in size. In this paper, for the first time, we study the potential of leveraging synthetic visual data for VSR. Our method, termed SynthVSR, substantially improves the performance of VSR systems with synthetic lip movements. The key idea behind SynthVSR is to leverage a speech-driven lip animation model that generates lip movements conditioned on the input speech. The speech-driven lip animation model is trained on an unlabeled audio-visual dataset and could be further optimized towards a pre-trained VSR model when labeled videos are available. As plenty of transcribed acoustic data and face images are available, we are able to generate large-scale synthetic data using the proposed lip animation model for semi-supervised VSR training. We evaluate the performance of our approach on the largest public VSR benchmark - Lip Reading Sentences 3 (LRS3). SynthVSR achieves a WER of 43.3% with only 30 hours of real labeled data, outperforming off-the-shelf approaches using thousands of hours of video. The WER is further reduced to 27.9% when using all 438 hours of labeled data from LRS3, which is on par with the state-of-the-art self-supervised AV-HuBERT method. Furthermore, when combined with large-scale pseudo-labeled audio-visual data SynthVSR yields a new state-of-the-art VSR WER of 16.9% using publicly available data only, surpassing the recent state-of-the-art approaches trained with 29 times more non-public machine-transcribed video data (90,000 hours). Finally, we perform extensive ablation studies to understand the effect of each component in our proposed method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.17200v2-abstract-full').style.display = 'none'; document.getElementById('2303.17200v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">IEEE/CVF CVPR 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.14307">arXiv:2303.14307</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2303.14307">pdf</a>, <a href="https://arxiv.org/format/2303.14307">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/ICASSP49357.2023.10096889">10.1109/ICASSP49357.2023.10096889 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Auto-AVSR: Audio-Visual Speech Recognition with Automatic Labels </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ma%2C+P">Pingchuan Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Haliassos%2C+A">Alexandros Haliassos</a>, <a href="/search/cs?searchtype=author&amp;query=Fernandez-Lopez%2C+A">Adriana Fernandez-Lopez</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Honglie Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Petridis%2C+S">Stavros Petridis</a>, <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Maja Pantic</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.14307v3-abstract-short" style="display: inline;"> Audio-visual speech recognition has received a lot of attention due to its robustness against acoustic noise. Recently, the performance of automatic, visual, and audio-visual speech recognition (ASR, VSR, and AV-ASR, respectively) has been substantially improved, mainly due to the use of larger models and training sets. However, accurate labelling of datasets is time-consuming and expensive. Hence&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.14307v3-abstract-full').style.display = 'inline'; document.getElementById('2303.14307v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.14307v3-abstract-full" style="display: none;"> Audio-visual speech recognition has received a lot of attention due to its robustness against acoustic noise. Recently, the performance of automatic, visual, and audio-visual speech recognition (ASR, VSR, and AV-ASR, respectively) has been substantially improved, mainly due to the use of larger models and training sets. However, accurate labelling of datasets is time-consuming and expensive. Hence, in this work, we investigate the use of automatically-generated transcriptions of unlabelled datasets to increase the training set size. For this purpose, we use publicly-available pre-trained ASR models to automatically transcribe unlabelled datasets such as AVSpeech and VoxCeleb2. Then, we train ASR, VSR and AV-ASR models on the augmented training set, which consists of the LRS2 and LRS3 datasets as well as the additional automatically-transcribed data. We demonstrate that increasing the size of the training set, a recent trend in the literature, leads to reduced WER despite using noisy transcriptions. The proposed model achieves new state-of-the-art performance on AV-ASR on LRS2 and LRS3. In particular, it achieves a WER of 0.9% on LRS3, a relative improvement of 30% over the current state-of-the-art approach, and outperforms methods that have been trained on non-publicly available datasets with 26 times more training data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.14307v3-abstract-full').style.display = 'none'; document.getElementById('2303.14307v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ICASSP 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.09455">arXiv:2303.09455</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2303.09455">pdf</a>, <a href="https://arxiv.org/format/2303.09455">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Learning Cross-lingual Visual Speech Representations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zinonos%2C+A">Andreas Zinonos</a>, <a href="/search/cs?searchtype=author&amp;query=Haliassos%2C+A">Alexandros Haliassos</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+P">Pingchuan Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Petridis%2C+S">Stavros Petridis</a>, <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Maja Pantic</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.09455v1-abstract-short" style="display: inline;"> Cross-lingual self-supervised learning has been a growing research topic in the last few years. However, current works only explored the use of audio signals to create representations. In this work, we study cross-lingual self-supervised visual representation learning. We use the recently-proposed Raw Audio-Visual Speech Encoders (RAVEn) framework to pre-train an audio-visual model with unlabelled&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.09455v1-abstract-full').style.display = 'inline'; document.getElementById('2303.09455v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.09455v1-abstract-full" style="display: none;"> Cross-lingual self-supervised learning has been a growing research topic in the last few years. However, current works only explored the use of audio signals to create representations. In this work, we study cross-lingual self-supervised visual representation learning. We use the recently-proposed Raw Audio-Visual Speech Encoders (RAVEn) framework to pre-train an audio-visual model with unlabelled multilingual data, and then fine-tune the visual model on labelled transcriptions. Our experiments show that: (1) multi-lingual models with more data outperform monolingual ones, but, when keeping the amount of data fixed, monolingual models tend to reach better performance; (2) multi-lingual outperforms English-only pre-training; (3) using languages which are more similar yields better results; and (4) fine-tuning on unseen languages is competitive to using the target language in the pre-training set. We hope our study inspires future research on non-English-only speech representation learning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.09455v1-abstract-full').style.display = 'none'; document.getElementById('2303.09455v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.01352">arXiv:2303.01352</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2303.01352">pdf</a>, <a href="https://arxiv.org/format/2303.01352">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Chasing Millimeters: Design, Navigation and State Estimation for Precise In-flight Marking on Ceilings </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lanegger%2C+C">Christian Lanegger</a>, <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Michael Pantic</a>, <a href="/search/cs?searchtype=author&amp;query=B%C3%A4hnemann%2C+R">Rik B盲hnemann</a>, <a href="/search/cs?searchtype=author&amp;query=Siegwart%2C+R">Roland Siegwart</a>, <a href="/search/cs?searchtype=author&amp;query=Ott%2C+L">Lionel Ott</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.01352v1-abstract-short" style="display: inline;"> Precise markings for drilling and assembly are crucial, laborious construction tasks. Aerial robots with suitable end-effectors are capable of markings at the millimeter scale. However, so far, they have only been demonstrated under laboratory conditions where rigid state estimation and navigation assumptions do not impede robustness and accuracy. This paper presents a complete aerial layouting sy&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.01352v1-abstract-full').style.display = 'inline'; document.getElementById('2303.01352v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.01352v1-abstract-full" style="display: none;"> Precise markings for drilling and assembly are crucial, laborious construction tasks. Aerial robots with suitable end-effectors are capable of markings at the millimeter scale. However, so far, they have only been demonstrated under laboratory conditions where rigid state estimation and navigation assumptions do not impede robustness and accuracy. This paper presents a complete aerial layouting system capable of precise markings on-site under realistic conditions. We use a compliant actuated end-effector on an omnidirectional flying base. Combining a two-stage factor-graph state estimator with a Riemannian Motion Policy-based navigation stack, we avoid the need for a globally consistent estimate and increase robustness. The policy-based navigation is structured into individual behaviors in different state spaces. Through a comprehensive study, we show that the system creates highly precise markings at a relative precision of 1.5 mm and a global accuracy of 5-6 mm and discuss the results in the context of future construction robotics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.01352v1-abstract-full').style.display = 'none'; document.getElementById('2303.01352v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">C. Lanegger and M. Pantic contributed equally. Submitted to Autonomous Robots journal (Springer)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2301.08068">arXiv:2301.08068</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2301.08068">pdf</a>, <a href="https://arxiv.org/format/2301.08068">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Obstacle avoidance using raycasting and Riemannian Motion Policies at kHz rates for MAVs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Michael Pantic</a>, <a href="/search/cs?searchtype=author&amp;query=Meijer%2C+I">Isar Meijer</a>, <a href="/search/cs?searchtype=author&amp;query=B%C3%A4hnemann%2C+R">Rik B盲hnemann</a>, <a href="/search/cs?searchtype=author&amp;query=Alatur%2C+N">Nikhilesh Alatur</a>, <a href="/search/cs?searchtype=author&amp;query=Andersson%2C+O">Olov Andersson</a>, <a href="/search/cs?searchtype=author&amp;query=Lerma%2C+C+C">Cesar Cadena Lerma</a>, <a href="/search/cs?searchtype=author&amp;query=Siegwart%2C+R">Roland Siegwart</a>, <a href="/search/cs?searchtype=author&amp;query=Ott%2C+L">Lionel Ott</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2301.08068v1-abstract-short" style="display: inline;"> In this paper, we present a novel method for using Riemannian Motion Policies on volumetric maps, shown in the example of obstacle avoidance for Micro Aerial Vehicles (MAVs). While sampling or optimization-based planners are widely used for obstacle avoidance with volumetric maps, they are computationally expensive and often have inflexible monolithic architectures. Riemannian Motion Policies are&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.08068v1-abstract-full').style.display = 'inline'; document.getElementById('2301.08068v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2301.08068v1-abstract-full" style="display: none;"> In this paper, we present a novel method for using Riemannian Motion Policies on volumetric maps, shown in the example of obstacle avoidance for Micro Aerial Vehicles (MAVs). While sampling or optimization-based planners are widely used for obstacle avoidance with volumetric maps, they are computationally expensive and often have inflexible monolithic architectures. Riemannian Motion Policies are a modular, parallelizable, and efficient navigation paradigm but are challenging to use with the widely used voxel-based environment representations. We propose using GPU raycasting and a large number of concurrent policies to provide direct obstacle avoidance using Riemannian Motion Policies in voxelized maps without the need for smoothing or pre-processing of the map. Additionally, we present how the same method can directly plan on LiDAR scans without the need for an intermediate map. We show how this reactive approach compares favorably to traditional planning methods and is able to plan using thousands of rays at kilohertz rates. We demonstrate the planner successfully on a real MAV for static and dynamic obstacles. The presented planner is made available as an open-source software package. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.08068v1-abstract-full').style.display = 'none'; document.getElementById('2301.08068v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 January, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to IROS 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2301.03396">arXiv:2301.03396</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2301.03396">pdf</a>, <a href="https://arxiv.org/format/2301.03396">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Diffused Heads: Diffusion Models Beat GANs on Talking-Face Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Stypu%C5%82kowski%2C+M">Micha艂 Stypu艂kowski</a>, <a href="/search/cs?searchtype=author&amp;query=Vougioukas%2C+K">Konstantinos Vougioukas</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+S">Sen He</a>, <a href="/search/cs?searchtype=author&amp;query=Zi%C4%99ba%2C+M">Maciej Zi臋ba</a>, <a href="/search/cs?searchtype=author&amp;query=Petridis%2C+S">Stavros Petridis</a>, <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Maja Pantic</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2301.03396v2-abstract-short" style="display: inline;"> Talking face generation has historically struggled to produce head movements and natural facial expressions without guidance from additional reference videos. Recent developments in diffusion-based generative models allow for more realistic and stable data synthesis and their performance on image and video generation has surpassed that of other generative models. In this work, we present an autore&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.03396v2-abstract-full').style.display = 'inline'; document.getElementById('2301.03396v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2301.03396v2-abstract-full" style="display: none;"> Talking face generation has historically struggled to produce head movements and natural facial expressions without guidance from additional reference videos. Recent developments in diffusion-based generative models allow for more realistic and stable data synthesis and their performance on image and video generation has surpassed that of other generative models. In this work, we present an autoregressive diffusion model that requires only one identity image and audio sequence to generate a video of a realistic talking human head. Our solution is capable of hallucinating head movements, facial expressions, such as blinks, and preserving a given background. We evaluate our model on two different datasets, achieving state-of-the-art results on both of them. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.03396v2-abstract-full').style.display = 'none'; document.getElementById('2301.03396v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 January, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2212.06246">arXiv:2212.06246</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2212.06246">pdf</a>, <a href="https://arxiv.org/format/2212.06246">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Jointly Learning Visual and Auditory Speech Representations from Raw Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Haliassos%2C+A">Alexandros Haliassos</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+P">Pingchuan Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Mira%2C+R">Rodrigo Mira</a>, <a href="/search/cs?searchtype=author&amp;query=Petridis%2C+S">Stavros Petridis</a>, <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Maja Pantic</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2212.06246v2-abstract-short" style="display: inline;"> We present RAVEn, a self-supervised multi-modal approach to jointly learn visual and auditory speech representations. Our pre-training objective involves encoding masked inputs, and then predicting contextualised targets generated by slowly-evolving momentum encoders. Driven by the inherent differences between video and audio, our design is asymmetric w.r.t. the two modalities&#39; pretext tasks: Wher&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.06246v2-abstract-full').style.display = 'inline'; document.getElementById('2212.06246v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2212.06246v2-abstract-full" style="display: none;"> We present RAVEn, a self-supervised multi-modal approach to jointly learn visual and auditory speech representations. Our pre-training objective involves encoding masked inputs, and then predicting contextualised targets generated by slowly-evolving momentum encoders. Driven by the inherent differences between video and audio, our design is asymmetric w.r.t. the two modalities&#39; pretext tasks: Whereas the auditory stream predicts both the visual and auditory targets, the visual one predicts only the auditory targets. We observe strong results in low- and high-resource labelled data settings when fine-tuning the visual and auditory encoders resulting from a single pre-training stage, in which the encoders are jointly trained. Notably, RAVEn surpasses all self-supervised methods on visual speech recognition (VSR) on LRS3, and combining RAVEn with self-training using only 30 hours of labelled data even outperforms a recent semi-supervised method trained on 90,000 hours of non-public data. At the same time, we achieve state-of-the-art results in the LRS3 low-resource setting for auditory speech recognition (as well as for VSR). Our findings point to the viability of learning powerful speech representations entirely from raw video and audio, i.e., without relying on handcrafted features. Code and models are available at https://github.com/ahaliassos/raven. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.06246v2-abstract-full').style.display = 'none'; document.getElementById('2212.06246v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICLR 2023. Code: https://github.com/ahaliassos/raven</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.10999">arXiv:2211.10999</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2211.10999">pdf</a>, <a href="https://arxiv.org/format/2211.10999">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> LA-VocE: Low-SNR Audio-visual Speech Enhancement using Neural Vocoders </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Mira%2C+R">Rodrigo Mira</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+B">Buye Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Donley%2C+J">Jacob Donley</a>, <a href="/search/cs?searchtype=author&amp;query=Kumar%2C+A">Anurag Kumar</a>, <a href="/search/cs?searchtype=author&amp;query=Petridis%2C+S">Stavros Petridis</a>, <a href="/search/cs?searchtype=author&amp;query=Ithapu%2C+V+K">Vamsi Krishna Ithapu</a>, <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Maja Pantic</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.10999v2-abstract-short" style="display: inline;"> Audio-visual speech enhancement aims to extract clean speech from a noisy environment by leveraging not only the audio itself but also the target speaker&#39;s lip movements. This approach has been shown to yield improvements over audio-only speech enhancement, particularly for the removal of interfering speech. Despite recent advances in speech synthesis, most audio-visual approaches continue to use&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.10999v2-abstract-full').style.display = 'inline'; document.getElementById('2211.10999v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.10999v2-abstract-full" style="display: none;"> Audio-visual speech enhancement aims to extract clean speech from a noisy environment by leveraging not only the audio itself but also the target speaker&#39;s lip movements. This approach has been shown to yield improvements over audio-only speech enhancement, particularly for the removal of interfering speech. Despite recent advances in speech synthesis, most audio-visual approaches continue to use spectral mapping/masking to reproduce the clean audio, often resulting in visual backbones added to existing speech enhancement architectures. In this work, we propose LA-VocE, a new two-stage approach that predicts mel-spectrograms from noisy audio-visual speech via a transformer-based architecture, and then converts them into waveform audio using a neural vocoder (HiFi-GAN). We train and evaluate our framework on thousands of speakers and 11+ different languages, and study our model&#39;s ability to adapt to different levels of background noise and speech interference. Our experiments show that LA-VocE outperforms existing methods according to multiple metrics, particularly under very noisy scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.10999v2-abstract-full').style.display = 'none'; document.getElementById('2211.10999v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted to ICASSP 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.06143">arXiv:2211.06143</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2211.06143">pdf</a>, <a href="https://arxiv.org/format/2211.06143">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> FAN-Trans: Online Knowledge Distillation for Facial Action Unit Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yang%2C+J">Jing Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+J">Jie Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Y">Yiming Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Hristov%2C+Y">Yordan Hristov</a>, <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Maja Pantic</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.06143v1-abstract-short" style="display: inline;"> Due to its importance in facial behaviour analysis, facial action unit (AU) detection has attracted increasing attention from the research community. Leveraging the online knowledge distillation framework, we propose the ``FANTrans&#34; method for AU detection. Our model consists of a hybrid network of convolution and transformer blocks to learn per-AU features and to model AU co-occurrences. The mode&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.06143v1-abstract-full').style.display = 'inline'; document.getElementById('2211.06143v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.06143v1-abstract-full" style="display: none;"> Due to its importance in facial behaviour analysis, facial action unit (AU) detection has attracted increasing attention from the research community. Leveraging the online knowledge distillation framework, we propose the ``FANTrans&#34; method for AU detection. Our model consists of a hybrid network of convolution and transformer blocks to learn per-AU features and to model AU co-occurrences. The model uses a pre-trained face alignment network as the feature extractor. After further transformation by a small learnable add-on convolutional subnet, the per-AU features are fed into transformer blocks to enhance their representation. As multiple AUs often appear together, we propose a learnable attention drop mechanism in the transformer block to learn the correlation between the features for different AUs. We also design a classifier that predicts AU presence by considering all AUs&#39; features, to explicitly capture label dependencies. Finally, we make the attempt of adapting online knowledge distillation in the training stage for this task, further improving the model&#39;s performance. Experiments on the BP4D and DISFA datasets demonstrating the effectiveness of proposed method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.06143v1-abstract-full').style.display = 'none'; document.getElementById('2211.06143v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 6 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> WACV2023 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.02133">arXiv:2211.02133</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2211.02133">pdf</a>, <a href="https://arxiv.org/format/2211.02133">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Streaming Audio-Visual Speech Recognition with Alignment Regularization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ma%2C+P">Pingchuan Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Moritz%2C+N">Niko Moritz</a>, <a href="/search/cs?searchtype=author&amp;query=Petridis%2C+S">Stavros Petridis</a>, <a href="/search/cs?searchtype=author&amp;query=Fuegen%2C+C">Christian Fuegen</a>, <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Maja Pantic</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.02133v2-abstract-short" style="display: inline;"> In this work, we propose a streaming AV-ASR system based on a hybrid connectionist temporal classification (CTC)/attention neural network architecture. The audio and the visual encoder neural networks are both based on the conformer architecture, which is made streamable using chunk-wise self-attention (CSA) and causal convolution. Streaming recognition with a decoder neural network is realized by&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.02133v2-abstract-full').style.display = 'inline'; document.getElementById('2211.02133v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.02133v2-abstract-full" style="display: none;"> In this work, we propose a streaming AV-ASR system based on a hybrid connectionist temporal classification (CTC)/attention neural network architecture. The audio and the visual encoder neural networks are both based on the conformer architecture, which is made streamable using chunk-wise self-attention (CSA) and causal convolution. Streaming recognition with a decoder neural network is realized by using the triggered attention technique, which performs time-synchronous decoding with joint CTC/attention scoring. Additionally, we propose a novel alignment regularization technique that promotes synchronization of the audio and visual encoder, which in turn results in better word error rates (WERs) at all SNR levels for streaming and offline AV-ASR models. The proposed AV-ASR model achieves WERs of 2.0% and 2.6% on the Lip Reading Sentences 3 (LRS3) dataset in an offline and online setup, respectively, which both present state-of-the-art results when no external training data are used. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.02133v2-abstract-full').style.display = 'none'; document.getElementById('2211.02133v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to Interspeech 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2210.11341">arXiv:2210.11341</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2210.11341">pdf</a>, <a href="https://arxiv.org/format/2210.11341">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> SS-VAERR: Self-Supervised Apparent Emotional Reaction Recognition from Video </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jegorova%2C+M">Marija Jegorova</a>, <a href="/search/cs?searchtype=author&amp;query=Petridis%2C+S">Stavros Petridis</a>, <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Maja Pantic</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2210.11341v1-abstract-short" style="display: inline;"> This work focuses on the apparent emotional reaction recognition (AERR) from the video-only input, conducted in a self-supervised fashion. The network is first pre-trained on different self-supervised pretext tasks and later fine-tuned on the downstream target task. Self-supervised learning facilitates the use of pre-trained architectures and larger datasets that might be deemed unfit for the targ&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.11341v1-abstract-full').style.display = 'inline'; document.getElementById('2210.11341v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2210.11341v1-abstract-full" style="display: none;"> This work focuses on the apparent emotional reaction recognition (AERR) from the video-only input, conducted in a self-supervised fashion. The network is first pre-trained on different self-supervised pretext tasks and later fine-tuned on the downstream target task. Self-supervised learning facilitates the use of pre-trained architectures and larger datasets that might be deemed unfit for the target task and yet might be useful to learn informative representations and hence provide useful initializations for further fine-tuning on smaller more suitable data. Our presented contribution is two-fold: (1) an analysis of different state-of-the-art (SOTA) pretext tasks for the video-only apparent emotional reaction recognition architecture, and (2) an analysis of various combinations of the regression and classification losses that are likely to improve the performance further. Together these two contributions result in the current state-of-the-art performance for the video-only spontaneous apparent emotional reaction recognition with continuous annotations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.11341v1-abstract-full').style.display = 'none'; document.getElementById('2210.11341v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2209.01383">arXiv:2209.01383</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2209.01383">pdf</a>, <a href="https://arxiv.org/format/2209.01383">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/ICASSP43922.2022.9746706">10.1109/ICASSP43922.2022.9746706 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Training Strategies for Improved Lip-reading </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ma%2C+P">Pingchuan Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yujiang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Petridis%2C+S">Stavros Petridis</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+J">Jie Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Maja Pantic</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2209.01383v3-abstract-short" style="display: inline;"> Several training strategies and temporal models have been recently proposed for isolated word lip-reading in a series of independent works. However, the potential of combining the best strategies and investigating the impact of each of them has not been explored. In this paper, we systematically investigate the performance of state-of-the-art data augmentation approaches, temporal models and other&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.01383v3-abstract-full').style.display = 'inline'; document.getElementById('2209.01383v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2209.01383v3-abstract-full" style="display: none;"> Several training strategies and temporal models have been recently proposed for isolated word lip-reading in a series of independent works. However, the potential of combining the best strategies and investigating the impact of each of them has not been explored. In this paper, we systematically investigate the performance of state-of-the-art data augmentation approaches, temporal models and other training strategies, like self-distillation and using word boundary indicators. Our results show that Time Masking (TM) is the most important augmentation followed by mixup and Densely-Connected Temporal Convolutional Networks (DC-TCN) are the best temporal model for lip-reading of isolated words. Using self-distillation and word boundary indicators is also beneficial but to a lesser extent. A combination of all the above methods results in a classification accuracy of 93.4%, which is an absolute improvement of 4.6% over the current state-of-the-art performance on the LRW dataset. The performance can be further improved to 94.1% by pre-training on additional datasets. An error analysis of the various training strategies reveals that the performance improves by increasing the classification accuracy of hard-to-recognise words. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.01383v3-abstract-full').style.display = 'none'; document.getElementById('2209.01383v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 September, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 September, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICASSP 2022. Code is available at https://sites.google.com/view/audiovisual-speech-recognition</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 8472-8476, 2022 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2205.02058">arXiv:2205.02058</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2205.02058">pdf</a>, <a href="https://arxiv.org/format/2205.02058">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> SVTS: Scalable Video-to-Speech Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Mira%2C+R">Rodrigo Mira</a>, <a href="/search/cs?searchtype=author&amp;query=Haliassos%2C+A">Alexandros Haliassos</a>, <a href="/search/cs?searchtype=author&amp;query=Petridis%2C+S">Stavros Petridis</a>, <a href="/search/cs?searchtype=author&amp;query=Schuller%2C+B+W">Bj枚rn W. Schuller</a>, <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Maja Pantic</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2205.02058v2-abstract-short" style="display: inline;"> Video-to-speech synthesis (also known as lip-to-speech) refers to the translation of silent lip movements into the corresponding audio. This task has received an increasing amount of attention due to its self-supervised nature (i.e., can be trained without manual labelling) combined with the ever-growing collection of audio-visual data available online. Despite these strong motivations, contempora&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.02058v2-abstract-full').style.display = 'inline'; document.getElementById('2205.02058v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2205.02058v2-abstract-full" style="display: none;"> Video-to-speech synthesis (also known as lip-to-speech) refers to the translation of silent lip movements into the corresponding audio. This task has received an increasing amount of attention due to its self-supervised nature (i.e., can be trained without manual labelling) combined with the ever-growing collection of audio-visual data available online. Despite these strong motivations, contemporary video-to-speech works focus mainly on small- to medium-sized corpora with substantial constraints in both vocabulary and setting. In this work, we introduce a scalable video-to-speech framework consisting of two components: a video-to-spectrogram predictor and a pre-trained neural vocoder, which converts the mel-frequency spectrograms into waveform audio. We achieve state-of-the art results for GRID and considerably outperform previous approaches on LRW. More importantly, by focusing on spectrogram prediction using a simple feedforward model, we can efficiently and effectively scale our method to very large and unconstrained datasets: To the best of our knowledge, we are the first to show intelligible results on the challenging LRS3 dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.02058v2-abstract-full').style.display = 'none'; document.getElementById('2205.02058v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 August, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted to INTERSPEECH 2022 (Oral Presentation)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2205.01389">arXiv:2205.01389</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2205.01389">pdf</a>, <a href="https://arxiv.org/format/2205.01389">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Sampling-free obstacle gradients and reactive planning in Neural Radiance Fields (NeRF) </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Michael Pantic</a>, <a href="/search/cs?searchtype=author&amp;query=Cadena%2C+C">Cesar Cadena</a>, <a href="/search/cs?searchtype=author&amp;query=Siegwart%2C+R">Roland Siegwart</a>, <a href="/search/cs?searchtype=author&amp;query=Ott%2C+L">Lionel Ott</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2205.01389v1-abstract-short" style="display: inline;"> This work investigates the use of Neural implicit representations, specifically Neural Radiance Fields (NeRF), for geometrical queries and motion planning. We show that by adding the capacity to infer occupancy in a radius to a pre-trained NeRF, we are effectively learning an approximation to a Euclidean Signed Distance Field (ESDF). Using backward differentiation of the augmented network, we obta&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.01389v1-abstract-full').style.display = 'inline'; document.getElementById('2205.01389v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2205.01389v1-abstract-full" style="display: none;"> This work investigates the use of Neural implicit representations, specifically Neural Radiance Fields (NeRF), for geometrical queries and motion planning. We show that by adding the capacity to infer occupancy in a radius to a pre-trained NeRF, we are effectively learning an approximation to a Euclidean Signed Distance Field (ESDF). Using backward differentiation of the augmented network, we obtain an obstacle gradient that is integrated into an obstacle avoidance policy based on the Riemannian Motion Policies (RMP) framework. Thus, our findings allow for very fast sampling-free obstacle avoidance planning in the implicit representation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.01389v1-abstract-full').style.display = 'none'; document.getElementById('2205.01389v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to the &#34;Motion Planning with Implicit Neural Representations of Geometry&#34; Workshop at ICRA 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2203.13166">arXiv:2203.13166</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2203.13166">pdf</a>, <a href="https://arxiv.org/format/2203.13166">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TPAMI.2023.3243812">10.1109/TPAMI.2023.3243812 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Self-supervised Video-centralised Transformer for Video Face Clustering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yujiang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+M">Mingzhi Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+J">Jie Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+Y">Yiming Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Y">Yiming Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+P">Pingchuan Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Petridis%2C+S">Stavros Petridis</a>, <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Maja Pantic</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2203.13166v4-abstract-short" style="display: inline;"> This paper presents a novel method for face clustering in videos using a video-centralised transformer. Previous works often employed contrastive learning to learn frame-level representation and used average pooling to aggregate the features along the temporal dimension. This approach may not fully capture the complicated video dynamics. In addition, despite the recent progress in video-based cont&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.13166v4-abstract-full').style.display = 'inline'; document.getElementById('2203.13166v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2203.13166v4-abstract-full" style="display: none;"> This paper presents a novel method for face clustering in videos using a video-centralised transformer. Previous works often employed contrastive learning to learn frame-level representation and used average pooling to aggregate the features along the temporal dimension. This approach may not fully capture the complicated video dynamics. In addition, despite the recent progress in video-based contrastive learning, few have attempted to learn a self-supervised clustering-friendly face representation that benefits the video face clustering task. To overcome these limitations, our method employs a transformer to directly learn video-level representations that can better reflect the temporally-varying property of faces in videos, while we also propose a video-centralised self-supervised framework to train the transformer model. We also investigate face clustering in egocentric videos, a fast-emerging field that has not been studied yet in works related to face clustering. To this end, we present and release the first large-scale egocentric video face clustering dataset named EasyCom-Clustering. We evaluate our proposed method on both the widely used Big Bang Theory (BBT) dataset and the new EasyCom-Clustering dataset. Results show the performance of our video-centralised transformer has surpassed all previous state-of-the-art methods on both benchmarks, exhibiting a self-attentive understanding of face videos. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.13166v4-abstract-full').style.display = 'none'; document.getElementById('2203.13166v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2202.13084">arXiv:2202.13084</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2202.13084">pdf</a>, <a href="https://arxiv.org/format/2202.13084">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1038/s42256-022-00550-z">10.1038/s42256-022-00550-z <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Visual Speech Recognition for Multiple Languages in the Wild </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ma%2C+P">Pingchuan Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Petridis%2C+S">Stavros Petridis</a>, <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Maja Pantic</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2202.13084v2-abstract-short" style="display: inline;"> Visual speech recognition (VSR) aims to recognize the content of speech based on lip movements, without relying on the audio stream. Advances in deep learning and the availability of large audio-visual datasets have led to the development of much more accurate and robust VSR models than ever before. However, these advances are usually due to the larger training sets rather than the model design. H&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.13084v2-abstract-full').style.display = 'inline'; document.getElementById('2202.13084v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2202.13084v2-abstract-full" style="display: none;"> Visual speech recognition (VSR) aims to recognize the content of speech based on lip movements, without relying on the audio stream. Advances in deep learning and the availability of large audio-visual datasets have led to the development of much more accurate and robust VSR models than ever before. However, these advances are usually due to the larger training sets rather than the model design. Here we demonstrate that designing better models is equally as important as using larger training sets. We propose the addition of prediction-based auxiliary tasks to a VSR model, and highlight the importance of hyperparameter optimization and appropriate data augmentations. We show that such a model works for different languages and outperforms all previous methods trained on publicly available datasets by a large margin. It even outperforms models that were trained on non-publicly available datasets containing up to to 21 times more data. We show, furthermore, that using additional training data, even in other languages or with automatically generated transcriptions, results in further improvement. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.13084v2-abstract-full').style.display = 'none'; document.getElementById('2202.13084v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 February, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published in Nature Machine Intelligence</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2201.07131">arXiv:2201.07131</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2201.07131">pdf</a>, <a href="https://arxiv.org/format/2201.07131">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Leveraging Real Talking Faces via Self-Supervision for Robust Forgery Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Haliassos%2C+A">Alexandros Haliassos</a>, <a href="/search/cs?searchtype=author&amp;query=Mira%2C+R">Rodrigo Mira</a>, <a href="/search/cs?searchtype=author&amp;query=Petridis%2C+S">Stavros Petridis</a>, <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Maja Pantic</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2201.07131v3-abstract-short" style="display: inline;"> One of the most pressing challenges for the detection of face-manipulated videos is generalising to forgery methods not seen during training while remaining effective under common corruptions such as compression. In this paper, we examine whether we can tackle this issue by harnessing videos of real talking faces, which contain rich information on natural facial appearance and behaviour and are re&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2201.07131v3-abstract-full').style.display = 'inline'; document.getElementById('2201.07131v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2201.07131v3-abstract-full" style="display: none;"> One of the most pressing challenges for the detection of face-manipulated videos is generalising to forgery methods not seen during training while remaining effective under common corruptions such as compression. In this paper, we examine whether we can tackle this issue by harnessing videos of real talking faces, which contain rich information on natural facial appearance and behaviour and are readily available in large quantities online. Our method, termed RealForensics, consists of two stages. First, we exploit the natural correspondence between the visual and auditory modalities in real videos to learn, in a self-supervised cross-modal manner, temporally dense video representations that capture factors such as facial movements, expression, and identity. Second, we use these learned representations as targets to be predicted by our forgery detector along with the usual binary forgery classification task; this encourages it to base its real/fake decision on said factors. We show that our method achieves state-of-the-art performance on cross-manipulation generalisation and robustness experiments, and examine the factors that contribute to its performance. Our results suggest that leveraging natural and unlabelled videos is a promising direction for the development of more robust face forgery detectors. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2201.07131v3-abstract-full').style.display = 'none'; document.getElementById('2201.07131v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 January, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR 2022. Code: https://github.com/ahaliassos/RealForensics</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2111.06271">arXiv:2111.06271</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2111.06271">pdf</a>, <a href="https://arxiv.org/format/2111.06271">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Multi-Resolution Elevation Mapping and Safe Landing Site Detection with Applications to Planetary Rotorcraft </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Schoppmann%2C+P">Pascal Schoppmann</a>, <a href="/search/cs?searchtype=author&amp;query=Proen%C3%A7a%2C+P+F">Pedro F. Proen莽a</a>, <a href="/search/cs?searchtype=author&amp;query=Delaune%2C+J">Jeff Delaune</a>, <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Michael Pantic</a>, <a href="/search/cs?searchtype=author&amp;query=Hinzmann%2C+T">Timo Hinzmann</a>, <a href="/search/cs?searchtype=author&amp;query=Matthies%2C+L">Larry Matthies</a>, <a href="/search/cs?searchtype=author&amp;query=Siegwart%2C+R">Roland Siegwart</a>, <a href="/search/cs?searchtype=author&amp;query=Brockers%2C+R">Roland Brockers</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2111.06271v1-abstract-short" style="display: inline;"> In this paper, we propose a resource-efficient approach to provide an autonomous UAV with an on-board perception method to detect safe, hazard-free landing sites during flights over complex 3D terrain. We aggregate 3D measurements acquired from a sequence of monocular images by a Structure-from-Motion approach into a local, robot-centric, multi-resolution elevation map of the overflown terrain, wh&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.06271v1-abstract-full').style.display = 'inline'; document.getElementById('2111.06271v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2111.06271v1-abstract-full" style="display: none;"> In this paper, we propose a resource-efficient approach to provide an autonomous UAV with an on-board perception method to detect safe, hazard-free landing sites during flights over complex 3D terrain. We aggregate 3D measurements acquired from a sequence of monocular images by a Structure-from-Motion approach into a local, robot-centric, multi-resolution elevation map of the overflown terrain, which fuses depth measurements according to their lateral surface resolution (pixel-footprint) in a probabilistic framework based on the concept of dynamic Level of Detail. Map aggregation only requires depth maps and the associated poses, which are obtained from an onboard Visual Odometry algorithm. An efficient landing site detection method then exploits the features of the underlying multi-resolution map to detect safe landing sites based on slope, roughness, and quality of the reconstructed terrain surface. The evaluation of the performance of the mapping and landing site detection modules are analyzed independently and jointly in simulated and real-world experiments in order to establish the efficacy of the proposed approach. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.06271v1-abstract-full').style.display = 'none'; document.getElementById('2111.06271v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 12 figures. Accepted at IROS 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2110.13859">arXiv:2110.13859</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2110.13859">pdf</a>, <a href="https://arxiv.org/format/2110.13859">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Defensive Tensorization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bulat%2C+A">Adrian Bulat</a>, <a href="/search/cs?searchtype=author&amp;query=Kossaifi%2C+J">Jean Kossaifi</a>, <a href="/search/cs?searchtype=author&amp;query=Bhattacharya%2C+S">Sourav Bhattacharya</a>, <a href="/search/cs?searchtype=author&amp;query=Panagakis%2C+Y">Yannis Panagakis</a>, <a href="/search/cs?searchtype=author&amp;query=Hospedales%2C+T">Timothy Hospedales</a>, <a href="/search/cs?searchtype=author&amp;query=Tzimiropoulos%2C+G">Georgios Tzimiropoulos</a>, <a href="/search/cs?searchtype=author&amp;query=Lane%2C+N+D">Nicholas D Lane</a>, <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Maja Pantic</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2110.13859v1-abstract-short" style="display: inline;"> We propose defensive tensorization, an adversarial defence technique that leverages a latent high-order factorization of the network. The layers of a network are first expressed as factorized tensor layers. Tensor dropout is then applied in the latent subspace, therefore resulting in dense reconstructed weights, without the sparsity or perturbations typically induced by the randomization.Our appro&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.13859v1-abstract-full').style.display = 'inline'; document.getElementById('2110.13859v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2110.13859v1-abstract-full" style="display: none;"> We propose defensive tensorization, an adversarial defence technique that leverages a latent high-order factorization of the network. The layers of a network are first expressed as factorized tensor layers. Tensor dropout is then applied in the latent subspace, therefore resulting in dense reconstructed weights, without the sparsity or perturbations typically induced by the randomization.Our approach can be readily integrated with any arbitrary neural architecture and combined with techniques like adversarial training. We empirically demonstrate the effectiveness of our approach on standard image classification benchmarks. We validate the versatility of our approach across domains and low-precision architectures by considering an audio classification task and binary networks. In all cases, we demonstrate improved performance compared to prior works. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.13859v1-abstract-full').style.display = 'none'; document.getElementById('2110.13859v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To be presented at BMVC 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2110.09168">arXiv:2110.09168</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2110.09168">pdf</a>, <a href="https://arxiv.org/format/2110.09168">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Domain Generalisation for Apparent Emotional Facial Expression Recognition across Age-Groups </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Poyiadzi%2C+R">Rafael Poyiadzi</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+J">Jie Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Petridis%2C+S">Stavros Petridis</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yujiang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Maja Pantic</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2110.09168v1-abstract-short" style="display: inline;"> Apparent emotional facial expression recognition has attracted a lot of research attention recently. However, the majority of approaches ignore age differences and train a generic model for all ages. In this work, we study the effect of using different age-groups for training apparent emotional facial expression recognition models. To this end, we study Domain Generalisation in the context of appa&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.09168v1-abstract-full').style.display = 'inline'; document.getElementById('2110.09168v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2110.09168v1-abstract-full" style="display: none;"> Apparent emotional facial expression recognition has attracted a lot of research attention recently. However, the majority of approaches ignore age differences and train a generic model for all ages. In this work, we study the effect of using different age-groups for training apparent emotional facial expression recognition models. To this end, we study Domain Generalisation in the context of apparent emotional facial expression recognition from facial imagery across different age groups. We first compare several domain generalisation algorithms on the basis of out-of-domain-generalisation, and observe that the Class-Conditional Domain-Adversarial Neural Networks (CDANN) algorithm has the best performance. We then study the effect of variety and number of age-groups used during training on generalisation to unseen age-groups and observe that an increase in the number of training age-groups tends to increase the apparent emotional facial expression recognition performance on unseen age-groups. We also show that exclusion of an age-group during training tends to affect more the performance of the neighbouring age groups. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.09168v1-abstract-full').style.display = 'none'; document.getElementById('2110.09168v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2107.04174">arXiv:2107.04174</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2107.04174">pdf</a>, <a href="https://arxiv.org/format/2107.04174">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> EasyCom: An Augmented Reality Dataset to Support Algorithms for Easy Communication in Noisy Environments </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Donley%2C+J">Jacob Donley</a>, <a href="/search/cs?searchtype=author&amp;query=Tourbabin%2C+V">Vladimir Tourbabin</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+J">Jung-Suk Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Broyles%2C+M">Mark Broyles</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+H">Hao Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+J">Jie Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Maja Pantic</a>, <a href="/search/cs?searchtype=author&amp;query=Ithapu%2C+V+K">Vamsi Krishna Ithapu</a>, <a href="/search/cs?searchtype=author&amp;query=Mehra%2C+R">Ravish Mehra</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2107.04174v2-abstract-short" style="display: inline;"> Augmented Reality (AR) as a platform has the potential to facilitate the reduction of the cocktail party effect. Future AR headsets could potentially leverage information from an array of sensors spanning many different modalities. Training and testing signal processing and machine learning algorithms on tasks such as beam-forming and speech enhancement require high quality representative data. To&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2107.04174v2-abstract-full').style.display = 'inline'; document.getElementById('2107.04174v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2107.04174v2-abstract-full" style="display: none;"> Augmented Reality (AR) as a platform has the potential to facilitate the reduction of the cocktail party effect. Future AR headsets could potentially leverage information from an array of sensors spanning many different modalities. Training and testing signal processing and machine learning algorithms on tasks such as beam-forming and speech enhancement require high quality representative data. To the best of the author&#39;s knowledge, as of publication there are no available datasets that contain synchronized egocentric multi-channel audio and video with dynamic movement and conversations in a noisy environment. In this work, we describe, evaluate and release a dataset that contains over 5 hours of multi-modal data useful for training and testing algorithms for the application of improving conversations for an AR glasses wearer. We provide speech intelligibility, quality and signal-to-noise ratio improvement results for a baseline method and show improvements across all tested metrics. The dataset we are releasing contains AR glasses egocentric multi-channel microphone array audio, wide field-of-view RGB video, speech source pose, headset microphone audio, annotated voice activity, speech transcriptions, head bounding boxes, target of speech and source identification labels. We have created and are releasing this dataset to facilitate research in multi-modal AR solutions to the cocktail party problem. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2107.04174v2-abstract-full').style.display = 'none'; document.getElementById('2107.04174v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 July, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Dataset is available at: https://github.com/facebookresearch/EasyComDataset</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2106.11145">arXiv:2106.11145</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2106.11145">pdf</a>, <a href="https://arxiv.org/format/2106.11145">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> FP-Age: Leveraging Face Parsing Attention for Facial Age Estimation in the Wild </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Y">Yiming Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+J">Jie Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yujiang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Maja Pantic</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2106.11145v2-abstract-short" style="display: inline;"> Image-based age estimation aims to predict a person&#39;s age from facial images. It is used in a variety of real-world applications. Although end-to-end deep models have achieved impressive results for age estimation on benchmark datasets, their performance in-the-wild still leaves much room for improvement due to the challenges caused by large variations in head pose, facial expressions, and occlusi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.11145v2-abstract-full').style.display = 'inline'; document.getElementById('2106.11145v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2106.11145v2-abstract-full" style="display: none;"> Image-based age estimation aims to predict a person&#39;s age from facial images. It is used in a variety of real-world applications. Although end-to-end deep models have achieved impressive results for age estimation on benchmark datasets, their performance in-the-wild still leaves much room for improvement due to the challenges caused by large variations in head pose, facial expressions, and occlusions. To address this issue, we propose a simple yet effective method to explicitly incorporate facial semantics into age estimation, so that the model would learn to correctly focus on the most informative facial components from unaligned facial images regardless of head pose and non-rigid deformation. To this end, we design a face parsing-based network to learn semantic information at different scales and a novel face parsing attention module to leverage these semantic features for age estimation. To evaluate our method on in-the-wild data, we also introduce a new challenging large-scale benchmark called IMDB-Clean. This dataset is created by semi-automatically cleaning the noisy IMDB-WIKI dataset using a constrained clustering method. Through comprehensive experiment on IMDB-Clean and other benchmark datasets, under both intra-dataset and cross-dataset evaluation protocols, we show that our method consistently outperforms all existing age estimation methods and achieves a new state-of-the-art performance. To the best of our knowledge, our work presents the first attempt of leveraging face parsing attention to achieve semantic-aware age estimation, which may be inspiring to other high level facial analysis tasks. Code and data are available on \url{https://github.com/ibug-group/fpage}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.11145v2-abstract-full').style.display = 'none'; document.getElementById('2106.11145v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by Transactions of Image Processing. Code and data are available on https://github.com/ibug-group/fpage</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2106.10108">arXiv:2106.10108</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2106.10108">pdf</a>, <a href="https://arxiv.org/format/2106.10108">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.55417/fr.2022034">10.55417/fr.2022034 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Under the Sand: Navigation and Localization of a Micro Aerial Vehicle for Landmine Detection with Ground Penetrating Synthetic Aperture Radar </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=B%C3%A4hnemann%2C+R">Rik B盲hnemann</a>, <a href="/search/cs?searchtype=author&amp;query=Lawrance%2C+N">Nicholas Lawrance</a>, <a href="/search/cs?searchtype=author&amp;query=Streichenberg%2C+L">Lucas Streichenberg</a>, <a href="/search/cs?searchtype=author&amp;query=Chung%2C+J+J">Jen Jen Chung</a>, <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Michael Pantic</a>, <a href="/search/cs?searchtype=author&amp;query=Grathwohl%2C+A">Alexander Grathwohl</a>, <a href="/search/cs?searchtype=author&amp;query=Waldschmidt%2C+C">Christian Waldschmidt</a>, <a href="/search/cs?searchtype=author&amp;query=Siegwart%2C+R">Roland Siegwart</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2106.10108v2-abstract-short" style="display: inline;"> Ground penetrating radar mounted on micro aerial vehicle (MAV) is a promising tool to assist humanitarian landmine clearance. However, the quality of synthetic aperture radar images depends on accurate and precise motion estimation of the radar antennas as well as generating informative viewpoints with the MAV. This paper presents a complete and automatic airborne ground-penetrating synthetic aper&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.10108v2-abstract-full').style.display = 'inline'; document.getElementById('2106.10108v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2106.10108v2-abstract-full" style="display: none;"> Ground penetrating radar mounted on micro aerial vehicle (MAV) is a promising tool to assist humanitarian landmine clearance. However, the quality of synthetic aperture radar images depends on accurate and precise motion estimation of the radar antennas as well as generating informative viewpoints with the MAV. This paper presents a complete and automatic airborne ground-penetrating synthetic aperture radar (GPSAR) system. The system consists of a spatially calibrated and temporally synchronized industrial grade sensor suite that enables navigation above ground level, radar imaging, and optical imaging. A custom mission planning framework allows generation and automatic execution of stripmap and circular (GPSAR) trajectories controlled above ground level as well as aerial imaging survey flights. A factor graph based state estimator fuses measurements from dual receiver real-time kinematic (RTK) global navigation satellite system (GNSS) and inertial measurement unit (IMU) to obtain precise, high rate platform positions and orientations. Ground truth experiments showed sensor timing as accurate as 0.8 us and as precise as 0.1 us with localization rates of 1 kHz. The dual position factor formulation improves online localization accuracy up to 40% and batch localization accuracy up to 59% compared to a single position factor with uncertain heading initialization. Our field trials validated a localization accuracy and precision that enables coherent radar measurement addition and detection of radar targets buried in sand. This validates the potential as an aerial landmine detection system. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.10108v2-abstract-full').style.display = 'none'; document.getElementById('2106.10108v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 December, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to Field Robotics journal in June 2021. First revision submitted December 2021</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Report number:</span> 20.500.11850/572552 </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Field Robotics, 2, (2022), 1028-1067 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2106.09171">arXiv:2106.09171</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2106.09171">pdf</a>, <a href="https://arxiv.org/format/2106.09171">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> LiRA: Learning Visual Speech Representations from Audio through Self-supervision </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ma%2C+P">Pingchuan Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Mira%2C+R">Rodrigo Mira</a>, <a href="/search/cs?searchtype=author&amp;query=Petridis%2C+S">Stavros Petridis</a>, <a href="/search/cs?searchtype=author&amp;query=Schuller%2C+B+W">Bj枚rn W. Schuller</a>, <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Maja Pantic</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2106.09171v1-abstract-short" style="display: inline;"> The large amount of audiovisual content being shared online today has drawn substantial attention to the prospect of audiovisual self-supervised learning. Recent works have focused on each of these modalities separately, while others have attempted to model both simultaneously in a cross-modal fashion. However, comparatively little attention has been given to leveraging one modality as a training&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.09171v1-abstract-full').style.display = 'inline'; document.getElementById('2106.09171v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2106.09171v1-abstract-full" style="display: none;"> The large amount of audiovisual content being shared online today has drawn substantial attention to the prospect of audiovisual self-supervised learning. Recent works have focused on each of these modalities separately, while others have attempted to model both simultaneously in a cross-modal fashion. However, comparatively little attention has been given to leveraging one modality as a training objective to learn from the other. In this work, we propose Learning visual speech Representations from Audio via self-supervision (LiRA). Specifically, we train a ResNet+Conformer model to predict acoustic features from unlabelled visual speech. We find that this pre-trained model can be leveraged towards word-level and sentence-level lip-reading through feature extraction and fine-tuning experiments. We show that our approach significantly outperforms other self-supervised methods on the Lip Reading in the Wild (LRW) dataset and achieves state-of-the-art performance on Lip Reading Sentences 2 (LRS2) using only a fraction of the total labelled data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.09171v1-abstract-full').style.display = 'none'; document.getElementById('2106.09171v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for publication at Interspeech 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2104.13332">arXiv:2104.13332</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2104.13332">pdf</a>, <a href="https://arxiv.org/format/2104.13332">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TCYB.2022.3162495">10.1109/TCYB.2022.3162495 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> End-to-End Video-To-Speech Synthesis using Generative Adversarial Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Mira%2C+R">Rodrigo Mira</a>, <a href="/search/cs?searchtype=author&amp;query=Vougioukas%2C+K">Konstantinos Vougioukas</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+P">Pingchuan Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Petridis%2C+S">Stavros Petridis</a>, <a href="/search/cs?searchtype=author&amp;query=Schuller%2C+B+W">Bj枚rn W. Schuller</a>, <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Maja Pantic</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2104.13332v3-abstract-short" style="display: inline;"> Video-to-speech is the process of reconstructing the audio speech from a video of a spoken utterance. Previous approaches to this task have relied on a two-step process where an intermediate representation is inferred from the video, and is then decoded into waveform audio using a vocoder or a waveform reconstruction algorithm. In this work, we propose a new end-to-end video-to-speech model based&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.13332v3-abstract-full').style.display = 'inline'; document.getElementById('2104.13332v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2104.13332v3-abstract-full" style="display: none;"> Video-to-speech is the process of reconstructing the audio speech from a video of a spoken utterance. Previous approaches to this task have relied on a two-step process where an intermediate representation is inferred from the video, and is then decoded into waveform audio using a vocoder or a waveform reconstruction algorithm. In this work, we propose a new end-to-end video-to-speech model based on Generative Adversarial Networks (GANs) which translates spoken video to waveform end-to-end without using any intermediate representation or separate waveform synthesis algorithm. Our model consists of an encoder-decoder architecture that receives raw video as input and generates speech, which is then fed to a waveform critic and a power critic. The use of an adversarial loss based on these two critics enables the direct synthesis of raw audio waveform and ensures its realism. In addition, the use of our three comparative losses helps establish direct correspondence between the generated audio and the input video. We show that this model is able to reconstruct speech with remarkable realism for constrained datasets such as GRID, and that it is the first end-to-end model to produce intelligible speech for LRW (Lip Reading in the Wild), featuring hundreds of speakers recorded entirely `in the wild&#39;. We evaluate the generated samples in two different scenarios -- seen and unseen speakers -- using four objective metrics which measure the quality and intelligibility of artificial speech. We demonstrate that the proposed approach outperforms all previous works in most metrics on GRID and LRW. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.13332v3-abstract-full').style.display = 'none'; document.getElementById('2104.13332v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 August, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 April, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published in IEEE Transactions on Cybernetics (April 2022)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2102.10313">arXiv:2102.10313</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2102.10313">pdf</a>, <a href="https://arxiv.org/format/2102.10313">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Mesh Manifold based Riemannian Motion Planning for Omnidirectional Micro Aerial Vehicles </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Michael Pantic</a>, <a href="/search/cs?searchtype=author&amp;query=Ott%2C+L">Lionel Ott</a>, <a href="/search/cs?searchtype=author&amp;query=Cadena%2C+C">Cesar Cadena</a>, <a href="/search/cs?searchtype=author&amp;query=Siegwart%2C+R">Roland Siegwart</a>, <a href="/search/cs?searchtype=author&amp;query=Nieto%2C+J">Juan Nieto</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2102.10313v1-abstract-short" style="display: inline;"> This paper presents a novel on-line path planning method that enables aerial robots to interact with surfaces. We present a solution to the problem of finding trajectories that drive a robot towards a surface and move along it. Triangular meshes are used as a surface map representation that is free of fixed discretization and allows for very large workspaces. We propose to leverage planar parametr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.10313v1-abstract-full').style.display = 'inline'; document.getElementById('2102.10313v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2102.10313v1-abstract-full" style="display: none;"> This paper presents a novel on-line path planning method that enables aerial robots to interact with surfaces. We present a solution to the problem of finding trajectories that drive a robot towards a surface and move along it. Triangular meshes are used as a surface map representation that is free of fixed discretization and allows for very large workspaces. We propose to leverage planar parametrization methods to obtain a lower-dimensional topologically equivalent representation of the original surface. Furthermore, we interpret the original surface and its lower-dimensional representation as manifold approximations that allow the use of Riemannian Motion Policies (RMPs), resulting in an efficient, versatile, and elegant motion generation framework. We compare against several Rapidly-exploring Random Tree (RRT) planners, a customized CHOMP variant, and the discrete geodesic algorithm. Using extensive simulations on real-world data we show that the proposed planner can reliably plan high-quality near-optimal trajectories at minimal computational cost. The accompanying multimedia attachment demonstrates feasibility on a real OMAV. The obtained paths show less than 10% deviation from the theoretical optimum while facilitating reactive re-planning at kHz refresh rates, enabling flying robots to perform motion planning for interaction with complex surfaces. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.10313v1-abstract-full').style.display = 'none'; document.getElementById('2102.10313v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 February, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to RA-L 2021. Video available at https://youtu.be/S9gRr_ryREo</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2102.09281">arXiv:2102.09281</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2102.09281">pdf</a>, <a href="https://arxiv.org/format/2102.09281">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> DINO: A Conditional Energy-Based GAN for Domain Translation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Vougioukas%2C+K">Konstantinos Vougioukas</a>, <a href="/search/cs?searchtype=author&amp;query=Petridis%2C+S">Stavros Petridis</a>, <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Maja Pantic</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2102.09281v1-abstract-short" style="display: inline;"> Domain translation is the process of transforming data from one domain to another while preserving the common semantics. Some of the most popular domain translation systems are based on conditional generative adversarial networks, which use source domain data to drive the generator and as an input to the discriminator. However, this approach does not enforce the preservation of shared semantics si&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.09281v1-abstract-full').style.display = 'inline'; document.getElementById('2102.09281v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2102.09281v1-abstract-full" style="display: none;"> Domain translation is the process of transforming data from one domain to another while preserving the common semantics. Some of the most popular domain translation systems are based on conditional generative adversarial networks, which use source domain data to drive the generator and as an input to the discriminator. However, this approach does not enforce the preservation of shared semantics since the conditional input can often be ignored by the discriminator. We propose an alternative method for conditioning and present a new framework, where two networks are simultaneously trained, in a supervised manner, to perform domain translation in opposite directions. Our method is not only better at capturing the shared information between two domains but is more generic and can be applied to a broader range of problems. The proposed framework performs well even in challenging cross-modal translations, such as video-driven speech reconstruction, for which other systems struggle to maintain correspondence. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.09281v1-abstract-full').style.display = 'none'; document.getElementById('2102.09281v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ICLR 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2102.06657">arXiv:2102.06657</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2102.06657">pdf</a>, <a href="https://arxiv.org/format/2102.06657">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> End-to-end Audio-visual Speech Recognition with Conformers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ma%2C+P">Pingchuan Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Petridis%2C+S">Stavros Petridis</a>, <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Maja Pantic</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2102.06657v1-abstract-short" style="display: inline;"> In this work, we present a hybrid CTC/Attention model based on a ResNet-18 and Convolution-augmented transformer (Conformer), that can be trained in an end-to-end manner. In particular, the audio and visual encoders learn to extract features directly from raw pixels and audio waveforms, respectively, which are then fed to conformers and then fusion takes place via a Multi-Layer Perceptron (MLP). T&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.06657v1-abstract-full').style.display = 'inline'; document.getElementById('2102.06657v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2102.06657v1-abstract-full" style="display: none;"> In this work, we present a hybrid CTC/Attention model based on a ResNet-18 and Convolution-augmented transformer (Conformer), that can be trained in an end-to-end manner. In particular, the audio and visual encoders learn to extract features directly from raw pixels and audio waveforms, respectively, which are then fed to conformers and then fusion takes place via a Multi-Layer Perceptron (MLP). The model learns to recognise characters using a combination of CTC and an attention mechanism. We show that end-to-end training, instead of using pre-computed visual features which is common in the literature, the use of a conformer, instead of a recurrent network, and the use of a transformer-based language model, significantly improve the performance of our model. We present results on the largest publicly available datasets for sentence-level speech recognition, Lip Reading Sentences 2 (LRS2) and Lip Reading Sentences 3 (LRS3), respectively. The results show that our proposed models raise the state-of-the-art performance by a large margin in audio-only, visual-only, and audio-visual experiments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.06657v1-abstract-full').style.display = 'none'; document.getElementById('2102.06657v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ICASSP 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2102.02717">arXiv:2102.02717</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2102.02717">pdf</a>, <a href="https://arxiv.org/format/2102.02717">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> RoI Tanh-polar Transformer Network for Face Parsing in the Wild </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Y">Yiming Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+J">Jie Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yujiang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Maja Pantic</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2102.02717v3-abstract-short" style="display: inline;"> Face parsing aims to predict pixel-wise labels for facial components of a target face in an image. Existing approaches usually crop the target face from the input image with respect to a bounding box calculated during pre-processing, and thus can only parse inner facial Regions of Interest~(RoIs). Peripheral regions like hair are ignored and nearby faces that are partially included in the bounding&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.02717v3-abstract-full').style.display = 'inline'; document.getElementById('2102.02717v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2102.02717v3-abstract-full" style="display: none;"> Face parsing aims to predict pixel-wise labels for facial components of a target face in an image. Existing approaches usually crop the target face from the input image with respect to a bounding box calculated during pre-processing, and thus can only parse inner facial Regions of Interest~(RoIs). Peripheral regions like hair are ignored and nearby faces that are partially included in the bounding box can cause distractions. Moreover, these methods are only trained and evaluated on near-frontal portrait images and thus their performance for in-the-wild cases has been unexplored. To address these issues, this paper makes three contributions. First, we introduce iBugMask dataset for face parsing in the wild, which consists of 21,866 training images and 1,000 testing images. The training images are obtained by augmenting an existing dataset with large face poses. The testing images are manually annotated with $11$ facial regions and there are large variations in sizes, poses, expressions and background. Second, we propose RoI Tanh-polar transform that warps the whole image to a Tanh-polar representation with a fixed ratio between the face area and the context, guided by the target bounding box. The new representation contains all information in the original image, and allows for rotation equivariance in the convolutional neural networks~(CNNs). Third, we propose a hybrid residual representation learning block, coined HybridBlock, that contains convolutional layers in both the Tanh-polar space and the Tanh-Cartesian space, allowing for receptive fields of different shapes in CNNs. Through extensive experiments, we show that the proposed method improves the state-of-the-art for face parsing in the wild and does not require facial landmarks for alignment. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.02717v3-abstract-full').style.display = 'none'; document.getElementById('2102.02717v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 May, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 February, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at Image and Vision Computing. Code is available on https://github.com/hhj1897/face_parsing</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2101.02149">arXiv:2101.02149</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2101.02149">pdf</a>, <a href="https://arxiv.org/format/2101.02149">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Cauchy-Schwarz Regularized Autoencoder </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tran%2C+L">Linh Tran</a>, <a href="/search/cs?searchtype=author&amp;query=Pantic%2C+M">Maja Pantic</a>, <a href="/search/cs?searchtype=author&amp;query=Deisenroth%2C+M+P">Marc Peter Deisenroth</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2101.02149v2-abstract-short" style="display: inline;"> Recent work in unsupervised learning has focused on efficient inference and learning in latent variables models. Training these models by maximizing the evidence (marginal likelihood) is typically intractable. Thus, a common approximation is to maximize the Evidence Lower BOund (ELBO) instead. Variational autoencoders (VAE) are a powerful and widely-used class of generative models that optimize th&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2101.02149v2-abstract-full').style.display = 'inline'; document.getElementById('2101.02149v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2101.02149v2-abstract-full" style="display: none;"> Recent work in unsupervised learning has focused on efficient inference and learning in latent variables models. Training these models by maximizing the evidence (marginal likelihood) is typically intractable. Thus, a common approximation is to maximize the Evidence Lower BOund (ELBO) instead. Variational autoencoders (VAE) are a powerful and widely-used class of generative models that optimize the ELBO efficiently for large datasets. However, the VAE&#39;s default Gaussian choice for the prior imposes a strong constraint on its ability to represent the true posterior, thereby degrading overall performance. A Gaussian mixture model (GMM) would be a richer prior, but cannot be handled efficiently within the VAE framework because of the intractability of the Kullback-Leibler divergence for GMMs. We deviate from the common VAE framework in favor of one with an analytical solution for Gaussian mixture prior. To perform efficient inference for GMM priors, we introduce a new constrained objective based on the Cauchy-Schwarz divergence, which can be computed analytically for GMMs. This new objective allows us to incorporate richer, multi-modal priors into the autoencoding framework. We provide empirical studies on a range of datasets and show that our objective improves upon variational auto-encoding models in density estimation, unsupervised clustering, semi-supervised learning, and face analysis. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2101.02149v2-abstract-full').style.display = 'none'; document.getElementById('2101.02149v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 January, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2021. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Pantic%2C+M&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Pantic%2C+M&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Pantic%2C+M&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Pantic%2C+M&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

Pages: 1 2 3 4 5 6 7 8 9 10