CINXE.COM

Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;48 of 48 results for author: <span class="mathjax">Bello, J</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Bello%2C+J">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Bello, J"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Bello%2C+J&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Bello, J"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.09982">arXiv:2412.09982</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.09982">pdf</a>, <a href="https://arxiv.org/format/2412.09982">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SplineGS: Robust Motion-Adaptive Spline for Real-Time Dynamic 3D Gaussians from Monocular Video </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Park%2C+J">Jongmin Park</a>, <a href="/search/cs?searchtype=author&amp;query=Bui%2C+M+V">Minh-Quan Viet Bui</a>, <a href="/search/cs?searchtype=author&amp;query=Bello%2C+J+L+G">Juan Luis Gonzalez Bello</a>, <a href="/search/cs?searchtype=author&amp;query=Moon%2C+J">Jaeho Moon</a>, <a href="/search/cs?searchtype=author&amp;query=Oh%2C+J">Jihyong Oh</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+M">Munchurl Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.09982v2-abstract-short" style="display: inline;"> Synthesizing novel views from in-the-wild monocular videos is challenging due to scene dynamics and the lack of multi-view cues. To address this, we propose SplineGS, a COLMAP-free dynamic 3D Gaussian Splatting (3DGS) framework for high-quality reconstruction and fast rendering from monocular videos. At its core is a novel Motion-Adaptive Spline (MAS) method, which represents continuous dynamic 3D&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.09982v2-abstract-full').style.display = 'inline'; document.getElementById('2412.09982v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.09982v2-abstract-full" style="display: none;"> Synthesizing novel views from in-the-wild monocular videos is challenging due to scene dynamics and the lack of multi-view cues. To address this, we propose SplineGS, a COLMAP-free dynamic 3D Gaussian Splatting (3DGS) framework for high-quality reconstruction and fast rendering from monocular videos. At its core is a novel Motion-Adaptive Spline (MAS) method, which represents continuous dynamic 3D Gaussian trajectories using cubic Hermite splines with a small number of control points. For MAS, we introduce a Motion-Adaptive Control points Pruning (MACP) method to model the deformation of each dynamic 3D Gaussian across varying motions, progressively pruning control points while maintaining dynamic modeling integrity. Additionally, we present a joint optimization strategy for camera parameter estimation and 3D Gaussian attributes, leveraging photometric and geometric consistency. This eliminates the need for Structure-from-Motion preprocessing and enhances SplineGS&#39;s robustness in real-world conditions. Experiments show that SplineGS significantly outperforms state-of-the-art methods in novel view synthesis quality for dynamic scenes from monocular videos, achieving thousands times faster rendering speed. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.09982v2-abstract-full').style.display = 'none'; document.getElementById('2412.09982v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The first two authors contributed equally to this work (equal contribution). The last two authors advised equally to this work. Please visit our project page at this https://kaist-viclab.github.io/splinegs-site/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02711">arXiv:2411.02711</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.02711">pdf</a>, <a href="https://arxiv.org/format/2411.02711">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Self-Supervised Multi-View Learning for Disentangled Music Audio Representations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wilkins%2C+J">Julia Wilkins</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+S">Sivan Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Fuentes%2C+M">Magdalena Fuentes</a>, <a href="/search/cs?searchtype=author&amp;query=Bello%2C+J+P">Juan Pablo Bello</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02711v1-abstract-short" style="display: inline;"> Self-supervised learning (SSL) offers a powerful way to learn robust, generalizable representations without labeled data. In music, where labeled data is scarce, existing SSL methods typically use generated supervision and multi-view redundancy to create pretext tasks. However, these approaches often produce entangled representations and lose view-specific information. We propose a novel self-supe&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02711v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02711v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02711v1-abstract-full" style="display: none;"> Self-supervised learning (SSL) offers a powerful way to learn robust, generalizable representations without labeled data. In music, where labeled data is scarce, existing SSL methods typically use generated supervision and multi-view redundancy to create pretext tasks. However, these approaches often produce entangled representations and lose view-specific information. We propose a novel self-supervised multi-view learning framework for audio designed to incentivize separation between private and shared representation spaces. A case study on audio disentanglement in a controlled setting demonstrates the effectiveness of our method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02711v1-abstract-full').style.display = 'none'; document.getElementById('2411.02711v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Late Breaking Demo at ISMIR 2024. https://juliawilkins.github.io/marlbymarl/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.12260">arXiv:2407.12260</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.12260">pdf</a>, <a href="https://arxiv.org/format/2407.12260">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> HuBar: A Visual Analytics Tool to Explore Human Behaviour based on fNIRS in AR guidance systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Castelo%2C+S">Sonia Castelo</a>, <a href="/search/cs?searchtype=author&amp;query=Rulff%2C+J">Joao Rulff</a>, <a href="/search/cs?searchtype=author&amp;query=Solunke%2C+P">Parikshit Solunke</a>, <a href="/search/cs?searchtype=author&amp;query=McGowan%2C+E">Erin McGowan</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+G">Guande Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Roman%2C+I">Iran Roman</a>, <a href="/search/cs?searchtype=author&amp;query=Lopez%2C+R">Roque Lopez</a>, <a href="/search/cs?searchtype=author&amp;query=Steers%2C+B">Bea Steers</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Q">Qi Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Bello%2C+J">Juan Bello</a>, <a href="/search/cs?searchtype=author&amp;query=Feest%2C+B">Bradley Feest</a>, <a href="/search/cs?searchtype=author&amp;query=Middleton%2C+M">Michael Middleton</a>, <a href="/search/cs?searchtype=author&amp;query=Mckendrick%2C+R">Ryan Mckendrick</a>, <a href="/search/cs?searchtype=author&amp;query=Silva%2C+C">Claudio Silva</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.12260v1-abstract-short" style="display: inline;"> The concept of an intelligent augmented reality (AR) assistant has significant, wide-ranging applications, with potential uses in medicine, military, and mechanics domains. Such an assistant must be able to perceive the environment and actions, reason about the environment state in relation to a given task, and seamlessly interact with the task performer. These interactions typically involve an AR&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.12260v1-abstract-full').style.display = 'inline'; document.getElementById('2407.12260v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.12260v1-abstract-full" style="display: none;"> The concept of an intelligent augmented reality (AR) assistant has significant, wide-ranging applications, with potential uses in medicine, military, and mechanics domains. Such an assistant must be able to perceive the environment and actions, reason about the environment state in relation to a given task, and seamlessly interact with the task performer. These interactions typically involve an AR headset equipped with sensors which capture video, audio, and haptic feedback. Previous works have sought to facilitate the development of intelligent AR assistants by visualizing these sensor data streams in conjunction with the assistant&#39;s perception and reasoning model outputs. However, existing visual analytics systems do not focus on user modeling or include biometric data, and are only capable of visualizing a single task session for a single performer at a time. Moreover, they typically assume a task involves linear progression from one step to the next. We propose a visual analytics system that allows users to compare performance during multiple task sessions, focusing on non-linear tasks where different step sequences can lead to success. In particular, we design visualizations for understanding user behavior through functional near-infrared spectroscopy (fNIRS) data as a proxy for perception, attention, and memory as well as corresponding motion data (acceleration, angular velocity, and gaze). We distill these insights into embedding representations that allow users to easily select groups of sessions with similar behaviors. We provide two case studies that demonstrate how to use these visualizations to gain insights about task performance using data collected during helicopter copilot training tasks. Finally, we evaluate our approach by conducting an in-depth examination of a think-aloud experiment with five domain experts. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.12260v1-abstract-full').style.display = 'none'; document.getElementById('2407.12260v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 6 figures. This is the author&#39;s version of the article that has been accepted for publication in IEEE Transactions on Visualization and Computer Graphics (TVCG)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.12238">arXiv:2401.12238</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2401.12238">pdf</a>, <a href="https://arxiv.org/format/2401.12238">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Spatial Scaper: A Library to Simulate and Augment Soundscapes for Sound Event Localization and Detection in Realistic Rooms </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Roman%2C+I+R">Iran R. Roman</a>, <a href="/search/cs?searchtype=author&amp;query=Ick%2C+C">Christopher Ick</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+S">Sivan Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Roman%2C+A+S">Adrian S. Roman</a>, <a href="/search/cs?searchtype=author&amp;query=McFee%2C+B">Brian McFee</a>, <a href="/search/cs?searchtype=author&amp;query=Bello%2C+J+P">Juan P. Bello</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.12238v1-abstract-short" style="display: inline;"> Sound event localization and detection (SELD) is an important task in machine listening. Major advancements rely on simulated data with sound events in specific rooms and strong spatio-temporal labels. SELD data is simulated by convolving spatialy-localized room impulse responses (RIRs) with sound waveforms to place sound events in a soundscape. However, RIRs require manual collection in specific&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.12238v1-abstract-full').style.display = 'inline'; document.getElementById('2401.12238v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.12238v1-abstract-full" style="display: none;"> Sound event localization and detection (SELD) is an important task in machine listening. Major advancements rely on simulated data with sound events in specific rooms and strong spatio-temporal labels. SELD data is simulated by convolving spatialy-localized room impulse responses (RIRs) with sound waveforms to place sound events in a soundscape. However, RIRs require manual collection in specific rooms. We present SpatialScaper, a library for SELD data simulation and augmentation. Compared to existing tools, SpatialScaper emulates virtual rooms via parameters such as size and wall absorption. This allows for parameterized placement (including movement) of foreground and background sound sources. SpatialScaper also includes data augmentation pipelines that can be applied to existing SELD data. As a case study, we use SpatialScaper to add rooms to the DCASE SELD data. Training a model with our data led to progressive performance improves as a direct function of acoustic diversity. These results show that SpatialScaper is valuable to train robust SELD models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.12238v1-abstract-full').style.display = 'none'; document.getElementById('2401.12238v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 4 figures, 1 table, to be presented at ICASSP 2024 in Seoul, South Korea</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.08717">arXiv:2401.08717</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2401.08717">pdf</a>, <a href="https://arxiv.org/format/2401.08717">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Robust DOA estimation using deep acoustic imaging </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Roman%2C+A+S">Adrian S. Roman</a>, <a href="/search/cs?searchtype=author&amp;query=Roman%2C+I+R">Iran R. Roman</a>, <a href="/search/cs?searchtype=author&amp;query=Bello%2C+J+P">Juan P. Bello</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.08717v1-abstract-short" style="display: inline;"> Direction of arrival estimation (DoAE) aims at tracking a sound in azimuth and elevation. Recent advancements include data-driven models with inputs derived from ambisonics intensity vectors or correlations between channels in a microphone array. A spherical intensity map (SIM), or acoustic image, is an alternative input representation that remains underexplored. SIMs benefit from high-resolution&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.08717v1-abstract-full').style.display = 'inline'; document.getElementById('2401.08717v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.08717v1-abstract-full" style="display: none;"> Direction of arrival estimation (DoAE) aims at tracking a sound in azimuth and elevation. Recent advancements include data-driven models with inputs derived from ambisonics intensity vectors or correlations between channels in a microphone array. A spherical intensity map (SIM), or acoustic image, is an alternative input representation that remains underexplored. SIMs benefit from high-resolution microphone arrays, yet most DoAE datasets use low-resolution ones. Therefore, we first propose a super-resolution method to upsample low-resolution microphones. Next, we benchmark DoAE models that use SIMs as input. We arrive to a model that uses SIMs for DoAE estimation and outperforms a baseline and a state-of-the-art model. Our study highlights the relevance of acoustic imaging for DoAE tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.08717v1-abstract-full').style.display = 'none'; document.getElementById('2401.08717v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.10118">arXiv:2312.10118</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2312.10118">pdf</a>, <a href="https://arxiv.org/format/2312.10118">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> From-Ground-To-Objects: Coarse-to-Fine Self-supervised Monocular Depth Estimation of Dynamic Objects with Ground Contact Prior </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Moon%2C+J">Jaeho Moon</a>, <a href="/search/cs?searchtype=author&amp;query=Bello%2C+J+L+G">Juan Luis Gonzalez Bello</a>, <a href="/search/cs?searchtype=author&amp;query=Kwon%2C+B">Byeongjun Kwon</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+M">Munchurl Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.10118v1-abstract-short" style="display: inline;"> Self-supervised monocular depth estimation (DE) is an approach to learning depth without costly depth ground truths. However, it often struggles with moving objects that violate the static scene assumption during training. To address this issue, we introduce a coarse-to-fine training strategy leveraging the ground contacting prior based on the observation that most moving objects in outdoor scenes&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.10118v1-abstract-full').style.display = 'inline'; document.getElementById('2312.10118v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.10118v1-abstract-full" style="display: none;"> Self-supervised monocular depth estimation (DE) is an approach to learning depth without costly depth ground truths. However, it often struggles with moving objects that violate the static scene assumption during training. To address this issue, we introduce a coarse-to-fine training strategy leveraging the ground contacting prior based on the observation that most moving objects in outdoor scenes contact the ground. In the coarse training stage, we exclude the objects in dynamic classes from the reprojection loss calculation to avoid inaccurate depth learning. To provide precise supervision on the depth of the objects, we present a novel Ground-contacting-prior Disparity Smoothness Loss (GDS-Loss) that encourages a DE network to align the depth of the objects with their ground-contacting points. Subsequently, in the fine training stage, we refine the DE network to learn the detailed depth of the objects from the reprojection loss, while ensuring accurate DE on the moving object regions by employing our regularization loss with a cost-volume-based weighting factor. Our overall coarse-to-fine training strategy can easily be integrated with existing DE methods without any modifications, significantly enhancing DE performance on challenging Cityscapes and KITTI datasets, especially in the moving object regions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.10118v1-abstract-full').style.display = 'none'; document.getElementById('2312.10118v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.08136">arXiv:2312.08136</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2312.08136">pdf</a>, <a href="https://arxiv.org/format/2312.08136">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> ProNeRF: Learning Efficient Projection-Aware Ray Sampling for Fine-Grained Implicit Neural Radiance Fields </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bello%2C+J+L+G">Juan Luis Gonzalez Bello</a>, <a href="/search/cs?searchtype=author&amp;query=Bui%2C+M+V">Minh-Quan Viet Bui</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+M">Munchurl Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.08136v1-abstract-short" style="display: inline;"> Recent advances in neural rendering have shown that, albeit slow, implicit compact models can learn a scene&#39;s geometries and view-dependent appearances from multiple views. To maintain such a small memory footprint but achieve faster inference times, recent works have adopted `sampler&#39; networks that adaptively sample a small subset of points along each ray in the implicit neural radiance fields. A&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.08136v1-abstract-full').style.display = 'inline'; document.getElementById('2312.08136v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.08136v1-abstract-full" style="display: none;"> Recent advances in neural rendering have shown that, albeit slow, implicit compact models can learn a scene&#39;s geometries and view-dependent appearances from multiple views. To maintain such a small memory footprint but achieve faster inference times, recent works have adopted `sampler&#39; networks that adaptively sample a small subset of points along each ray in the implicit neural radiance fields. Although these methods achieve up to a 10$\times$ reduction in rendering time, they still suffer from considerable quality degradation compared to the vanilla NeRF. In contrast, we propose ProNeRF, which provides an optimal trade-off between memory footprint (similar to NeRF), speed (faster than HyperReel), and quality (better than K-Planes). ProNeRF is equipped with a novel projection-aware sampling (PAS) network together with a new training strategy for ray exploration and exploitation, allowing for efficient fine-grained particle sampling. Our ProNeRF yields state-of-the-art metrics, being 15-23x faster with 0.65dB higher PSNR than NeRF and yielding 0.95dB higher PSNR than the best published sampler-based method, HyperReel. Our exploration and exploitation training strategy allows ProNeRF to learn the full scenes&#39; color and density distributions while also learning efficient ray sampling focused on the highest-density regions. We provide extensive experimental results that support the effectiveness of our method on the widely adopted forward-facing and 360 datasets, LLFF and Blender, respectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.08136v1-abstract-full').style.display = 'none'; document.getElementById('2312.08136v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Visit our project website at https://kaist-viclab.github.io/pronerf-site/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.08071">arXiv:2312.08071</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2312.08071">pdf</a>, <a href="https://arxiv.org/format/2312.08071">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Novel View Synthesis with View-Dependent Effects from a Single Image </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bello%2C+J+L+G">Juan Luis Gonzalez Bello</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+M">Munchurl Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.08071v1-abstract-short" style="display: inline;"> In this paper, we firstly consider view-dependent effects into single image-based novel view synthesis (NVS) problems. For this, we propose to exploit the camera motion priors in NVS to model view-dependent appearance or effects (VDE) as the negative disparity in the scene. By recognizing specularities &#34;follow&#34; the camera motion, we infuse VDEs into the input images by aggregating input pixel colo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.08071v1-abstract-full').style.display = 'inline'; document.getElementById('2312.08071v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.08071v1-abstract-full" style="display: none;"> In this paper, we firstly consider view-dependent effects into single image-based novel view synthesis (NVS) problems. For this, we propose to exploit the camera motion priors in NVS to model view-dependent appearance or effects (VDE) as the negative disparity in the scene. By recognizing specularities &#34;follow&#34; the camera motion, we infuse VDEs into the input images by aggregating input pixel colors along the negative depth region of the epipolar lines. Also, we propose a `relaxed volumetric rendering&#39; approximation that allows computing the densities in a single pass, improving efficiency for NVS from single images. Our method can learn single-image NVS from image sequences only, which is a completely self-supervised learning method, for the first time requiring neither depth nor camera pose annotations. We present extensive experiment results and show that our proposed method can learn NVS with VDEs, outperforming the SOTA single-view NVS methods on the RealEstate10k and MannequinChallenge datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.08071v1-abstract-full').style.display = 'none'; document.getElementById('2312.08071v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Visit our website https://kaist-viclab.github.io/monovde-site</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.13343">arXiv:2309.13343</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2309.13343">pdf</a>, <a href="https://arxiv.org/format/2309.13343">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Two vs. Four-Channel Sound Event Localization and Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wilkins%2C+J">Julia Wilkins</a>, <a href="/search/cs?searchtype=author&amp;query=Fuentes%2C+M">Magdalena Fuentes</a>, <a href="/search/cs?searchtype=author&amp;query=Bondi%2C+L">Luca Bondi</a>, <a href="/search/cs?searchtype=author&amp;query=Ghaffarzadegan%2C+S">Shabnam Ghaffarzadegan</a>, <a href="/search/cs?searchtype=author&amp;query=Abavisani%2C+A">Ali Abavisani</a>, <a href="/search/cs?searchtype=author&amp;query=Bello%2C+J+P">Juan Pablo Bello</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.13343v1-abstract-short" style="display: inline;"> Sound event localization and detection (SELD) systems estimate both the direction-of-arrival (DOA) and class of sound sources over time. In the DCASE 2022 SELD Challenge (Task 3), models are designed to operate in a 4-channel setting. While beneficial to further the development of SELD systems using a multichannel recording setup such as first-order Ambisonics (FOA), most consumer electronics devi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.13343v1-abstract-full').style.display = 'inline'; document.getElementById('2309.13343v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.13343v1-abstract-full" style="display: none;"> Sound event localization and detection (SELD) systems estimate both the direction-of-arrival (DOA) and class of sound sources over time. In the DCASE 2022 SELD Challenge (Task 3), models are designed to operate in a 4-channel setting. While beneficial to further the development of SELD systems using a multichannel recording setup such as first-order Ambisonics (FOA), most consumer electronics devices rarely are able to record using more than two channels. For this reason, in this work we investigate the performance of the DCASE 2022 SELD baseline model using three audio input representations: FOA, binaural, and stereo. We perform a novel comparative analysis illustrating the effect of these audio input representations on SELD performance. Crucially, we show that binaural and stereo (i.e. 2-channel) audio-based SELD models are still able to localize and detect sound sources laterally quite well, despite overall performance degrading as less audio information is provided. Further, we segment our analysis by scenes containing varying degrees of sound source polyphony to better understand the effect of audio input representation on localization and detection performance as scene conditions become increasingly complex. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.13343v1-abstract-full').style.display = 'none'; document.getElementById('2309.13343v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.09288">arXiv:2309.09288</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2309.09288">pdf</a>, <a href="https://arxiv.org/format/2309.09288">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Sound Source Distance Estimation in Diverse and Dynamic Acoustic Conditions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kushwaha%2C+S+S">Saksham Singh Kushwaha</a>, <a href="/search/cs?searchtype=author&amp;query=Roman%2C+I+R">Iran R. Roman</a>, <a href="/search/cs?searchtype=author&amp;query=Fuentes%2C+M">Magdalena Fuentes</a>, <a href="/search/cs?searchtype=author&amp;query=Bello%2C+J+P">Juan Pablo Bello</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.09288v1-abstract-short" style="display: inline;"> Localizing a moving sound source in the real world involves determining its direction-of-arrival (DOA) and distance relative to a microphone. Advancements in DOA estimation have been facilitated by data-driven methods optimized with large open-source datasets with microphone array recordings in diverse environments. In contrast, estimating a sound source&#39;s distance remains understudied. Existing a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.09288v1-abstract-full').style.display = 'inline'; document.getElementById('2309.09288v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.09288v1-abstract-full" style="display: none;"> Localizing a moving sound source in the real world involves determining its direction-of-arrival (DOA) and distance relative to a microphone. Advancements in DOA estimation have been facilitated by data-driven methods optimized with large open-source datasets with microphone array recordings in diverse environments. In contrast, estimating a sound source&#39;s distance remains understudied. Existing approaches assume recordings by non-coincident microphones to use methods that are susceptible to differences in room reverberation. We present a CRNN able to estimate the distance of moving sound sources across multiple datasets featuring diverse rooms, outperforming a recently-published approach. We also characterize our model&#39;s performance as a function of sound source distance and different training losses. This analysis reveals optimal training using a loss that weighs model errors as an inverse function of the sound source true distance. Our study is the first to demonstrate that sound source distance estimation can be performed across diverse acoustic conditions using deep learning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.09288v1-abstract-full').style.display = 'none'; document.getElementById('2309.09288v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted in WASPAA 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.09089">arXiv:2308.09089</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2308.09089">pdf</a>, <a href="https://arxiv.org/format/2308.09089">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Bridging High-Quality Audio and Video via Language for Sound Effects Retrieval from Visual Queries </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wilkins%2C+J">Julia Wilkins</a>, <a href="/search/cs?searchtype=author&amp;query=Salamon%2C+J">Justin Salamon</a>, <a href="/search/cs?searchtype=author&amp;query=Fuentes%2C+M">Magdalena Fuentes</a>, <a href="/search/cs?searchtype=author&amp;query=Bello%2C+J+P">Juan Pablo Bello</a>, <a href="/search/cs?searchtype=author&amp;query=Nieto%2C+O">Oriol Nieto</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.09089v1-abstract-short" style="display: inline;"> Finding the right sound effects (SFX) to match moments in a video is a difficult and time-consuming task, and relies heavily on the quality and completeness of text metadata. Retrieving high-quality (HQ) SFX using a video frame directly as the query is an attractive alternative, removing the reliance on text metadata and providing a low barrier to entry for non-experts. Due to the lack of HQ audio&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.09089v1-abstract-full').style.display = 'inline'; document.getElementById('2308.09089v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.09089v1-abstract-full" style="display: none;"> Finding the right sound effects (SFX) to match moments in a video is a difficult and time-consuming task, and relies heavily on the quality and completeness of text metadata. Retrieving high-quality (HQ) SFX using a video frame directly as the query is an attractive alternative, removing the reliance on text metadata and providing a low barrier to entry for non-experts. Due to the lack of HQ audio-visual training data, previous work on audio-visual retrieval relies on YouTube (in-the-wild) videos of varied quality for training, where the audio is often noisy and the video of amateur quality. As such it is unclear whether these systems would generalize to the task of matching HQ audio to production-quality video. To address this, we propose a multimodal framework for recommending HQ SFX given a video frame by (1) leveraging large language models and foundational vision-language models to bridge HQ audio and video to create audio-visual pairs, resulting in a highly scalable automatic audio-visual data curation pipeline; and (2) using pre-trained audio and visual encoders to train a contrastive learning-based retrieval system. We show that our system, trained using our automatic data curation pipeline, significantly outperforms baselines trained on in-the-wild data on the task of HQ SFX retrieval for video. Furthermore, while the baselines fail to generalize to this task, our system generalizes well from clean to in-the-wild data, outperforming the baselines on a dataset of YouTube videos despite only being trained on the HQ audio-visual pairs. A user study confirms that people prefer SFX retrieved by our system over the baseline 67% of the time both for HQ and in-the-wild data. Finally, we present ablations to determine the impact of model and data pipeline design choices on downstream retrieval performance. Please visit our project website to listen to and view our SFX retrieval results. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.09089v1-abstract-full').style.display = 'none'; document.getElementById('2308.09089v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">WASPAA 2023. Project page: https://juliawilkins.github.io/sound-effects-retrieval-from-video/. 4 pages, 2 figures, 2 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.06246">arXiv:2308.06246</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2308.06246">pdf</a>, <a href="https://arxiv.org/format/2308.06246">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> ARGUS: Visualization of AI-Assisted Task Guidance in AR </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Castelo%2C+S">Sonia Castelo</a>, <a href="/search/cs?searchtype=author&amp;query=Rulff%2C+J">Joao Rulff</a>, <a href="/search/cs?searchtype=author&amp;query=McGowan%2C+E">Erin McGowan</a>, <a href="/search/cs?searchtype=author&amp;query=Steers%2C+B">Bea Steers</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+G">Guande Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+S">Shaoyu Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Roman%2C+I">Iran Roman</a>, <a href="/search/cs?searchtype=author&amp;query=Lopez%2C+R">Roque Lopez</a>, <a href="/search/cs?searchtype=author&amp;query=Brewer%2C+E">Ethan Brewer</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+C">Chen Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Qian%2C+J">Jing Qian</a>, <a href="/search/cs?searchtype=author&amp;query=Cho%2C+K">Kyunghyun Cho</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+H">He He</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Q">Qi Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Vo%2C+H">Huy Vo</a>, <a href="/search/cs?searchtype=author&amp;query=Bello%2C+J">Juan Bello</a>, <a href="/search/cs?searchtype=author&amp;query=Krone%2C+M">Michael Krone</a>, <a href="/search/cs?searchtype=author&amp;query=Silva%2C+C">Claudio Silva</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.06246v1-abstract-short" style="display: inline;"> The concept of augmented reality (AR) assistants has captured the human imagination for decades, becoming a staple of modern science fiction. To pursue this goal, it is necessary to develop artificial intelligence (AI)-based methods that simultaneously perceive the 3D environment, reason about physical tasks, and model the performer, all in real-time. Within this framework, a wide variety of senso&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.06246v1-abstract-full').style.display = 'inline'; document.getElementById('2308.06246v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.06246v1-abstract-full" style="display: none;"> The concept of augmented reality (AR) assistants has captured the human imagination for decades, becoming a staple of modern science fiction. To pursue this goal, it is necessary to develop artificial intelligence (AI)-based methods that simultaneously perceive the 3D environment, reason about physical tasks, and model the performer, all in real-time. Within this framework, a wide variety of sensors are needed to generate data across different modalities, such as audio, video, depth, speech, and time-of-flight. The required sensors are typically part of the AR headset, providing performer sensing and interaction through visual, audio, and haptic feedback. AI assistants not only record the performer as they perform activities, but also require machine learning (ML) models to understand and assist the performer as they interact with the physical world. Therefore, developing such assistants is a challenging task. We propose ARGUS, a visual analytics system to support the development of intelligent AR assistants. Our system was designed as part of a multi year-long collaboration between visualization researchers and ML and AR experts. This co-design process has led to advances in the visualization of ML in AR. Our system allows for online visualization of object, action, and step detection as well as offline analysis of previously recorded AR sessions. It visualizes not only the multimodal sensor data streams but also the output of the ML models. This allows developers to gain insights into the performer activities as well as the ML models, helping them troubleshoot, improve, and fine tune the components of the AR assistant. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.06246v1-abstract-full').style.display = 'none'; document.getElementById('2308.06246v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 8 figures. This is the author&#39;s version of the article of the article that has been accepted for publication in IEEE Transactions on Visualization and Computer Graphics</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.10667">arXiv:2303.10667</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2303.10667">pdf</a>, <a href="https://arxiv.org/format/2303.10667">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Audio-Text Models Do Not Yet Leverage Natural Language </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wu%2C+H">Ho-Hsiang Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Nieto%2C+O">Oriol Nieto</a>, <a href="/search/cs?searchtype=author&amp;query=Bello%2C+J+P">Juan Pablo Bello</a>, <a href="/search/cs?searchtype=author&amp;query=Salamon%2C+J">Justin Salamon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.10667v1-abstract-short" style="display: inline;"> Multi-modal contrastive learning techniques in the audio-text domain have quickly become a highly active area of research. Most works are evaluated with standard audio retrieval and classification benchmarks assuming that (i) these models are capable of leveraging the rich information contained in natural language, and (ii) current benchmarks are able to capture the nuances of such information. In&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.10667v1-abstract-full').style.display = 'inline'; document.getElementById('2303.10667v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.10667v1-abstract-full" style="display: none;"> Multi-modal contrastive learning techniques in the audio-text domain have quickly become a highly active area of research. Most works are evaluated with standard audio retrieval and classification benchmarks assuming that (i) these models are capable of leveraging the rich information contained in natural language, and (ii) current benchmarks are able to capture the nuances of such information. In this work, we show that state-of-the-art audio-text models do not yet really understand natural language, especially contextual concepts such as sequential or concurrent ordering of sound events. Our results suggest that existing benchmarks are not sufficient to assess these models&#39; capabilities to match complex contexts from the audio and text modalities. We propose a Transformer-based architecture and show that, unlike prior work, it is capable of modeling the sequential relationship between sound events in the text and audio, given appropriate benchmark data. We advocate for the collection or generation of additional, diverse, data to allow future research to fully leverage natural language for audio-text modeling. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.10667v1-abstract-full').style.display = 'none'; document.getElementById('2303.10667v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Copyright 2023 IEEE. Personal use of this material is permitted. Permission from IEEE must be obtained for all other uses, in any current or future media, including reprinting/republishing this material for advertising or promotional purposes, creating new collective works, for resale or redistribution to servers or lists, or reuse of any copyrighted component of this work in other works</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.08367">arXiv:2211.08367</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2211.08367">pdf</a>, <a href="https://arxiv.org/format/2211.08367">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> FlowGrad: Using Motion for Visual Sound Source Localization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Singh%2C+R">Rajsuryan Singh</a>, <a href="/search/cs?searchtype=author&amp;query=Zinemanas%2C+P">Pablo Zinemanas</a>, <a href="/search/cs?searchtype=author&amp;query=Serra%2C+X">Xavier Serra</a>, <a href="/search/cs?searchtype=author&amp;query=Bello%2C+J+P">Juan Pablo Bello</a>, <a href="/search/cs?searchtype=author&amp;query=Fuentes%2C+M">Magdalena Fuentes</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.08367v2-abstract-short" style="display: inline;"> Most recent work in visual sound source localization relies on semantic audio-visual representations learned in a self-supervised manner, and by design excludes temporal information present in videos. While it proves to be effective for widely used benchmark datasets, the method falls short for challenging scenarios like urban traffic. This work introduces temporal context into the state-of-the-ar&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.08367v2-abstract-full').style.display = 'inline'; document.getElementById('2211.08367v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.08367v2-abstract-full" style="display: none;"> Most recent work in visual sound source localization relies on semantic audio-visual representations learned in a self-supervised manner, and by design excludes temporal information present in videos. While it proves to be effective for widely used benchmark datasets, the method falls short for challenging scenarios like urban traffic. This work introduces temporal context into the state-of-the-art methods for sound source localization in urban scenes using optical flow as a means to encode motion information. An analysis of the strengths and weaknesses of our methods helps us better understand the problem of visual sound source localization and sheds light on open challenges for audio-visual scene understanding. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.08367v2-abstract-full').style.display = 'none'; document.getElementById('2211.08367v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted in ICASSP 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2205.13064">arXiv:2205.13064</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2205.13064">pdf</a>, <a href="https://arxiv.org/format/2205.13064">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1111/cgf.14534">10.1111/cgf.14534 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Urban Rhapsody: Large-scale exploration of urban soundscapes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Rulff%2C+J">Joao Rulff</a>, <a href="/search/cs?searchtype=author&amp;query=Miranda%2C+F">Fabio Miranda</a>, <a href="/search/cs?searchtype=author&amp;query=Hosseini%2C+M">Maryam Hosseini</a>, <a href="/search/cs?searchtype=author&amp;query=Lage%2C+M">Marcos Lage</a>, <a href="/search/cs?searchtype=author&amp;query=Cartwright%2C+M">Mark Cartwright</a>, <a href="/search/cs?searchtype=author&amp;query=Dove%2C+G">Graham Dove</a>, <a href="/search/cs?searchtype=author&amp;query=Bello%2C+J">Juan Bello</a>, <a href="/search/cs?searchtype=author&amp;query=Silva%2C+C+T">Claudio T. Silva</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2205.13064v1-abstract-short" style="display: inline;"> Noise is one of the primary quality-of-life issues in urban environments. In addition to annoyance, noise negatively impacts public health and educational performance. While low-cost sensors can be deployed to monitor ambient noise levels at high temporal resolutions, the amount of data they produce and the complexity of these data pose significant analytical challenges. One way to address these c&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.13064v1-abstract-full').style.display = 'inline'; document.getElementById('2205.13064v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2205.13064v1-abstract-full" style="display: none;"> Noise is one of the primary quality-of-life issues in urban environments. In addition to annoyance, noise negatively impacts public health and educational performance. While low-cost sensors can be deployed to monitor ambient noise levels at high temporal resolutions, the amount of data they produce and the complexity of these data pose significant analytical challenges. One way to address these challenges is through machine listening techniques, which are used to extract features in attempts to classify the source of noise and understand temporal patterns of a city&#39;s noise situation. However, the overwhelming number of noise sources in the urban environment and the scarcity of labeled data makes it nearly impossible to create classification models with large enough vocabularies that capture the true dynamism of urban soundscapes In this paper, we first identify a set of requirements in the yet unexplored domain of urban soundscape exploration. To satisfy the requirements and tackle the identified challenges, we propose Urban Rhapsody, a framework that combines state-of-the-art audio representation, machine learning, and visual analytics to allow users to interactively create classification models, understand noise patterns of a city, and quickly retrieve and label audio excerpts in order to create a large high-precision annotated database of urban sound recordings. We demonstrate the tool&#39;s utility through case studies performed by domain experts using data generated over the five-year deployment of a one-of-a-kind sensor network in New York City. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.13064v1-abstract-full').style.display = 'none'; document.getElementById('2205.13064v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at EuroVis 2022. Source code available at: https://github.com/VIDA-NYU/Urban-Rhapsody</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2205.08851">arXiv:2205.08851</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2205.08851">pdf</a>, <a href="https://arxiv.org/format/2205.08851">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Positional Information is All You Need: A Novel Pipeline for Self-Supervised SVDE from Videos </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bello%2C+J+L+G">Juan Luis Gonzalez Bello</a>, <a href="/search/cs?searchtype=author&amp;query=Moon%2C+J">Jaeho Moon</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+M">Munchurl Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2205.08851v1-abstract-short" style="display: inline;"> Recently, much attention has been drawn to learning the underlying 3D structures of a scene from monocular videos in a fully self-supervised fashion. One of the most challenging aspects of this task is handling the independently moving objects as they break the rigid-scene assumption. For the first time, we show that pixel positional information can be exploited to learn SVDE (Single View Depth Es&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.08851v1-abstract-full').style.display = 'inline'; document.getElementById('2205.08851v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2205.08851v1-abstract-full" style="display: none;"> Recently, much attention has been drawn to learning the underlying 3D structures of a scene from monocular videos in a fully self-supervised fashion. One of the most challenging aspects of this task is handling the independently moving objects as they break the rigid-scene assumption. For the first time, we show that pixel positional information can be exploited to learn SVDE (Single View Depth Estimation) from videos. Our proposed moving object (MO) masks, which are induced by shifted positional information (SPI) and referred to as `SPIMO&#39; masks, are very robust and consistently remove the independently moving objects in the scenes, allowing for better learning of SVDE from videos. Additionally, we introduce a new adaptive quantization scheme that assigns the best per-pixel quantization curve for our depth discretization. Finally, we employ existing boosting techniques in a new way to further self-supervise the depth of the moving objects. With these features, our pipeline is robust against moving objects and generalizes well to high-resolution images, even when trained with small patches, yielding state-of-the-art (SOTA) results with almost 8.5x fewer parameters than the previous works that learn from videos. We present extensive experiments on KITTI and CityScapes that show the effectiveness of our method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.08851v1-abstract-full').style.display = 'none'; document.getElementById('2205.08851v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2205.01273">arXiv:2205.01273</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2205.01273">pdf</a>, <a href="https://arxiv.org/format/2205.01273">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Few-Shot Musical Source Separation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Stoller%2C+D">Daniel Stoller</a>, <a href="/search/cs?searchtype=author&amp;query=Bittner%2C+R+M">Rachel M. Bittner</a>, <a href="/search/cs?searchtype=author&amp;query=Bello%2C+J+P">Juan Pablo Bello</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2205.01273v1-abstract-short" style="display: inline;"> Deep learning-based approaches to musical source separation are often limited to the instrument classes that the models are trained on and do not generalize to separate unseen instruments. To address this, we propose a few-shot musical source separation paradigm. We condition a generic U-Net source separation model using few audio examples of the target instrument. We train a few-shot conditioning&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.01273v1-abstract-full').style.display = 'inline'; document.getElementById('2205.01273v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2205.01273v1-abstract-full" style="display: none;"> Deep learning-based approaches to musical source separation are often limited to the instrument classes that the models are trained on and do not generalize to separate unseen instruments. To address this, we propose a few-shot musical source separation paradigm. We condition a generic U-Net source separation model using few audio examples of the target instrument. We train a few-shot conditioning encoder jointly with the U-Net to encode the audio examples into a conditioning vector to configure the U-Net via feature-wise linear modulation (FiLM). We evaluate the trained models on real musical recordings in the MUSDB18 and MedleyDB datasets. We show that our proposed few-shot conditioning paradigm outperforms the baseline one-hot instrument-class conditioned model for both seen and unseen instruments. To extend the scope of our approach to a wider variety of real-world scenarios, we also experiment with different conditioning example characteristics, including examples from different recordings, with multiple sources, or negative conditioning examples. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.01273v1-abstract-full').style.display = 'none'; document.getElementById('2205.01273v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICASSP 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2204.05156">arXiv:2204.05156</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2204.05156">pdf</a>, <a href="https://arxiv.org/format/2204.05156">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> How to Listen? Rethinking Visual Sound Localization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wu%2C+H">Ho-Hsiang Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Fuentes%2C+M">Magdalena Fuentes</a>, <a href="/search/cs?searchtype=author&amp;query=Seetharaman%2C+P">Prem Seetharaman</a>, <a href="/search/cs?searchtype=author&amp;query=Bello%2C+J+P">Juan Pablo Bello</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2204.05156v1-abstract-short" style="display: inline;"> Localizing visual sounds consists on locating the position of objects that emit sound within an image. It is a growing research area with potential applications in monitoring natural and urban environments, such as wildlife migration and urban traffic. Previous works are usually evaluated with datasets having mostly a single dominant visible object, and proposed models usually require the introduc&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.05156v1-abstract-full').style.display = 'inline'; document.getElementById('2204.05156v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2204.05156v1-abstract-full" style="display: none;"> Localizing visual sounds consists on locating the position of objects that emit sound within an image. It is a growing research area with potential applications in monitoring natural and urban environments, such as wildlife migration and urban traffic. Previous works are usually evaluated with datasets having mostly a single dominant visible object, and proposed models usually require the introduction of localization modules during training or dedicated sampling strategies, but it remains unclear how these design choices play a role in the adaptability of these methods in more challenging scenarios. In this work, we analyze various model choices for visual sound localization and discuss how their different components affect the model&#39;s performance, namely the encoders&#39; architecture, the loss function and the localization strategy. Furthermore, we study the interaction between these decisions, the model performance, and the data, by digging into different evaluation datasets spanning different difficulties and characteristics, and discuss the implications of such decisions in the context of real-world applications. Our code and model weights are open-sourced and made available for further applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.05156v1-abstract-full').style.display = 'none'; document.getElementById('2204.05156v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 April, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to INTERSPEECH 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2203.10425">arXiv:2203.10425</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2203.10425">pdf</a>, <a href="https://arxiv.org/format/2203.10425">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> A Study on Robustness to Perturbations for Representations of Environmental Sound </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Srivastava%2C+S">Sangeeta Srivastava</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+H">Ho-Hsiang Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Rulff%2C+J">Joao Rulff</a>, <a href="/search/cs?searchtype=author&amp;query=Fuentes%2C+M">Magdalena Fuentes</a>, <a href="/search/cs?searchtype=author&amp;query=Cartwright%2C+M">Mark Cartwright</a>, <a href="/search/cs?searchtype=author&amp;query=Silva%2C+C">Claudio Silva</a>, <a href="/search/cs?searchtype=author&amp;query=Arora%2C+A">Anish Arora</a>, <a href="/search/cs?searchtype=author&amp;query=Bello%2C+J+P">Juan Pablo Bello</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2203.10425v3-abstract-short" style="display: inline;"> Audio applications involving environmental sound analysis increasingly use general-purpose audio representations, also known as embeddings, for transfer learning. Recently, Holistic Evaluation of Audio Representations (HEAR) evaluated twenty-nine embedding models on nineteen diverse tasks. However, the evaluation&#39;s effectiveness depends on the variation already captured within a given dataset. The&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.10425v3-abstract-full').style.display = 'inline'; document.getElementById('2203.10425v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2203.10425v3-abstract-full" style="display: none;"> Audio applications involving environmental sound analysis increasingly use general-purpose audio representations, also known as embeddings, for transfer learning. Recently, Holistic Evaluation of Audio Representations (HEAR) evaluated twenty-nine embedding models on nineteen diverse tasks. However, the evaluation&#39;s effectiveness depends on the variation already captured within a given dataset. Therefore, for a given data domain, it is unclear how the representations would be affected by the variations caused by myriad microphones&#39; range and acoustic conditions -- commonly known as channel effects. We aim to extend HEAR to evaluate invariance to channel effects in this work. To accomplish this, we imitate channel effects by injecting perturbations to the audio signal and measure the shift in the new (perturbed) embeddings with three distance measures, making the evaluation domain-dependent but not task-dependent. Combined with the downstream performance, it helps us make a more informed prediction of how robust the embeddings are to the channel effects. We evaluate two embeddings -- YAMNet, and OpenL3 on monophonic (UrbanSound8K) and polyphonic (SONYC-UST) urban datasets. We show that one distance measure does not suffice in such task-independent evaluation. Although Fr茅chet Audio Distance (FAD) correlates with the trend of the performance drop in the downstream task most accurately, we show that we need to study FAD in conjunction with the other distances to get a clear understanding of the overall effect of the perturbation. In terms of the embedding performance, we find OpenL3 to be more robust than YAMNet, which aligns with the HEAR evaluation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.10425v3-abstract-full').style.display = 'none'; document.getElementById('2203.10425v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 July, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted in EUSIPCO 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2203.06220">arXiv:2203.06220</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2203.06220">pdf</a>, <a href="https://arxiv.org/format/2203.06220">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Infrastructure-free, Deep Learned Urban Noise Monitoring at $\sim$100mW </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yun%2C+J">Jihoon Yun</a>, <a href="/search/cs?searchtype=author&amp;query=Srivastava%2C+S">Sangeeta Srivastava</a>, <a href="/search/cs?searchtype=author&amp;query=Roy%2C+D">Dhrubojyoti Roy</a>, <a href="/search/cs?searchtype=author&amp;query=Stohs%2C+N">Nathan Stohs</a>, <a href="/search/cs?searchtype=author&amp;query=Mydlarz%2C+C">Charlie Mydlarz</a>, <a href="/search/cs?searchtype=author&amp;query=Salman%2C+M">Mahin Salman</a>, <a href="/search/cs?searchtype=author&amp;query=Steers%2C+B">Bea Steers</a>, <a href="/search/cs?searchtype=author&amp;query=Bello%2C+J+P">Juan Pablo Bello</a>, <a href="/search/cs?searchtype=author&amp;query=Arora%2C+A">Anish Arora</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2203.06220v1-abstract-short" style="display: inline;"> The Sounds of New York City (SONYC) wireless sensor network (WSN) has been fielded in Manhattan and Brooklyn over the past five years, as part of a larger human-in-the-loop cyber-physical control system for monitoring, analyzing, and mitigating urban noise pollution. We describe the evolution of the 2-tier SONYC WSN from an acoustic data collection fabric into a 3-tier in situ noise complaint moni&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.06220v1-abstract-full').style.display = 'inline'; document.getElementById('2203.06220v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2203.06220v1-abstract-full" style="display: none;"> The Sounds of New York City (SONYC) wireless sensor network (WSN) has been fielded in Manhattan and Brooklyn over the past five years, as part of a larger human-in-the-loop cyber-physical control system for monitoring, analyzing, and mitigating urban noise pollution. We describe the evolution of the 2-tier SONYC WSN from an acoustic data collection fabric into a 3-tier in situ noise complaint monitoring WSN, and its current evaluation. The added tier consists of long-range (LoRa), multi-hop networks of a new low-power acoustic mote, MKII (&#34;Mach 2&#34;), that we have designed and fabricated. MKII motes are notable in three ways: First, they advance machine learning capability at mote-scale in this application domain by introducing a real-time Convolutional Neural Network (CNN) based embedding model that is competitive with alternatives while also requiring 10$\times$ lesser training data and $\sim$2 orders of magnitude fewer runtime resources. Second, they are conveniently deployed relatively far from higher-tier base station nodes without assuming power or network infrastructure support at operationally relevant sites (such as construction zones), yielding a relatively low-cost solution. And third, their networking is frequency agile, unlike conventional LoRa networks: it tolerates in a distributed, self-stabilizing way the variable external interference and link fading in the cluttered 902-928MHz ISM band urban environment by dynamically choosing good frequencies using an efficient new method that combines passive and active measurements. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.06220v1-abstract-full').style.display = 'none'; document.getElementById('2203.06220v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted in ICCPS 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2110.11499">arXiv:2110.11499</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2110.11499">pdf</a>, <a href="https://arxiv.org/format/2110.11499">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Wav2CLIP: Learning Robust Audio Representations From CLIP </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wu%2C+H">Ho-Hsiang Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Seetharaman%2C+P">Prem Seetharaman</a>, <a href="/search/cs?searchtype=author&amp;query=Kumar%2C+K">Kundan Kumar</a>, <a href="/search/cs?searchtype=author&amp;query=Bello%2C+J+P">Juan Pablo Bello</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2110.11499v2-abstract-short" style="display: inline;"> We propose Wav2CLIP, a robust audio representation learning method by distilling from Contrastive Language-Image Pre-training (CLIP). We systematically evaluate Wav2CLIP on a variety of audio tasks including classification, retrieval, and generation, and show that Wav2CLIP can outperform several publicly available pre-trained audio representation algorithms. Wav2CLIP projects audio into a shared e&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.11499v2-abstract-full').style.display = 'inline'; document.getElementById('2110.11499v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2110.11499v2-abstract-full" style="display: none;"> We propose Wav2CLIP, a robust audio representation learning method by distilling from Contrastive Language-Image Pre-training (CLIP). We systematically evaluate Wav2CLIP on a variety of audio tasks including classification, retrieval, and generation, and show that Wav2CLIP can outperform several publicly available pre-trained audio representation algorithms. Wav2CLIP projects audio into a shared embedding space with images and text, which enables multimodal applications such as zero-shot classification, and cross-modal retrieval. Furthermore, Wav2CLIP needs just ~10% of the data to achieve competitive performance on downstream tasks compared with fully supervised models, and is more efficient to pre-train than competing methods as it does not require learning a visual model in concert with an auditory model. Finally, we demonstrate image generation from Wav2CLIP as qualitative assessment of the shared embedding space. Our code and model weights are open sourced and made available for further applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.11499v2-abstract-full').style.display = 'none'; document.getElementById('2110.11499v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 February, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Copyright 2022 IEEE. Personal use of this material is permitted. Permission from IEEE must be obtained for all other uses, in any current or future media, including reprinting/republishing this material for advertising or promotional purposes, creating new collective works, for resale or redistribution to servers or lists, or reuse of any copyrighted component of this work in other works</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2110.09600">arXiv:2110.09600</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2110.09600">pdf</a>, <a href="https://arxiv.org/format/2110.09600">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Who calls the shots? Rethinking Few-Shot Learning for Audio </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Bryan%2C+N+J">Nicholas J. Bryan</a>, <a href="/search/cs?searchtype=author&amp;query=Salamon%2C+J">Justin Salamon</a>, <a href="/search/cs?searchtype=author&amp;query=Cartwright%2C+M">Mark Cartwright</a>, <a href="/search/cs?searchtype=author&amp;query=Bello%2C+J+P">Juan Pablo Bello</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2110.09600v1-abstract-short" style="display: inline;"> Few-shot learning aims to train models that can recognize novel classes given just a handful of labeled examples, known as the support set. While the field has seen notable advances in recent years, they have often focused on multi-class image classification. Audio, in contrast, is often multi-label due to overlapping sounds, resulting in unique properties such as polyphony and signal-to-noise rat&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.09600v1-abstract-full').style.display = 'inline'; document.getElementById('2110.09600v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2110.09600v1-abstract-full" style="display: none;"> Few-shot learning aims to train models that can recognize novel classes given just a handful of labeled examples, known as the support set. While the field has seen notable advances in recent years, they have often focused on multi-class image classification. Audio, in contrast, is often multi-label due to overlapping sounds, resulting in unique properties such as polyphony and signal-to-noise ratios (SNR). This leads to unanswered questions concerning the impact such audio properties may have on few-shot learning system design, performance, and human-computer interaction, as it is typically up to the user to collect and provide inference-time support set examples. We address these questions through a series of experiments designed to elucidate the answers to these questions. We introduce two novel datasets, FSD-MIX-CLIPS and FSD-MIX-SED, whose programmatic generation allows us to explore these questions systematically. Our experiments lead to audio-specific insights on few-shot learning, some of which are at odds with recent findings in the image domain: there is no best one-size-fits-all model, method, and support set selection criterion. Rather, it depends on the expected application scenario. Our code and data are available at https://github.com/wangyu/rethink-audio-fsl. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.09600v1-abstract-full').style.display = 'none'; document.getElementById('2110.09600v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">WASPAA 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2109.12690">arXiv:2109.12690</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2109.12690">pdf</a>, <a href="https://arxiv.org/ps/2109.12690">ps</a>, <a href="https://arxiv.org/format/2109.12690">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Soundata: A Python library for reproducible use of audio datasets </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Fuentes%2C+M">Magdalena Fuentes</a>, <a href="/search/cs?searchtype=author&amp;query=Salamon%2C+J">Justin Salamon</a>, <a href="/search/cs?searchtype=author&amp;query=Zinemanas%2C+P">Pablo Zinemanas</a>, <a href="/search/cs?searchtype=author&amp;query=Rocamora%2C+M">Mart铆n Rocamora</a>, <a href="/search/cs?searchtype=author&amp;query=Paja%2C+G">Gen铆s Paja</a>, <a href="/search/cs?searchtype=author&amp;query=Rom%C3%A1n%2C+I+R">Ir谩n R. Rom谩n</a>, <a href="/search/cs?searchtype=author&amp;query=Miron%2C+M">Marius Miron</a>, <a href="/search/cs?searchtype=author&amp;query=Serra%2C+X">Xavier Serra</a>, <a href="/search/cs?searchtype=author&amp;query=Bello%2C+J+P">Juan Pablo Bello</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2109.12690v2-abstract-short" style="display: inline;"> Soundata is a Python library for loading and working with audio datasets in a standardized way, removing the need for writing custom loaders in every project, and improving reproducibility by providing tools to validate data against a canonical version. It speeds up research pipelines by allowing users to quickly download a dataset, load it into memory in a standardized and reproducible way, valid&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2109.12690v2-abstract-full').style.display = 'inline'; document.getElementById('2109.12690v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2109.12690v2-abstract-full" style="display: none;"> Soundata is a Python library for loading and working with audio datasets in a standardized way, removing the need for writing custom loaders in every project, and improving reproducibility by providing tools to validate data against a canonical version. It speeds up research pipelines by allowing users to quickly download a dataset, load it into memory in a standardized and reproducible way, validate that the dataset is complete and correct, and more. Soundata is based and inspired on mirdata and design to complement mirdata by working with environmental sound, bioacoustic and speech datasets, among others. Soundata was created to be easy to use, easy to contribute to, and to increase reproducibility and standardize usage of sound datasets in a flexible way. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2109.12690v2-abstract-full').style.display = 'none'; document.getElementById('2109.12690v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 September, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2106.01149">arXiv:2106.01149</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2106.01149">pdf</a>, <a href="https://arxiv.org/format/2106.01149">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Exploring modality-agnostic representations for music classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wu%2C+H">Ho-Hsiang Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Fuentes%2C+M">Magdalena Fuentes</a>, <a href="/search/cs?searchtype=author&amp;query=Bello%2C+J+P">Juan P. Bello</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2106.01149v1-abstract-short" style="display: inline;"> Music information is often conveyed or recorded across multiple data modalities including but not limited to audio, images, text and scores. However, music information retrieval research has almost exclusively focused on single modality recognition, requiring development of separate models for each modality. Some multi-modal works require multiple coexisting modalities given to the model as inputs&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.01149v1-abstract-full').style.display = 'inline'; document.getElementById('2106.01149v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2106.01149v1-abstract-full" style="display: none;"> Music information is often conveyed or recorded across multiple data modalities including but not limited to audio, images, text and scores. However, music information retrieval research has almost exclusively focused on single modality recognition, requiring development of separate models for each modality. Some multi-modal works require multiple coexisting modalities given to the model as inputs, constraining the use of these models to the few cases where data from all modalities are available. To the best of our knowledge, no existing model has the ability to take inputs from varying modalities, e.g. images or sounds, and classify them into unified music categories. We explore the use of cross-modal retrieval as a pretext task to learn modality-agnostic representations, which can then be used as inputs to classifiers that are independent of modality. We select instrument classification as an example task for our study as both visual and audio components provide relevant semantic information. We train music instrument classifiers that can take both images or sounds as input, and perform comparably to sound-only or image-only classifiers. Furthermore, we explore the case when there is limited labeled data for a given modality, and the impact in performance by using labeled data from other modalities. We are able to achieve almost 70% of best performing system in a zero-shot setting. We provide a detailed analysis of experimental results to understand the potential and limitations of the approach, and discuss future steps towards modality-agnostic classifiers. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.01149v1-abstract-full').style.display = 'none'; document.getElementById('2106.01149v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2105.02911">arXiv:2105.02911</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2105.02911">pdf</a>, <a href="https://arxiv.org/format/2105.02911">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Weakly Supervised Source-Specific Sound Level Estimation in Noisy Soundscapes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Cramer%2C+A">Aurora Cramer</a>, <a href="/search/cs?searchtype=author&amp;query=Cartwright%2C+M">Mark Cartwright</a>, <a href="/search/cs?searchtype=author&amp;query=Pishdadian%2C+F">Fatemeh Pishdadian</a>, <a href="/search/cs?searchtype=author&amp;query=Bello%2C+J+P">Juan Pablo Bello</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2105.02911v2-abstract-short" style="display: inline;"> While the estimation of what sound sources are, when they occur, and from where they originate has been well-studied, the estimation of how loud these sound sources are has been often overlooked. Current solutions to this task, which we refer to as source-specific sound level estimation (SSSLE), suffer from challenges due to the impracticality of acquiring realistic data and a lack of robustness t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2105.02911v2-abstract-full').style.display = 'inline'; document.getElementById('2105.02911v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2105.02911v2-abstract-full" style="display: none;"> While the estimation of what sound sources are, when they occur, and from where they originate has been well-studied, the estimation of how loud these sound sources are has been often overlooked. Current solutions to this task, which we refer to as source-specific sound level estimation (SSSLE), suffer from challenges due to the impracticality of acquiring realistic data and a lack of robustness to realistic recording conditions. Recently proposed weakly supervised source separation offer a means of leveraging clip-level source annotations to train source separation models, which we augment with modified loss functions to bridge the gap between source separation and SSSLE and to address the presence of background. We show that our approach improves SSSLE performance compared to baseline source separation models and provide an ablation analysis to explore our method&#39;s design choices, showing that SSSLE in practical recording and annotation scenarios is possible. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2105.02911v2-abstract-full').style.display = 'none'; document.getElementById('2105.02911v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 July, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 May, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 3 figures, WASPAA 2021 preprint</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2103.07362">arXiv:2103.07362</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2103.07362">pdf</a>, <a href="https://arxiv.org/format/2103.07362">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> PLADE-Net: Towards Pixel-Level Accuracy for Self-Supervised Single-View Depth Estimation with Neural Positional Encoding and Distilled Matting Loss </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bello%2C+J+L+G">Juan Luis Gonzalez Bello</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+M">Munchurl Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2103.07362v1-abstract-short" style="display: inline;"> In this paper, we propose a self-supervised single-view pixel-level accurate depth estimation network, called PLADE-Net. The PLADE-Net is the first work that shows unprecedented accuracy levels, exceeding 95\% in terms of the $未^1$ metric on the challenging KITTI dataset. Our PLADE-Net is based on a new network architecture with neural positional encoding and a novel loss function that borrows fro&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2103.07362v1-abstract-full').style.display = 'inline'; document.getElementById('2103.07362v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2103.07362v1-abstract-full" style="display: none;"> In this paper, we propose a self-supervised single-view pixel-level accurate depth estimation network, called PLADE-Net. The PLADE-Net is the first work that shows unprecedented accuracy levels, exceeding 95\% in terms of the $未^1$ metric on the challenging KITTI dataset. Our PLADE-Net is based on a new network architecture with neural positional encoding and a novel loss function that borrows from the closed-form solution of the matting Laplacian to learn pixel-level accurate depth estimation from stereo images. Neural positional encoding allows our PLADE-Net to obtain more consistent depth estimates by letting the network reason about location-specific image properties such as lens and projection distortions. Our novel distilled matting Laplacian loss allows our network to predict sharp depths at object boundaries and more consistent depths in highly homogeneous regions. Our proposed method outperforms all previous self-supervised single-view depth estimation methods by a large margin on the challenging KITTI dataset, with unprecedented levels of accuracy. Furthermore, our PLADE-Net, naively extended for stereo inputs, outperforms the most recent self-supervised stereo methods, even without any advanced blocks like 1D correlations, 3D convolutions, or spatial pyramid pooling. We present extensive ablation studies and experiments that support our method&#39;s effectiveness on the KITTI, CityScapes, and Make3D datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2103.07362v1-abstract-full').style.display = 'none'; document.getElementById('2103.07362v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 March, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted paper (poster) at CVPR2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2103.03727">arXiv:2103.03727</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2103.03727">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> Suicide Classificaction for News Media Using Convolutional Neural Network </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bello%2C+H+J">Hugo J. Bello</a>, <a href="/search/cs?searchtype=author&amp;query=Palomar-Ciria%2C+N">Nora Palomar-Ciria</a>, <a href="/search/cs?searchtype=author&amp;query=Baca-Garc%C3%ADa%2C+E">Enrique Baca-Garc铆a</a>, <a href="/search/cs?searchtype=author&amp;query=Lozano%2C+C">Celia Lozano</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2103.03727v1-abstract-short" style="display: inline;"> Currently, the process of evaluating suicides is highly subjective, which limits the efficacy and accuracy of prevention efforts. Artificial intelligence (AI) has emerged as a means of investigating large datasets to identify patterns within &#34;big data&#34; that can determine the factors on suicide outcomes. Here, we use AI tools to extract the topic from (press and social) media text. However, news me&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2103.03727v1-abstract-full').style.display = 'inline'; document.getElementById('2103.03727v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2103.03727v1-abstract-full" style="display: none;"> Currently, the process of evaluating suicides is highly subjective, which limits the efficacy and accuracy of prevention efforts. Artificial intelligence (AI) has emerged as a means of investigating large datasets to identify patterns within &#34;big data&#34; that can determine the factors on suicide outcomes. Here, we use AI tools to extract the topic from (press and social) media text. However, news media articles lack of suicide tags. Using tweets with hashtags related to sucide, we train a neuronal model which identifies if a given text has a suicidade-related contagion. Our results suggest a high level of the impact of mediatic into suicide cases, and a intrinsic thematic relationship of suicide news. These results pave the way to build more interpretable suicide data, which may help to better track, understand its origin, and improve prevention strategies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2103.03727v1-abstract-full').style.display = 'none'; document.getElementById('2103.03727v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2102.03229">arXiv:2102.03229</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2102.03229">pdf</a>, <a href="https://arxiv.org/format/2102.03229">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Multi-Task Self-Supervised Pre-Training for Music Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wu%2C+H">Ho-Hsiang Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Kao%2C+C">Chieh-Chi Kao</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+Q">Qingming Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Ming Sun</a>, <a href="/search/cs?searchtype=author&amp;query=McFee%2C+B">Brian McFee</a>, <a href="/search/cs?searchtype=author&amp;query=Bello%2C+J+P">Juan Pablo Bello</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+C">Chao Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2102.03229v1-abstract-short" style="display: inline;"> Deep learning is very data hungry, and supervised learning especially requires massive labeled data to work well. Machine listening research often suffers from limited labeled data problem, as human annotations are costly to acquire, and annotations for audio are time consuming and less intuitive. Besides, models learned from labeled dataset often embed biases specific to that particular dataset.&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.03229v1-abstract-full').style.display = 'inline'; document.getElementById('2102.03229v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2102.03229v1-abstract-full" style="display: none;"> Deep learning is very data hungry, and supervised learning especially requires massive labeled data to work well. Machine listening research often suffers from limited labeled data problem, as human annotations are costly to acquire, and annotations for audio are time consuming and less intuitive. Besides, models learned from labeled dataset often embed biases specific to that particular dataset. Therefore, unsupervised learning techniques become popular approaches in solving machine listening problems. Particularly, a self-supervised learning technique utilizing reconstructions of multiple hand-crafted audio features has shown promising results when it is applied to speech domain such as emotion recognition and automatic speech recognition (ASR). In this paper, we apply self-supervised and multi-task learning methods for pre-training music encoders, and explore various design choices including encoder architectures, weighting mechanisms to combine losses from multiple tasks, and worker selections of pretext tasks. We investigate how these design choices interact with various downstream music classification tasks. We find that using various music specific workers altogether with weighting mechanisms to balance the losses during pre-training helps improve and generalize to the downstream tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.03229v1-abstract-full').style.display = 'none'; document.getElementById('2102.03229v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 February, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Copyright 2021 IEEE. Personal use of this material is permitted. Permission from IEEE must be obtained for all other uses, in any current or future media, including reprinting/republishing this material for advertising or promotional purposes, creating new collective works, for resale or redistribution to servers or lists, or reuse of any copyrighted component of this work in other works</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2012.07490">arXiv:2012.07490</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2012.07490">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> Machine Learning to study the impact of gender-based violence in the news media </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bello%2C+H+J">Hugo J. Bello</a>, <a href="/search/cs?searchtype=author&amp;query=Palomar%2C+N">Nora Palomar</a>, <a href="/search/cs?searchtype=author&amp;query=Gallego%2C+E">Elisa Gallego</a>, <a href="/search/cs?searchtype=author&amp;query=Navascu%C3%A9s%2C+L+J">Lourdes Jim茅nez Navascu茅s</a>, <a href="/search/cs?searchtype=author&amp;query=Lozano%2C+C">Celia Lozano</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2012.07490v1-abstract-short" style="display: inline;"> While it remains a taboo topic, gender-based violence (GBV) undermines the health, dignity, security and autonomy of its victims. Many factors have been studied to generate or maintain this kind of violence, however, the influence of the media is still uncertain. Here, we use Machine Learning tools to extrapolate the effect of the news in GBV. By feeding neural networks with news, the topic inform&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2012.07490v1-abstract-full').style.display = 'inline'; document.getElementById('2012.07490v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2012.07490v1-abstract-full" style="display: none;"> While it remains a taboo topic, gender-based violence (GBV) undermines the health, dignity, security and autonomy of its victims. Many factors have been studied to generate or maintain this kind of violence, however, the influence of the media is still uncertain. Here, we use Machine Learning tools to extrapolate the effect of the news in GBV. By feeding neural networks with news, the topic information associated with each article can be recovered. Our findings show a relationship between GBV news and public awareness, the effect of mediatic GBV cases, and the intrinsic thematic relationship of GBV news. Because the used neural model can be easily adjusted, this also allows us to extend our approach to other media sources or topics <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2012.07490v1-abstract-full').style.display = 'none'; document.getElementById('2012.07490v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 November, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2020. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2009.05188">arXiv:2009.05188</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2009.05188">pdf</a>, <a href="https://arxiv.org/format/2009.05188">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> SONYC-UST-V2: An Urban Sound Tagging Dataset with Spatiotemporal Context </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Cartwright%2C+M">Mark Cartwright</a>, <a href="/search/cs?searchtype=author&amp;query=Cramer%2C+J">Jason Cramer</a>, <a href="/search/cs?searchtype=author&amp;query=Mendez%2C+A+E+M">Ana Elisa Mendez Mendez</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+H">Ho-Hsiang Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Lostanlen%2C+V">Vincent Lostanlen</a>, <a href="/search/cs?searchtype=author&amp;query=Fuentes%2C+M">Magdalena Fuentes</a>, <a href="/search/cs?searchtype=author&amp;query=Dove%2C+G">Graham Dove</a>, <a href="/search/cs?searchtype=author&amp;query=Mydlarz%2C+C">Charlie Mydlarz</a>, <a href="/search/cs?searchtype=author&amp;query=Salamon%2C+J">Justin Salamon</a>, <a href="/search/cs?searchtype=author&amp;query=Nov%2C+O">Oded Nov</a>, <a href="/search/cs?searchtype=author&amp;query=Bello%2C+J+P">Juan Pablo Bello</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2009.05188v1-abstract-short" style="display: inline;"> We present SONYC-UST-V2, a dataset for urban sound tagging with spatiotemporal information. This dataset is aimed for the development and evaluation of machine listening systems for real-world urban noise monitoring. While datasets of urban recordings are available, this dataset provides the opportunity to investigate how spatiotemporal metadata can aid in the prediction of urban sound tags. SONYC&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2009.05188v1-abstract-full').style.display = 'inline'; document.getElementById('2009.05188v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2009.05188v1-abstract-full" style="display: none;"> We present SONYC-UST-V2, a dataset for urban sound tagging with spatiotemporal information. This dataset is aimed for the development and evaluation of machine listening systems for real-world urban noise monitoring. While datasets of urban recordings are available, this dataset provides the opportunity to investigate how spatiotemporal metadata can aid in the prediction of urban sound tags. SONYC-UST-V2 consists of 18510 audio recordings from the &#34;Sounds of New York City&#34; (SONYC) acoustic sensor network, including the timestamp of audio acquisition and location of the sensor. The dataset contains annotations by volunteers from the Zooniverse citizen science platform, as well as a two-stage verification with our team. In this article, we describe our data collection procedure and propose evaluation metrics for multilabel classification of urban sound tags. We report the results of a simple baseline model that exploits spatiotemporal information. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2009.05188v1-abstract-full').style.display = 'none'; document.getElementById('2009.05188v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 September, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2020. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2008.02791">arXiv:2008.02791</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2008.02791">pdf</a>, <a href="https://arxiv.org/format/2008.02791">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Few-Shot Drum Transcription in Polyphonic Music </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Salamon%2C+J">Justin Salamon</a>, <a href="/search/cs?searchtype=author&amp;query=Cartwright%2C+M">Mark Cartwright</a>, <a href="/search/cs?searchtype=author&amp;query=Bryan%2C+N+J">Nicholas J. Bryan</a>, <a href="/search/cs?searchtype=author&amp;query=Bello%2C+J+P">Juan Pablo Bello</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2008.02791v1-abstract-short" style="display: inline;"> Data-driven approaches to automatic drum transcription (ADT) are often limited to a predefined, small vocabulary of percussion instrument classes. Such models cannot recognize out-of-vocabulary classes nor are they able to adapt to finer-grained vocabularies. In this work, we address open vocabulary ADT by introducing few-shot learning to the task. We train a Prototypical Network on a synthetic da&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2008.02791v1-abstract-full').style.display = 'inline'; document.getElementById('2008.02791v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2008.02791v1-abstract-full" style="display: none;"> Data-driven approaches to automatic drum transcription (ADT) are often limited to a predefined, small vocabulary of percussion instrument classes. Such models cannot recognize out-of-vocabulary classes nor are they able to adapt to finer-grained vocabularies. In this work, we address open vocabulary ADT by introducing few-shot learning to the task. We train a Prototypical Network on a synthetic dataset and evaluate the model on multiple real-world ADT datasets with polyphonic accompaniment. We show that, given just a handful of selected examples at inference time, we can match and in some cases outperform a state-of-the-art supervised ADT approach under a fixed vocabulary setting. At the same time, we show that our model can successfully generalize to finer-grained or extended vocabularies unseen during training, a scenario where supervised approaches cannot operate at all. We provide a detailed analysis of our experimental results, including a breakdown of performance by sound class and by polyphony. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2008.02791v1-abstract-full').style.display = 'none'; document.getElementById('2008.02791v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 August, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ISMIR 2020 camera-ready</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2003.01037">arXiv:2003.01037</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2003.01037">pdf</a>, <a href="https://arxiv.org/format/2003.01037">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> One or Two Components? The Scattering Transform Answers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lostanlen%2C+V">Vincent Lostanlen</a>, <a href="/search/cs?searchtype=author&amp;query=Cohen-Hadria%2C+A">Alice Cohen-Hadria</a>, <a href="/search/cs?searchtype=author&amp;query=Bello%2C+J+P">Juan Pablo Bello</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2003.01037v2-abstract-short" style="display: inline;"> With the aim of constructing a biologically plausible model of machine listening, we study the representation of a multicomponent stationary signal by a wavelet scattering network. First, we show that renormalizing second-order nodes by their first-order parents gives a simple numerical criterion to assess whether two neighboring components will interfere psychoacoustically. Secondly, we run a man&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2003.01037v2-abstract-full').style.display = 'inline'; document.getElementById('2003.01037v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2003.01037v2-abstract-full" style="display: none;"> With the aim of constructing a biologically plausible model of machine listening, we study the representation of a multicomponent stationary signal by a wavelet scattering network. First, we show that renormalizing second-order nodes by their first-order parents gives a simple numerical criterion to assess whether two neighboring components will interfere psychoacoustically. Secondly, we run a manifold learning algorithm (Isomap) on scattering coefficients to visualize the similarity space underlying parametric additive synthesis. Thirdly, we generalize the &#34;one or two components&#34; framework to three sine waves or more, and prove that the effective scattering depth of a Fourier series grows in logarithmic proportion to its bandwidth. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2003.01037v2-abstract-full').style.display = 'none'; document.getElementById('2003.01037v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 June, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 March, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 4 figures, in English. Proceedings of the European Signal Processing Conference (EUSIPCO 2020)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1911.00417">arXiv:1911.00417</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1911.00417">pdf</a>, <a href="https://arxiv.org/format/1911.00417">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.33682/ts6e-sn53">10.33682/ts6e-sn53 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Long-distance Detection of Bioacoustic Events with Per-channel Energy Normalization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lostanlen%2C+V">Vincent Lostanlen</a>, <a href="/search/cs?searchtype=author&amp;query=Palmer%2C+K">Kaitlin Palmer</a>, <a href="/search/cs?searchtype=author&amp;query=Knight%2C+E">Elly Knight</a>, <a href="/search/cs?searchtype=author&amp;query=Clark%2C+C">Christopher Clark</a>, <a href="/search/cs?searchtype=author&amp;query=Klinck%2C+H">Holger Klinck</a>, <a href="/search/cs?searchtype=author&amp;query=Farnsworth%2C+A">Andrew Farnsworth</a>, <a href="/search/cs?searchtype=author&amp;query=Wong%2C+T">Tina Wong</a>, <a href="/search/cs?searchtype=author&amp;query=Cramer%2C+J">Jason Cramer</a>, <a href="/search/cs?searchtype=author&amp;query=Bello%2C+J+P">Juan Pablo Bello</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1911.00417v1-abstract-short" style="display: inline;"> This paper proposes to perform unsupervised detection of bioacoustic events by pooling the magnitudes of spectrogram frames after per-channel energy normalization (PCEN). Although PCEN was originally developed for speech recognition, it also has beneficial effects in enhancing animal vocalizations, despite the presence of atmospheric absorption and intermittent noise. We prove that PCEN generalize&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1911.00417v1-abstract-full').style.display = 'inline'; document.getElementById('1911.00417v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1911.00417v1-abstract-full" style="display: none;"> This paper proposes to perform unsupervised detection of bioacoustic events by pooling the magnitudes of spectrogram frames after per-channel energy normalization (PCEN). Although PCEN was originally developed for speech recognition, it also has beneficial effects in enhancing animal vocalizations, despite the presence of atmospheric absorption and intermittent noise. We prove that PCEN generalizes logarithm-based spectral flux, yet with a tunable time scale for background noise estimation. In comparison with pointwise logarithm, PCEN reduces false alarm rate by 50x in the near field and 5x in the far field, both on avian and marine bioacoustic datasets. Such improvements come at moderate computational cost and require no human intervention, thus heralding a promising future for PCEN in bioacoustics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1911.00417v1-abstract-full').style.display = 'none'; document.getElementById('1911.00417v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 3 figures. Presented at the 3rd International Workshop on Detection and Classification of Acoustic Scenes and Events (DCASE). 25--26 October 2019, New York, NY, USA</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1910.10246">arXiv:1910.10246</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1910.10246">pdf</a>, <a href="https://arxiv.org/format/1910.10246">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Learning the helix topology of musical pitch </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lostanlen%2C+V">Vincent Lostanlen</a>, <a href="/search/cs?searchtype=author&amp;query=Sridhar%2C+S">Sripathi Sridhar</a>, <a href="/search/cs?searchtype=author&amp;query=McFee%2C+B">Brian McFee</a>, <a href="/search/cs?searchtype=author&amp;query=Farnsworth%2C+A">Andrew Farnsworth</a>, <a href="/search/cs?searchtype=author&amp;query=Bello%2C+J+P">Juan Pablo Bello</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1910.10246v2-abstract-short" style="display: inline;"> To explain the consonance of octaves, music psychologists represent pitch as a helix where azimuth and axial coordinate correspond to pitch class and pitch height respectively. This article addresses the problem of discovering this helical structure from unlabeled audio data. We measure Pearson correlations in the constant-Q transform (CQT) domain to build a K-nearest neighbor graph between freque&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1910.10246v2-abstract-full').style.display = 'inline'; document.getElementById('1910.10246v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1910.10246v2-abstract-full" style="display: none;"> To explain the consonance of octaves, music psychologists represent pitch as a helix where azimuth and axial coordinate correspond to pitch class and pitch height respectively. This article addresses the problem of discovering this helical structure from unlabeled audio data. We measure Pearson correlations in the constant-Q transform (CQT) domain to build a K-nearest neighbor graph between frequency subbands. Then, we run the Isomap manifold learning algorithm to represent this graph in a three-dimensional space in which straight lines approximate graph geodesics. Experiments on isolated musical notes demonstrate that the resulting manifold resembles a helix which makes a full turn at every octave. A circular shape is also found in English speech, but not in urban noise. We discuss the impact of various design choices on the visualization: instrumentarium, loudness mapping function, and number of neighbors K. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1910.10246v2-abstract-full').style.display = 'none'; document.getElementById('1910.10246v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 February, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 October, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 6 figures. To appear in the Proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP). Barcelona, Spain, May 2020</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1910.01089">arXiv:1910.01089</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1910.01089">pdf</a>, <a href="https://arxiv.org/format/1910.01089">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Deep 3D Pan via adaptive &#34;t-shaped&#34; convolutions with global and local adaptive dilations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bello%2C+J+L+G">Juan Luis Gonzalez Bello</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+M">Munchurl Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1910.01089v3-abstract-short" style="display: inline;"> Recent advances in deep learning have shown promising results in many low-level vision tasks. However, solving the single-image-based view synthesis is still an open problem. In particular, the generation of new images at parallel camera views given a single input image is of great interest, as it enables 3D visualization of the 2D input scenery. We propose a novel network architecture to perform&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1910.01089v3-abstract-full').style.display = 'inline'; document.getElementById('1910.01089v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1910.01089v3-abstract-full" style="display: none;"> Recent advances in deep learning have shown promising results in many low-level vision tasks. However, solving the single-image-based view synthesis is still an open problem. In particular, the generation of new images at parallel camera views given a single input image is of great interest, as it enables 3D visualization of the 2D input scenery. We propose a novel network architecture to perform stereoscopic view synthesis at arbitrary camera positions along the X-axis, or Deep 3D Pan, with &#34;t-shaped&#34; adaptive kernels equipped with globally and locally adaptive dilations. Our proposed network architecture, the monster-net, is devised with a novel &#34;t-shaped&#34; adaptive kernel with globally and locally adaptive dilation, which can efficiently incorporate global camera shift into and handle local 3D geometries of the target image&#39;s pixels for the synthesis of naturally looking 3D panned views when a 2-D input image is given. Extensive experiments were performed on the KITTI, CityScapes and our VICLAB_STEREO indoors dataset to prove the efficacy of our method. Our monster-net significantly outperforms the state-of-the-art method, SOTA, by a large margin in all metrics of RMSE, PSNR, and SSIM. Our proposed monster-net is capable of reconstructing more reliable image structures in synthesized images with coherent geometry. Moreover, the disparity information that can be extracted from the &#34;t-shaped&#34; kernel is much more reliable than that of the SOTA for the unsupervised monocular depth estimation task, confirming the effectiveness of our method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1910.01089v3-abstract-full').style.display = 'none'; document.getElementById('1910.01089v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 October, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 October, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Check our video at https://www.youtube.com/watch?v=o0b-e282Rt4</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1909.09349">arXiv:1909.09349</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1909.09349">pdf</a>, <a href="https://arxiv.org/format/1909.09349">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Deep 3D-Zoom Net: Unsupervised Learning of Photo-Realistic 3D-Zoom </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bello%2C+J+L+G">Juan Luis Gonzalez Bello</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+M">Munchurl Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1909.09349v2-abstract-short" style="display: inline;"> The 3D-zoom operation is the positive translation of the camera in the Z-axis, perpendicular to the image plane. In contrast, the optical zoom changes the focal length and the digital zoom is used to enlarge a certain region of an image to the original image size. In this paper, we are the first to formulate an unsupervised 3D-zoom learning problem where images with an arbitrary zoom factor can be&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1909.09349v2-abstract-full').style.display = 'inline'; document.getElementById('1909.09349v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1909.09349v2-abstract-full" style="display: none;"> The 3D-zoom operation is the positive translation of the camera in the Z-axis, perpendicular to the image plane. In contrast, the optical zoom changes the focal length and the digital zoom is used to enlarge a certain region of an image to the original image size. In this paper, we are the first to formulate an unsupervised 3D-zoom learning problem where images with an arbitrary zoom factor can be generated from a given single image. An unsupervised framework is convenient, as it is a challenging task to obtain a 3D-zoom dataset of natural scenes due to the need for special equipment to ensure camera movement is restricted to the Z-axis. In addition, the objects in the scenes should not move when being captured, which hinders the construction of a large dataset of outdoor scenes. We present a novel unsupervised framework to learn how to generate arbitrarily 3D-zoomed versions of a single image, not requiring a 3D-zoom ground truth, called the Deep 3D-Zoom Net. The Deep 3D-Zoom Net incorporates the following features: (i) transfer learning from a pre-trained disparity estimation network via a back re-projection reconstruction loss; (ii) a fully convolutional network architecture that models depth-image-based rendering (DIBR), taking into account high-frequency details without the need for estimating the intermediate disparity; and (iii) incorporating a discriminator network that acts as a no-reference penalty for unnaturally rendered areas. Even though there is no baseline to fairly compare our results, our method outperforms previous novel view synthesis research in terms of realistic appearance on large camera baselines. We performed extensive experiments to verify the effectiveness of our method on the KITTI and Cityscapes datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1909.09349v2-abstract-full').style.display = 'none'; document.getElementById('1909.09349v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 October, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 September, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Check our video at https://www.youtube.com/watch?v=Gz76VYwUzZ8</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1906.08512">arXiv:1906.08512</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1906.08512">pdf</a>, <a href="https://arxiv.org/format/1906.08512">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Adversarial Learning for Improved Onsets and Frames Music Transcription </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kim%2C+J+W">Jong Wook Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Bello%2C+J+P">Juan Pablo Bello</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1906.08512v1-abstract-short" style="display: inline;"> Automatic music transcription is considered to be one of the hardest problems in music information retrieval, yet recent deep learning approaches have achieved substantial improvements on transcription performance. These approaches commonly employ supervised learning models that predict various time-frequency representations, by minimizing element-wise losses such as the cross entropy function. Ho&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1906.08512v1-abstract-full').style.display = 'inline'; document.getElementById('1906.08512v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1906.08512v1-abstract-full" style="display: none;"> Automatic music transcription is considered to be one of the hardest problems in music information retrieval, yet recent deep learning approaches have achieved substantial improvements on transcription performance. These approaches commonly employ supervised learning models that predict various time-frequency representations, by minimizing element-wise losses such as the cross entropy function. However, applying the loss in this manner assumes conditional independence of each label given the input, and thus cannot accurately express inter-label dependencies. To address this issue, we introduce an adversarial training scheme that operates directly on the time-frequency representations and makes the output distribution closer to the ground-truth. Through adversarial learning, we achieve a consistent improvement in both frame-level and note-level metrics over Onsets and Frames, a state-of-the-art music transcription model. Our results show that adversarial learning can significantly reduce the error rate while increasing the confidence of the model estimations. Our approach is generic and applicable to any transcription model based on multi-label predictions, which are very common in music signal analysis. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1906.08512v1-abstract-full').style.display = 'none'; document.getElementById('1906.08512v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 June, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2019. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1905.08352">arXiv:1905.08352</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1905.08352">pdf</a>, <a href="https://arxiv.org/format/1905.08352">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1371/journal.pone.0214168">10.1371/journal.pone.0214168 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Robust sound event detection in bioacoustic sensor networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lostanlen%2C+V">Vincent Lostanlen</a>, <a href="/search/cs?searchtype=author&amp;query=Salamon%2C+J">Justin Salamon</a>, <a href="/search/cs?searchtype=author&amp;query=Farnsworth%2C+A">Andrew Farnsworth</a>, <a href="/search/cs?searchtype=author&amp;query=Kelling%2C+S">Steve Kelling</a>, <a href="/search/cs?searchtype=author&amp;query=Bello%2C+J+P">Juan Pablo Bello</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1905.08352v2-abstract-short" style="display: inline;"> Bioacoustic sensors, sometimes known as autonomous recording units (ARUs), can record sounds of wildlife over long periods of time in scalable and minimally invasive ways. Deriving per-species abundance estimates from these sensors requires detection, classification, and quantification of animal vocalizations as individual acoustic events. Yet, variability in ambient noise, both over time and acro&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1905.08352v2-abstract-full').style.display = 'inline'; document.getElementById('1905.08352v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1905.08352v2-abstract-full" style="display: none;"> Bioacoustic sensors, sometimes known as autonomous recording units (ARUs), can record sounds of wildlife over long periods of time in scalable and minimally invasive ways. Deriving per-species abundance estimates from these sensors requires detection, classification, and quantification of animal vocalizations as individual acoustic events. Yet, variability in ambient noise, both over time and across sensors, hinders the reliability of current automated systems for sound event detection (SED), such as convolutional neural networks (CNN) in the time-frequency domain. In this article, we develop, benchmark, and combine several machine listening techniques to improve the generalizability of SED models across heterogeneous acoustic environments. As a case study, we consider the problem of detecting avian flight calls from a ten-hour recording of nocturnal bird migration, recorded by a network of six ARUs in the presence of heterogeneous background noise. Starting from a CNN yielding state-of-the-art accuracy on this task, we introduce two noise adaptation techniques, respectively integrating short-term (60 milliseconds) and long-term (30 minutes) context. First, we apply per-channel energy normalization (PCEN) in the time-frequency domain, which applies short-term automatic gain control to every subband in the mel-frequency spectrogram. Secondly, we replace the last dense layer in the network by a context-adaptive neural network (CA-NN) layer. Combining them yields state-of-the-art results that are unmatched by artificial data augmentation alone. We release a pre-trained version of our best performing system under the name of BirdVoxDetect, a ready-to-use detector of avian flight calls in field recordings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1905.08352v2-abstract-full').style.display = 'none'; document.getElementById('1905.08352v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 May, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">32 pages, in English. Submitted to PLOS ONE journal in February 2019; revised August 2019; published October 2019</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1903.08514">arXiv:1903.08514</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1903.08514">pdf</a>, <a href="https://arxiv.org/format/1903.08514">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> A Novel Monocular Disparity Estimation Network with Domain Transformation and Ambiguity Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bello%2C+J+L+G">Juan Luis Gonzalez Bello</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+M">Munchurl Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1903.08514v1-abstract-short" style="display: inline;"> Convolutional neural networks (CNN) have shown state-of-the-art results for low-level computer vision problems such as stereo and monocular disparity estimations, but still, have much room to further improve their performance in terms of accuracy, numbers of parameters, etc. Recent works have uncovered the advantages of using an unsupervised scheme to train CNN&#39;s to estimate monocular disparity, w&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1903.08514v1-abstract-full').style.display = 'inline'; document.getElementById('1903.08514v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1903.08514v1-abstract-full" style="display: none;"> Convolutional neural networks (CNN) have shown state-of-the-art results for low-level computer vision problems such as stereo and monocular disparity estimations, but still, have much room to further improve their performance in terms of accuracy, numbers of parameters, etc. Recent works have uncovered the advantages of using an unsupervised scheme to train CNN&#39;s to estimate monocular disparity, where only the relatively-easy-to-obtain stereo images are needed for training. We propose a novel encoder-decoder architecture that outperforms previous unsupervised monocular depth estimation networks by (i) taking into account ambiguities, (ii) efficient fusion between encoder and decoder features with rectangular convolutions and (iii) domain transformations between encoder and decoder. Our architecture outperforms the Monodepth baseline in all metrics, even with a considerable reduction of parameters. Furthermore, our architecture is capable of estimating a full disparity map in a single forward pass, whereas the baseline needs two passes. We perform extensive experiments to verify the effectiveness of our method on the KITTI dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1903.08514v1-abstract-full').style.display = 'none'; document.getElementById('1903.08514v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 March, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2019. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1903.03195">arXiv:1903.03195</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1903.03195">pdf</a>, <a href="https://arxiv.org/format/1903.03195">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.3390/s19061415">10.3390/s19061415 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> The life of a New York City noise sensor network </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Mydlarz%2C+C">Charlie Mydlarz</a>, <a href="/search/cs?searchtype=author&amp;query=Sharma%2C+M">Mohit Sharma</a>, <a href="/search/cs?searchtype=author&amp;query=Lockerman%2C+Y">Yitzchak Lockerman</a>, <a href="/search/cs?searchtype=author&amp;query=Steers%2C+B">Ben Steers</a>, <a href="/search/cs?searchtype=author&amp;query=Silva%2C+C">Claudio Silva</a>, <a href="/search/cs?searchtype=author&amp;query=Bello%2C+J+P">Juan Pablo Bello</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1903.03195v2-abstract-short" style="display: inline;"> Noise pollution is one of the topmost quality of life issues for urban residents in the United States. Continued exposure to high levels of noise has proven effects on health, including acute effects such as sleep disruption, and long-term effects such as hypertension, heart disease, and hearing loss. To investigate and ultimately aid in the mitigation of urban noise, a network of 55 sensor nodes&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1903.03195v2-abstract-full').style.display = 'inline'; document.getElementById('1903.03195v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1903.03195v2-abstract-full" style="display: none;"> Noise pollution is one of the topmost quality of life issues for urban residents in the United States. Continued exposure to high levels of noise has proven effects on health, including acute effects such as sleep disruption, and long-term effects such as hypertension, heart disease, and hearing loss. To investigate and ultimately aid in the mitigation of urban noise, a network of 55 sensor nodes has been deployed across New York City for over two years, collecting sound pressure level (SPL) and audio data. This network has cumulatively amassed over 75 years of calibrated, high-resolution SPL measurements and 35 years of audio data. In addition, high frequency telemetry data has been collected that provides an indication of a sensors&#39; health. This telemetry data was analyzed over an 18 month period across 31 of the sensors. It has been used to develop a prototype model for pre-failure detection which has the ability to identify sensors in a prefail state 69.1% of the time. The entire network infrastructure is outlined, including the operation of the sensors, followed by an analysis of its data yield and the development of the fault detection approach and the future system integration plans for this. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1903.03195v2-abstract-full').style.display = 'none'; document.getElementById('1903.03195v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 March, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This article belongs to the Section Intelligent Sensors, 24 pages, 15 figures, 3 tables, 45 references</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Sensors 2019, 19, 1415 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1811.00223">arXiv:1811.00223</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1811.00223">pdf</a>, <a href="https://arxiv.org/format/1811.00223">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Neural Music Synthesis for Flexible Timbre Control </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kim%2C+J+W">Jong Wook Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Bittner%2C+R">Rachel Bittner</a>, <a href="/search/cs?searchtype=author&amp;query=Kumar%2C+A">Aparna Kumar</a>, <a href="/search/cs?searchtype=author&amp;query=Bello%2C+J+P">Juan Pablo Bello</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1811.00223v1-abstract-short" style="display: inline;"> The recent success of raw audio waveform synthesis models like WaveNet motivates a new approach for music synthesis, in which the entire process --- creating audio samples from a score and instrument information --- is modeled using generative neural networks. This paper describes a neural music synthesis model with flexible timbre controls, which consists of a recurrent neural network conditioned&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1811.00223v1-abstract-full').style.display = 'inline'; document.getElementById('1811.00223v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1811.00223v1-abstract-full" style="display: none;"> The recent success of raw audio waveform synthesis models like WaveNet motivates a new approach for music synthesis, in which the entire process --- creating audio samples from a score and instrument information --- is modeled using generative neural networks. This paper describes a neural music synthesis model with flexible timbre controls, which consists of a recurrent neural network conditioned on a learned instrument embedding followed by a WaveNet vocoder. The learned embedding space successfully captures the diverse variations in timbres within a large dataset and enables timbre control and morphing by interpolating between instruments in the embedding space. The synthesis quality is evaluated both numerically and perceptually, and an interactive web demo is presented. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1811.00223v1-abstract-full').style.display = 'none'; document.getElementById('1811.00223v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2018. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1809.00381">arXiv:1809.00381</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1809.00381">pdf</a>, <a href="https://arxiv.org/format/1809.00381">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Multitask Learning for Fundamental Frequency Estimation in Music </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bittner%2C+R+M">Rachel M. Bittner</a>, <a href="/search/cs?searchtype=author&amp;query=McFee%2C+B">Brian McFee</a>, <a href="/search/cs?searchtype=author&amp;query=Bello%2C+J+P">Juan P. Bello</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1809.00381v1-abstract-short" style="display: inline;"> Fundamental frequency (f0) estimation from polyphonic music includes the tasks of multiple-f0, melody, vocal, and bass line estimation. Historically these problems have been approached separately, and only recently, using learning-based approaches. We present a multitask deep learning architecture that jointly estimates outputs for various tasks including multiple-f0, melody, vocal and bass line e&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1809.00381v1-abstract-full').style.display = 'inline'; document.getElementById('1809.00381v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1809.00381v1-abstract-full" style="display: none;"> Fundamental frequency (f0) estimation from polyphonic music includes the tasks of multiple-f0, melody, vocal, and bass line estimation. Historically these problems have been approached separately, and only recently, using learning-based approaches. We present a multitask deep learning architecture that jointly estimates outputs for various tasks including multiple-f0, melody, vocal and bass line estimation, and is trained using a large, semi-automatically annotated dataset. We show that the multitask model outperforms its single-task counterparts, and explore the effect of various design decisions in our approach, and show that it performs better or at least competitively when compared against strong baseline methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1809.00381v1-abstract-full').style.display = 'none'; document.getElementById('1809.00381v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 September, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2018. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1805.00889">arXiv:1805.00889</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1805.00889">pdf</a>, <a href="https://arxiv.org/format/1805.00889">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> SONYC: A System for the Monitoring, Analysis and Mitigation of Urban Noise Pollution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bello%2C+J+P">Juan Pablo Bello</a>, <a href="/search/cs?searchtype=author&amp;query=Silva%2C+C">Claudio Silva</a>, <a href="/search/cs?searchtype=author&amp;query=Nov%2C+O">Oded Nov</a>, <a href="/search/cs?searchtype=author&amp;query=DuBois%2C+R+L">R. Luke DuBois</a>, <a href="/search/cs?searchtype=author&amp;query=Arora%2C+A">Anish Arora</a>, <a href="/search/cs?searchtype=author&amp;query=Salamon%2C+J">Justin Salamon</a>, <a href="/search/cs?searchtype=author&amp;query=Mydlarz%2C+C">Charles Mydlarz</a>, <a href="/search/cs?searchtype=author&amp;query=Doraiswamy%2C+H">Harish Doraiswamy</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1805.00889v2-abstract-short" style="display: inline;"> We present the Sounds of New York City (SONYC) project, a smart cities initiative focused on developing a cyber-physical system for the monitoring, analysis and mitigation of urban noise pollution. Noise pollution is one of the topmost quality of life issues for urban residents in the U.S. with proven effects on health, education, the economy, and the environment. Yet, most cities lack the resourc&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1805.00889v2-abstract-full').style.display = 'inline'; document.getElementById('1805.00889v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1805.00889v2-abstract-full" style="display: none;"> We present the Sounds of New York City (SONYC) project, a smart cities initiative focused on developing a cyber-physical system for the monitoring, analysis and mitigation of urban noise pollution. Noise pollution is one of the topmost quality of life issues for urban residents in the U.S. with proven effects on health, education, the economy, and the environment. Yet, most cities lack the resources to continuously monitor noise and understand the contribution of individual sources, the tools to analyze patterns of noise pollution at city-scale, and the means to empower city agencies to take effective, data-driven action for noise mitigation. The SONYC project advances novel technological and socio-technical solutions that help address these needs. SONYC includes a distributed network of both sensors and people for large-scale noise monitoring. The sensors use low-cost, low-power technology, and cutting-edge machine listening techniques, to produce calibrated acoustic measurements and recognize individual sound sources in real time. Citizen science methods are used to help urban residents connect to city agencies and each other, understand their noise footprint, and facilitate reporting and self-regulation. Crucially, SONYC utilizes big data solutions to analyze, retrieve and visualize information from sensors and citizens, creating a comprehensive acoustic model of the city that can be used to identify significant patterns of noise pollution. These data can be used to drive the strategic application of noise code enforcement by city agencies to optimize the reduction of noise pollution. The entire system, integrating cyber, physical and social infrastructure, forms a closed loop of continuous sensing, analysis and actuation on the environment. SONYC provides a blueprint for the mitigation of noise pollution that can potentially be applied to other cities in the US and abroad. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1805.00889v2-abstract-full').style.display = 'none'; document.getElementById('1805.00889v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 May, 2018; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 May, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2018. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted May 2018, Communications of the ACM. This is the author&#39;s version of the work. It is posted here for your personal use. Not for redistribution. The definitive Version of Record will be published in Communications of the ACM</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1804.10070">arXiv:1804.10070</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1804.10070">pdf</a>, <a href="https://arxiv.org/format/1804.10070">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Adaptive pooling operators for weakly labeled sound event detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=McFee%2C+B">Brian McFee</a>, <a href="/search/cs?searchtype=author&amp;query=Salamon%2C+J">Justin Salamon</a>, <a href="/search/cs?searchtype=author&amp;query=Bello%2C+J+P">Juan Pablo Bello</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1804.10070v2-abstract-short" style="display: inline;"> Sound event detection (SED) methods are tasked with labeling segments of audio recordings by the presence of active sound sources. SED is typically posed as a supervised machine learning problem, requiring strong annotations for the presence or absence of each sound source at every time instant within the recording. However, strong annotations of this type are both labor- and cost-intensive for hu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1804.10070v2-abstract-full').style.display = 'inline'; document.getElementById('1804.10070v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1804.10070v2-abstract-full" style="display: none;"> Sound event detection (SED) methods are tasked with labeling segments of audio recordings by the presence of active sound sources. SED is typically posed as a supervised machine learning problem, requiring strong annotations for the presence or absence of each sound source at every time instant within the recording. However, strong annotations of this type are both labor- and cost-intensive for human annotators to produce, which limits the practical scalability of SED methods. In this work, we treat SED as a multiple instance learning (MIL) problem, where training labels are static over a short excerpt, indicating the presence or absence of sound sources but not their temporal locality. The models, however, must still produce temporally dynamic predictions, which must be aggregated (pooled) when comparing against static labels during training. To facilitate this aggregation, we develop a family of adaptive pooling operators---referred to as auto-pool---which smoothly interpolate between common pooling operators, such as min-, max-, or average-pooling, and automatically adapt to the characteristics of the sound sources in question. We evaluate the proposed pooling operators on three datasets, and demonstrate that in each case, the proposed methods outperform non-adaptive pooling operators for static prediction, and nearly match the performance of models trained with strong, dynamic annotations. The proposed method is evaluated in conjunction with convolutional neural networks, but can be readily applied to any differentiable model for time-series label prediction. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1804.10070v2-abstract-full').style.display = 'none'; document.getElementById('1804.10070v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 August, 2018; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 April, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2018. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1802.06182">arXiv:1802.06182</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1802.06182">pdf</a>, <a href="https://arxiv.org/format/1802.06182">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> CREPE: A Convolutional Representation for Pitch Estimation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kim%2C+J+W">Jong Wook Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Salamon%2C+J">Justin Salamon</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+P">Peter Li</a>, <a href="/search/cs?searchtype=author&amp;query=Bello%2C+J+P">Juan Pablo Bello</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1802.06182v1-abstract-short" style="display: inline;"> The task of estimating the fundamental frequency of a monophonic sound recording, also known as pitch tracking, is fundamental to audio processing with multiple applications in speech processing and music information retrieval. To date, the best performing techniques, such as the pYIN algorithm, are based on a combination of DSP pipelines and heuristics. While such techniques perform very well on&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1802.06182v1-abstract-full').style.display = 'inline'; document.getElementById('1802.06182v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1802.06182v1-abstract-full" style="display: none;"> The task of estimating the fundamental frequency of a monophonic sound recording, also known as pitch tracking, is fundamental to audio processing with multiple applications in speech processing and music information retrieval. To date, the best performing techniques, such as the pYIN algorithm, are based on a combination of DSP pipelines and heuristics. While such techniques perform very well on average, there remain many cases in which they fail to correctly estimate the pitch. In this paper, we propose a data-driven pitch tracking algorithm, CREPE, which is based on a deep convolutional neural network that operates directly on the time-domain waveform. We show that the proposed model produces state-of-the-art results, performing equally or better than pYIN. Furthermore, we evaluate the model&#39;s generalizability in terms of noise robustness. A pre-trained version of CREPE is made freely available as an open-source Python module for easy application. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1802.06182v1-abstract-full').style.display = 'none'; document.getElementById('1802.06182v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 February, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2018. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICASSP 2018</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1608.04363">arXiv:1608.04363</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1608.04363">pdf</a>, <a href="https://arxiv.org/format/1608.04363">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/LSP.2017.2657381">10.1109/LSP.2017.2657381 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Deep Convolutional Neural Networks and Data Augmentation for Environmental Sound Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Salamon%2C+J">Justin Salamon</a>, <a href="/search/cs?searchtype=author&amp;query=Bello%2C+J+P">Juan Pablo Bello</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1608.04363v2-abstract-short" style="display: inline;"> The ability of deep convolutional neural networks (CNN) to learn discriminative spectro-temporal patterns makes them well suited to environmental sound classification. However, the relative scarcity of labeled data has impeded the exploitation of this family of high-capacity models. This study has two primary contributions: first, we propose a deep convolutional neural network architecture for env&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1608.04363v2-abstract-full').style.display = 'inline'; document.getElementById('1608.04363v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1608.04363v2-abstract-full" style="display: none;"> The ability of deep convolutional neural networks (CNN) to learn discriminative spectro-temporal patterns makes them well suited to environmental sound classification. However, the relative scarcity of labeled data has impeded the exploitation of this family of high-capacity models. This study has two primary contributions: first, we propose a deep convolutional neural network architecture for environmental sound classification. Second, we propose the use of audio data augmentation for overcoming the problem of data scarcity and explore the influence of different augmentations on the performance of the proposed CNN architecture. Combined with data augmentation, the proposed model produces state-of-the-art results for environmental sound classification. We show that the improved performance stems from the combination of a deep, high-capacity model and an augmented training set: this combination outperforms both the proposed CNN without augmentation and a &#34;shallow&#34; dictionary learning model with augmentation. Finally, we examine the influence of each augmentation on the model&#39;s classification accuracy for each class, and observe that the accuracy for each class is influenced differently by each augmentation, suggesting that the performance of the model could be improved further by applying class-conditional data augmentation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1608.04363v2-abstract-full').style.display = 'none'; document.getElementById('1608.04363v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 November, 2016; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 August, 2016; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2016. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted November 2016, IEEE Signal Processing Letters. Copyright IEEE. Personal use of this material is permitted. Permission from IEEE must be obtained for all other uses, in any current or future media, including reprinting/republishing this material, creating new collective works, for resale or redistribution, or reuse of any copyrighted component of this work in other works</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1605.08450">arXiv:1605.08450</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1605.08450">pdf</a>, <a href="https://arxiv.org/format/1605.08450">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> The Implementation of Low-cost Urban Acoustic Monitoring Devices </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Mydlarz%2C+C">Charlie Mydlarz</a>, <a href="/search/cs?searchtype=author&amp;query=Salamon%2C+J">Justin Salamon</a>, <a href="/search/cs?searchtype=author&amp;query=Bello%2C+J+P">Juan Pablo Bello</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1605.08450v1-abstract-short" style="display: inline;"> The urban sound environment of New York City (NYC) can be, amongst other things: loud, intrusive, exciting and dynamic. As indicated by the large majority of noise complaints registered with the NYC 311 information/complaints line, the urban sound environment has a profound effect on the quality of life of the city&#39;s inhabitants. To monitor and ultimately understand these sonic environments, a pro&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1605.08450v1-abstract-full').style.display = 'inline'; document.getElementById('1605.08450v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1605.08450v1-abstract-full" style="display: none;"> The urban sound environment of New York City (NYC) can be, amongst other things: loud, intrusive, exciting and dynamic. As indicated by the large majority of noise complaints registered with the NYC 311 information/complaints line, the urban sound environment has a profound effect on the quality of life of the city&#39;s inhabitants. To monitor and ultimately understand these sonic environments, a process of long-term acoustic measurement and analysis is required. The traditional method of environmental acoustic monitoring utilizes short term measurement periods using expensive equipment, setup and operated by experienced and costly personnel. In this paper a different approach is proposed to this application which implements a smart, low-cost, static, acoustic sensing device based around consumer hardware. These devices can be deployed in numerous and varied urban locations for long periods of time, allowing for the collection of longitudinal urban acoustic data. The varied environmental conditions of urban settings make for a challenge in gathering calibrated sound pressure level data for prospective stakeholders. This paper details the sensors&#39; design, development and potential future applications, with a focus on the calibration of the devices&#39; Microelectromechanical systems (MEMS) microphone in order to generate reliable decibel levels at the type/class 2 level. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1605.08450v1-abstract-full').style.display = 'none'; document.getElementById('1605.08450v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 May, 2016; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2016. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted into the Journal of Applied Acoustics special issue: Acoustics of Smart Cities. 26 pages, 12 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> H.5.5; C.0; C.3; C.4 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1605.08396">arXiv:1605.08396</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1605.08396">pdf</a>, <a href="https://arxiv.org/format/1605.08396">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> Robust Downbeat Tracking Using an Ensemble of Convolutional Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Durand%2C+S">S. Durand</a>, <a href="/search/cs?searchtype=author&amp;query=Bello%2C+J+P">J. P. Bello</a>, <a href="/search/cs?searchtype=author&amp;query=David%2C+B">B. David</a>, <a href="/search/cs?searchtype=author&amp;query=Richard%2C+G">G. Richard</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1605.08396v1-abstract-short" style="display: inline;"> In this paper, we present a novel state of the art system for automatic downbeat tracking from music signals. The audio signal is first segmented in frames which are synchronized at the tatum level of the music. We then extract different kind of features based on harmony, melody, rhythm and bass content to feed convolutional neural networks that are adapted to take advantage of each feature charac&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1605.08396v1-abstract-full').style.display = 'inline'; document.getElementById('1605.08396v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1605.08396v1-abstract-full" style="display: none;"> In this paper, we present a novel state of the art system for automatic downbeat tracking from music signals. The audio signal is first segmented in frames which are synchronized at the tatum level of the music. We then extract different kind of features based on harmony, melody, rhythm and bass content to feed convolutional neural networks that are adapted to take advantage of each feature characteristics. This ensemble of neural networks is combined to obtain one downbeat likelihood per tatum. The downbeat sequence is finally decoded with a flexible and efficient temporal model which takes advantage of the metrical continuity of a song. We then perform an evaluation of our system on a large base of 9 datasets, compare its performance to 4 other published algorithms and obtain a significant increase of 16.8 percent points compared to the second best system, for altogether a moderate cost in test and training. The influence of each step of the method is studied to show its strengths and shortcomings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1605.08396v1-abstract-full').style.display = 'none'; document.getElementById('1605.08396v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 May, 2016; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2016. </p> </li> </ol> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

Pages: 1 2 3 4 5 6 7 8 9 10