Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–35 of 35 results for author: <span class="mathjax">Lostanlen, V</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Lostanlen%2C+V">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Lostanlen, V"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Lostanlen%2C+V&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Lostanlen, V"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.12907">arXiv:2501.12907</a> <span> [<a href="https://arxiv.org/pdf/2501.12907">pdf</a>, <a href="https://arxiv.org/format/2501.12907">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> S-KEY: Self-supervised Learning of Major and Minor Keys from Audio </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kong%2C+Y">Yuexuan Kong</a>, <a href="/search/cs?searchtype=author&query=Meseguer-Brocal%2C+G">Gabriel Meseguer-Brocal</a>, <a href="/search/cs?searchtype=author&query=Lostanlen%2C+V">Vincent Lostanlen</a>, <a href="/search/cs?searchtype=author&query=Lagrange%2C+M">Mathieu Lagrange</a>, <a href="/search/cs?searchtype=author&query=Hennequin%2C+R">Romain Hennequin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.12907v1-abstract-short" style="display: inline;"> STONE, the current method in self-supervised learning for tonality estimation in music signals, cannot distinguish relative keys, such as C major versus A minor. In this article, we extend the neural network architecture and learning objective of STONE to perform self-supervised learning of major and minor keys (S-KEY). Our main contribution is an auxiliary pretext task to STONE, formulated using… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12907v1-abstract-full').style.display = 'inline'; document.getElementById('2501.12907v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.12907v1-abstract-full" style="display: none;"> STONE, the current method in self-supervised learning for tonality estimation in music signals, cannot distinguish relative keys, such as C major versus A minor. In this article, we extend the neural network architecture and learning objective of STONE to perform self-supervised learning of major and minor keys (S-KEY). Our main contribution is an auxiliary pretext task to STONE, formulated using transposition-invariant chroma features as a source of pseudo-labels. S-KEY matches the supervised state of the art in tonality estimation on FMAKv2 and GTZAN datasets while requiring no human annotation and having the same parameter budget as STONE. 
We build upon this result and expand the training set of S-KEY to a million songs, thus showing the potential of large-scale self-supervised learning in music information retrieval. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12907v1-abstract-full').style.display = 'none'; document.getElementById('2501.12907v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.11439">arXiv:2409.11439</a> <span> [<a href="https://arxiv.org/pdf/2409.11439">pdf</a>, <a href="https://arxiv.org/format/2409.11439">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Machine listening in a neonatal intensive care unit </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tailleur%2C+M">Modan Tailleur</a>, <a href="/search/cs?searchtype=author&query=Lostanlen%2C+V">Vincent Lostanlen</a>, <a href="/search/cs?searchtype=author&query=Rivi%C3%A8re%2C+J">Jean-Philippe Rivi猫re</a>, <a href="/search/cs?searchtype=author&query=Aumond%2C+P">Pierre Aumond</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.11439v2-abstract-short" style="display: inline;"> Oxygenators, alarm devices, and footsteps are some of the most common sound sources in a hospital. Detecting them has scientific value for environmental psychology but comes with challenges of its own: namely, privacy preservation and limited labeled data. In this paper, we address these two challenges via a combination of edge computing and cloud computing. For privacy preservation, we have desig… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.11439v2-abstract-full').style.display = 'inline'; document.getElementById('2409.11439v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.11439v2-abstract-full" style="display: none;"> Oxygenators, alarm devices, and footsteps are some of the most common sound sources in a hospital. Detecting them has scientific value for environmental psychology but comes with challenges of its own: namely, privacy preservation and limited labeled data. In this paper, we address these two challenges via a combination of edge computing and cloud computing. For privacy preservation, we have designed an acoustic sensor which computes third-octave spectrograms on the fly instead of recording audio waveforms. For sample-efficient machine learning, we have repurposed a pretrained audio neural network (PANN) via spectral transcoding and label space adaptation. 
A small-scale study in a neonatological intensive care unit (NICU) confirms that the time series of detected events align with another modality of measurement: i.e., electronic badges for parents and healthcare professionals. Hence, this paper demonstrates the feasibility of polyphonic machine listening in a hospital ward while guaranteeing privacy by design. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.11439v2-abstract-full').style.display = 'none'; document.getElementById('2409.11439v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Workshop on Detection and Classification of Acoustic Scenes and Events (DCASE), Oct 2024, Tokyo, Japan </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.17358">arXiv:2408.17358</a> <span> [<a href="https://arxiv.org/pdf/2408.17358">pdf</a>, <a href="https://arxiv.org/format/2408.17358">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Hold Me Tight: Stable Encoder-Decoder Design for Speech Enhancement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Haider%2C+D">Daniel Haider</a>, <a href="/search/cs?searchtype=author&query=Perfler%2C+F">Felix Perfler</a>, <a href="/search/cs?searchtype=author&query=Lostanlen%2C+V">Vincent Lostanlen</a>, <a href="/search/cs?searchtype=author&query=Ehler%2C+M">Martin Ehler</a>, <a href="/search/cs?searchtype=author&query=Balazs%2C+P">Peter Balazs</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.17358v1-abstract-short" style="display: inline;"> Convolutional layers with 1-D filters are often used as frontend to encode audio signals. Unlike fixed time-frequency representations, they can adapt to the local characteristics of input data. However, 1-D filters on raw audio are hard to train and often suffer from instabilities. In this paper, we address these problems with hybrid solutions, i.e., combining theory-driven and data-driven approac… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.17358v1-abstract-full').style.display = 'inline'; document.getElementById('2408.17358v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.17358v1-abstract-full" style="display: none;"> Convolutional layers with 1-D filters are often used as frontend to encode audio signals. Unlike fixed time-frequency representations, they can adapt to the local characteristics of input data. 
However, 1-D filters on raw audio are hard to train and often suffer from instabilities. In this paper, we address these problems with hybrid solutions, i.e., combining theory-driven and data-driven approaches. First, we preprocess the audio signals via a auditory filterbank, guaranteeing good frequency localization for the learned encoder. Second, we use results from frame theory to define an unsupervised learning objective that encourages energy conservation and perfect reconstruction. Third, we adapt mixed compressed spectral norms as learning objectives to the encoder coefficients. Using these solutions in a low-complexity encoder-mask-decoder model significantly improves the perceptual evaluation of speech quality (PESQ) in speech enhancement. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.17358v1-abstract-full').style.display = 'none'; document.getElementById('2408.17358v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at INTERSPEECH 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.07408">arXiv:2407.07408</a> <span> [<a href="https://arxiv.org/pdf/2407.07408">pdf</a>, <a href="https://arxiv.org/format/2407.07408">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> STONE: Self-supervised Tonality Estimator </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kong%2C+Y">Yuexuan Kong</a>, <a href="/search/cs?searchtype=author&query=Lostanlen%2C+V">Vincent Lostanlen</a>, <a href="/search/cs?searchtype=author&query=Meseguer-Brocal%2C+G">Gabriel Meseguer-Brocal</a>, <a href="/search/cs?searchtype=author&query=Wong%2C+S">Stella Wong</a>, <a href="/search/cs?searchtype=author&query=Lagrange%2C+M">Mathieu Lagrange</a>, <a href="/search/cs?searchtype=author&query=Hennequin%2C+R">Romain Hennequin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.07408v3-abstract-short" style="display: inline;"> Although deep neural networks can estimate the key of a musical piece, their supervision incurs a massive annotation effort. Against this shortcoming, we present STONE, the first self-supervised tonality estimator. The architecture behind STONE, named ChromaNet, is a convnet with octave equivalence which outputs a key signature profile (KSP) of 12 structured logits. 
First, we train ChromaNet to re… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.07408v3-abstract-full').style.display = 'inline'; document.getElementById('2407.07408v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.07408v3-abstract-full" style="display: none;"> Although deep neural networks can estimate the key of a musical piece, their supervision incurs a massive annotation effort. Against this shortcoming, we present STONE, the first self-supervised tonality estimator. The architecture behind STONE, named ChromaNet, is a convnet with octave equivalence which outputs a key signature profile (KSP) of 12 structured logits. First, we train ChromaNet to regress artificial pitch transpositions between any two unlabeled musical excerpts from the same audio track, as measured as cross-power spectral density (CPSD) within the circle of fifths (CoF). We observe that this self-supervised pretext task leads KSP to correlate with tonal key signature. Based on this observation, we extend STONE to output a structured KSP of 24 logits, and introduce supervision so as to disambiguate major versus minor keys sharing the same key signature. Applying different amounts of supervision yields semi-supervised and fully supervised tonality estimators: i.e., Semi-TONEs and Sup-TONEs. We evaluate these estimators on FMAK, a new dataset of 5489 real-world musical recordings with expert annotation of 24 major and minor keys. We find that Semi-TONE matches the classification accuracy of Sup-TONE with reduced supervision and outperforms it with equal supervision. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.07408v3-abstract-full').style.display = 'none'; document.getElementById('2407.07408v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.09598">arXiv:2403.09598</a> <span> [<a href="https://arxiv.org/pdf/2403.09598">pdf</a>, <a href="https://arxiv.org/format/2403.09598">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Mixture of Mixups for Multi-label Classification of Rare Anuran Sounds </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Moummad%2C+I">Ilyass Moummad</a>, <a href="/search/cs?searchtype=author&query=Farrugia%2C+N">Nicolas Farrugia</a>, <a href="/search/cs?searchtype=author&query=Serizel%2C+R">Romain Serizel</a>, <a href="/search/cs?searchtype=author&query=Froidevaux%2C+J">Jeremy Froidevaux</a>, <a href="/search/cs?searchtype=author&query=Lostanlen%2C+V">Vincent Lostanlen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.09598v2-abstract-short" style="display: inline;"> Multi-label imbalanced classification poses a significant challenge in machine learning, particularly evident in bioacoustics where animal sounds often co-occur, and certain sounds are much less frequent than others. This paper focuses on the specific case of classifying anuran species sounds using the dataset AnuraSet, that contains both class imbalance and multi-label examples. To address these… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.09598v2-abstract-full').style.display = 'inline'; document.getElementById('2403.09598v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.09598v2-abstract-full" style="display: none;"> Multi-label imbalanced classification poses a significant challenge in machine learning, particularly evident in bioacoustics where animal sounds often co-occur, and certain sounds are much less frequent than others. This paper focuses on the specific case of classifying anuran species sounds using the dataset AnuraSet, that contains both class imbalance and multi-label examples. To address these challenges, we introduce Mixture of Mixups (Mix2), a framework that leverages mixing regularization methods Mixup, Manifold Mixup, and MultiMix. Experimental results show that these methods, individually, may lead to suboptimal results; however, when applied randomly, with one selected at each training iteration, they prove effective in addressing the mentioned challenges, particularly for rare classes with few occurrences. Further analysis reveals that Mix2 is also proficient in classifying sounds across various levels of class co-occurrences. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.09598v2-abstract-full').style.display = 'none'; document.getElementById('2403.09598v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.14213">arXiv:2311.14213</a> <span> [<a href="https://arxiv.org/pdf/2311.14213">pdf</a>, <a href="https://arxiv.org/format/2311.14213">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TASLP.2024.3393738">10.1109/TASLP.2024.3393738 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Learning to Solve Inverse Problems for Perceptual Sound Matching </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+H">Han Han</a>, <a href="/search/cs?searchtype=author&query=Lostanlen%2C+V">Vincent Lostanlen</a>, <a href="/search/cs?searchtype=author&query=Lagrange%2C+M">Mathieu Lagrange</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.14213v2-abstract-short" style="display: inline;"> Perceptual sound matching (PSM) aims to find the input parameters to a synthesizer so as to best imitate an audio target. Deep learning for PSM optimizes a neural network to analyze and reconstruct prerecorded samples. In this context, our article addresses the problem of designing a suitable loss function when the training set is generated by a differentiable synthesizer. Our main contribution is… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.14213v2-abstract-full').style.display = 'inline'; document.getElementById('2311.14213v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.14213v2-abstract-full" style="display: none;"> Perceptual sound matching (PSM) aims to find the input parameters to a synthesizer so as to best imitate an audio target. Deep learning for PSM optimizes a neural network to analyze and reconstruct prerecorded samples. In this context, our article addresses the problem of designing a suitable loss function when the training set is generated by a differentiable synthesizer. Our main contribution is perceptual-neural-physical loss (PNP), which aims at addressing a tradeoff between perceptual relevance and computational efficiency. The key idea behind PNP is to linearize the effect of synthesis parameters upon auditory features in the vicinity of each training sample. 
The linearization procedure is massively paralellizable, can be precomputed, and offers a 100-fold speedup during gradient descent compared to differentiable digital signal processing (DDSP). We demonstrate PNP on two datasets of nonstationary sounds: an AM/FM arpeggiator and a physical model of rectangular membranes. We show that PNP is able to accelerate DDSP with joint time-frequency scattering transform (JTFS) as auditory feature, while preserving its perceptual fidelity. Additionally, we evaluate the impact of other design choices in PSM: parameter rescaling, pretraining, auditory representation, and gradient clipping. We report state-of-the-art results on both datasets and find that PNP-accelerated JTFS has greater influence on PSM performance than any other design choice. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.14213v2-abstract-full').style.display = 'none'; document.getElementById('2311.14213v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.05855">arXiv:2309.05855</a> <span> [<a href="https://arxiv.org/pdf/2309.05855">pdf</a>, <a href="https://arxiv.org/format/2309.05855">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/LSP.2024.3386492">10.1109/LSP.2024.3386492 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Instabilities in Convnets for Raw Audio </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Haider%2C+D">Daniel Haider</a>, <a href="/search/cs?searchtype=author&query=Lostanlen%2C+V">Vincent Lostanlen</a>, <a href="/search/cs?searchtype=author&query=Ehler%2C+M">Martin Ehler</a>, <a href="/search/cs?searchtype=author&query=Balazs%2C+P">Peter Balazs</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.05855v4-abstract-short" style="display: inline;"> What makes waveform-based deep learning so hard? Despite numerous attempts at training convolutional neural networks (convnets) for filterbank design, they often fail to outperform hand-crafted baselines. These baselines are linear time-invariant systems: as such, they can be approximated by convnets with wide receptive fields. 
Yet, in practice, gradient-based optimization leads to suboptimal appr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.05855v4-abstract-full').style.display = 'inline'; document.getElementById('2309.05855v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.05855v4-abstract-full" style="display: none;"> What makes waveform-based deep learning so hard? Despite numerous attempts at training convolutional neural networks (convnets) for filterbank design, they often fail to outperform hand-crafted baselines. These baselines are linear time-invariant systems: as such, they can be approximated by convnets with wide receptive fields. Yet, in practice, gradient-based optimization leads to suboptimal approximations. In our article, we approach this phenomenon from the perspective of initialization. We present a theory of large deviations for the energy response of FIR filterbanks with random Gaussian weights. We find that deviations worsen for large filters and locally periodic input signals, which are both typical for audio signal processing applications. Numerical simulations align with our theory and suggest that the condition number of a convolutional layer follows a logarithmic scaling law between the number and length of the filters, which is reminiscent of discrete wavelet bases. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.05855v4-abstract-full').style.display = 'none'; document.getElementById('2309.05855v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">4 pages, 5 figures, 1 page appendix with mathematical proofs</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> IEEE Signal Processing Letters 31 (2024) 1084-1088 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.13821">arXiv:2307.13821</a> <span> [<a href="https://arxiv.org/pdf/2307.13821">pdf</a>, <a href="https://arxiv.org/format/2307.13821">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Functional Analysis">math.FA</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/waspaa58266.2023.10248131">10.1109/waspaa58266.2023.10248131 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Fitting Auditory Filterbanks with Multiresolution Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lostanlen%2C+V">Vincent Lostanlen</a>, <a href="/search/cs?searchtype=author&query=Haider%2C+D">Daniel Haider</a>, <a href="/search/cs?searchtype=author&query=Han%2C+H">Han Han</a>, <a href="/search/cs?searchtype=author&query=Lagrange%2C+M">Mathieu Lagrange</a>, <a href="/search/cs?searchtype=author&query=Balazs%2C+P">Peter Balazs</a>, <a href="/search/cs?searchtype=author&query=Ehler%2C+M">Martin Ehler</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.13821v1-abstract-short" style="display: inline;"> Waveform-based deep learning faces a dilemma between nonparametric and parametric approaches. On one hand, convolutional neural networks (convnets) may approximate any linear time-invariant system; yet, in practice, their frequency responses become more irregular as their receptive fields grow. On the other hand, a parametric model such as LEAF is guaranteed to yield Gabor filters, hence an optima… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.13821v1-abstract-full').style.display = 'inline'; document.getElementById('2307.13821v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.13821v1-abstract-full" style="display: none;"> Waveform-based deep learning faces a dilemma between nonparametric and parametric approaches. On one hand, convolutional neural networks (convnets) may approximate any linear time-invariant system; yet, in practice, their frequency responses become more irregular as their receptive fields grow. 
On the other hand, a parametric model such as LEAF is guaranteed to yield Gabor filters, hence an optimal time-frequency localization; yet, this strong inductive bias comes at the detriment of representational capacity. In this paper, we aim to overcome this dilemma by introducing a neural audio model, named multiresolution neural network (MuReNN). The key idea behind MuReNN is to train separate convolutional operators over the octave subbands of a discrete wavelet transform (DWT). Since the scale of DWT atoms grows exponentially between octaves, the receptive fields of the subsequent learnable convolutions in MuReNN are dilated accordingly. For a given real-world dataset, we fit the magnitude response of MuReNN to that of a well-established auditory filterbank: Gammatone for speech, CQT for music, and third-octave for urban sounds, respectively. This is a form of knowledge distillation (KD), in which the filterbank ''teacher'' is engineered by domain knowledge while the neural network ''student'' is optimized from data. We compare MuReNN to the state of the art in terms of goodness of fit after KD on a hold-out set and in terms of Heisenberg time-frequency localization. Compared to convnets and Gabor convolutions, we find that MuReNN reaches state-of-the-art performance on all three optimization problems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.13821v1-abstract-full').style.display = 'none'; document.getElementById('2307.13821v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">4 pages, 4 figures, 1 table, conference</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> 2023 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA 2023) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.09223">arXiv:2306.09223</a> <span> [<a href="https://arxiv.org/pdf/2306.09223">pdf</a>, <a href="https://arxiv.org/format/2306.09223">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Few-shot bioacoustic event detection at the DCASE 2023 challenge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Nolasco%2C+I">Ines Nolasco</a>, <a href="/search/cs?searchtype=author&query=Ghani%2C+B">Burooj Ghani</a>, <a href="/search/cs?searchtype=author&query=Singh%2C+S">Shubhr Singh</a>, <a href="/search/cs?searchtype=author&query=Vida%C3%B1a-Vila%2C+E">Ester Vida帽a-Vila</a>, <a href="/search/cs?searchtype=author&query=Whitehead%2C+H">Helen Whitehead</a>, <a href="/search/cs?searchtype=author&query=Grout%2C+E">Emily Grout</a>, <a href="/search/cs?searchtype=author&query=Emmerson%2C+M">Michael Emmerson</a>, <a href="/search/cs?searchtype=author&query=Jensen%2C+F">Frants Jensen</a>, <a href="/search/cs?searchtype=author&query=Kiskin%2C+I">Ivan Kiskin</a>, <a href="/search/cs?searchtype=author&query=Morford%2C+J">Joe Morford</a>, <a href="/search/cs?searchtype=author&query=Strandburg-Peshkin%2C+A">Ariana Strandburg-Peshkin</a>, <a href="/search/cs?searchtype=author&query=Gill%2C+L">Lisa Gill</a>, <a href="/search/cs?searchtype=author&query=Pamu%C5%82a%2C+H">Hanna Pamu艂a</a>, <a href="/search/cs?searchtype=author&query=Lostanlen%2C+V">Vincent Lostanlen</a>, <a href="/search/cs?searchtype=author&query=Stowell%2C+D">Dan Stowell</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.09223v1-abstract-short" style="display: inline;"> Few-shot bioacoustic event detection consists in detecting sound events of specified types, in varying soundscapes, while having access to only a few examples of the class of interest. This task ran as part of the DCASE challenge for the third time this year with an evaluation set expanded to include new animal species, and a new rule: ensemble models were no longer allowed. 
The 2023 few shot task… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.09223v1-abstract-full').style.display = 'inline'; document.getElementById('2306.09223v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.09223v1-abstract-full" style="display: none;"> Few-shot bioacoustic event detection consists in detecting sound events of specified types, in varying soundscapes, while having access to only a few examples of the class of interest. This task ran as part of the DCASE challenge for the third time this year with an evaluation set expanded to include new animal species, and a new rule: ensemble models were no longer allowed. The 2023 few shot task received submissions from 6 different teams with F-scores reaching as high as 63% on the evaluation set. Here we describe the task, focusing on describing the elements that differed from previous years. We also take a look back at past editions to describe how the task has evolved. Not only have the F-score results steadily improved (40% to 60% to 63%), but the type of systems proposed have also become more complex. Sound event detection systems are no longer simple variations of the baselines provided: multiple few-shot learning methodologies are still strong contenders for the task. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.09223v1-abstract-full').style.display = 'none'; document.getElementById('2306.09223v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">submitted to DCASE 2023 workshop</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.13210">arXiv:2305.13210</a> <span> [<a href="https://arxiv.org/pdf/2305.13210">pdf</a>, <a href="https://arxiv.org/format/2305.13210">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1016/j.ecoinf.2023.102258">10.1016/j.ecoinf.2023.102258 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Learning to detect an animal sound from five examples </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Nolasco%2C+I">In锚s Nolasco</a>, <a href="/search/cs?searchtype=author&query=Singh%2C+S">Shubhr Singh</a>, <a href="/search/cs?searchtype=author&query=Morfi%2C+V">Veronica Morfi</a>, <a href="/search/cs?searchtype=author&query=Lostanlen%2C+V">Vincent Lostanlen</a>, <a href="/search/cs?searchtype=author&query=Strandburg-Peshkin%2C+A">Ariana Strandburg-Peshkin</a>, <a href="/search/cs?searchtype=author&query=Vida%C3%B1a-Vila%2C+E">Ester Vida帽a-Vila</a>, <a href="/search/cs?searchtype=author&query=Gill%2C+L">Lisa Gill</a>, <a href="/search/cs?searchtype=author&query=Pamu%C5%82a%2C+H">Hanna Pamu艂a</a>, <a href="/search/cs?searchtype=author&query=Whitehead%2C+H">Helen Whitehead</a>, <a href="/search/cs?searchtype=author&query=Kiskin%2C+I">Ivan Kiskin</a>, <a href="/search/cs?searchtype=author&query=Jensen%2C+F+H">Frants H. Jensen</a>, <a href="/search/cs?searchtype=author&query=Morford%2C+J">Joe Morford</a>, <a href="/search/cs?searchtype=author&query=Emmerson%2C+M+G">Michael G. Emmerson</a>, <a href="/search/cs?searchtype=author&query=Versace%2C+E">Elisabetta Versace</a>, <a href="/search/cs?searchtype=author&query=Grout%2C+E">Emily Grout</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Haohe Liu</a>, <a href="/search/cs?searchtype=author&query=Stowell%2C+D">Dan Stowell</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.13210v1-abstract-short" style="display: inline;"> Automatic detection and classification of animal sounds has many applications in biodiversity monitoring and animal behaviour. In the past twenty years, the volume of digitised wildlife sound available has massively increased, and automatic classification through deep learning now shows strong results. 
However, bioacoustics is not a single task but a vast range of small-scale tasks (such as indivi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.13210v1-abstract-full').style.display = 'inline'; document.getElementById('2305.13210v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.13210v1-abstract-full" style="display: none;"> Automatic detection and classification of animal sounds has many applications in biodiversity monitoring and animal behaviour. In the past twenty years, the volume of digitised wildlife sound available has massively increased, and automatic classification through deep learning now shows strong results. However, bioacoustics is not a single task but a vast range of small-scale tasks (such as individual ID, call type, emotional indication) with wide variety in data characteristics, and most bioacoustic tasks do not come with strongly-labelled training data. The standard paradigm of supervised learning, focussed on a single large-scale dataset and/or a generic pre-trained algorithm, is insufficient. In this work we recast bioacoustic sound event detection within the AI framework of few-shot learning. We adapt this framework to sound event detection, such that a system can be given the annotated start/end times of as few as 5 events, and can then detect events in long-duration audio -- even when the sound category was not known at the time of algorithm training. We introduce a collection of open datasets designed to strongly test a system's ability to perform few-shot sound event detections, and we present the results of a public contest to address the task. We show that prototypical networks are a strong-performing method, when enhanced with adaptations for general characteristics of animal sounds. We demonstrate that widely-varying sound event durations are an important factor in performance, as well as non-stationarity, i.e. gradual changes in conditions throughout the duration of a recording. For fine-grained bioacoustic recognition tasks without massive annotated training data, our results demonstrate that few-shot sound event detection is a powerful new method, strongly outperforming traditional signal-processing detection methods in the fully automated scenario. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.13210v1-abstract-full').style.display = 'none'; document.getElementById('2305.13210v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2301.10183">arXiv:2301.10183</a> <span> [<a href="https://arxiv.org/pdf/2301.10183">pdf</a>, <a href="https://arxiv.org/format/2301.10183">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Mesostructures: Beyond Spectrogram Loss in Differentiable Time-Frequency Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Vahidi%2C+C">Cyrus Vahidi</a>, <a href="/search/cs?searchtype=author&query=Han%2C+H">Han Han</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Changhong Wang</a>, <a href="/search/cs?searchtype=author&query=Lagrange%2C+M">Mathieu Lagrange</a>, <a href="/search/cs?searchtype=author&query=Fazekas%2C+G">Gy枚rgy Fazekas</a>, <a href="/search/cs?searchtype=author&query=Lostanlen%2C+V">Vincent Lostanlen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2301.10183v1-abstract-short" style="display: inline;"> Computer musicians refer to mesostructures as the intermediate levels of articulation between the microstructure of waveshapes and the macrostructure of musical forms. Examples of mesostructures include melody, arpeggios, syncopation, polyphonic grouping, and textural contrast. Despite their central role in musical expression, they have received limited attention in deep learning. Currently, autoe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.10183v1-abstract-full').style.display = 'inline'; document.getElementById('2301.10183v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2301.10183v1-abstract-full" style="display: none;"> Computer musicians refer to mesostructures as the intermediate levels of articulation between the microstructure of waveshapes and the macrostructure of musical forms. Examples of mesostructures include melody, arpeggios, syncopation, polyphonic grouping, and textural contrast. Despite their central role in musical expression, they have received limited attention in deep learning. Currently, autoencoders and neural audio synthesizers are only trained and evaluated at the scale of microstructure: i.e., local amplitude variations up to 100 milliseconds or so. In this paper, we formulate and address the problem of mesostructural audio modeling via a composition of a differentiable arpeggiator and time-frequency scattering. We empirically demonstrate that time--frequency scattering serves as a differentiable model of similarity between synthesis parameters that govern mesostructure. By exposing the sensitivity of short-time spectral distances to time alignment, we motivate the need for a time-invariant and multiscale differentiable time--frequency model of similarity at the level of both local spectra and spectrotemporal modulations. 
arXiv:2301.02886 [cs.SD, cs.LG, eess.AS] https://arxiv.org/abs/2301.02886
Title: Perceptual-Neural-Physical Sound Matching
Authors: Han Han, Vincent Lostanlen, Mathieu Lagrange
Abstract: Sound matching algorithms seek to approximate a target waveform by parametric audio synthesis. Deep neural networks have achieved promising results in matching sustained harmonic tones. However, the task is more challenging when targets are nonstationary and inharmonic, e.g., percussion. We attribute this problem to the inadequacy of the loss function. On one hand, mean square error in the parametric domain, known as "P-loss", is simple and fast but fails to accommodate the differing perceptual significance of each parameter. On the other hand, mean square error in the spectrotemporal domain, known as "spectral loss", is perceptually motivated and serves in differentiable digital signal processing (DDSP). Yet, spectral loss is a poor predictor of pitch intervals and its gradient may be computationally expensive; hence slow convergence. Against this conundrum, we present Perceptual-Neural-Physical loss (PNP). PNP is the optimal quadratic approximation of spectral loss while being as fast as P-loss during training. We instantiate PNP with physical modeling synthesis as decoder and joint time-frequency scattering transform (JTFS) as spectral representation. We demonstrate its potential on matching synthetic drum sounds in comparison with other loss functions.
Submitted 13 March, 2023; v1 submitted 7 January, 2023; originally announced January 2023.
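The stated property of PNP, a quadratic approximation of spectral loss that trains as fast as P-loss, suggests a Gauss-Newton-style construction: precompute the Jacobian of the spectral representation composed with the synthesizer at the target parameters and reuse the resulting quadratic form. The sketch below illustrates that construction with toy stand-ins for the decoder and for JTFS; it is not the authors' implementation.

```python
# Hedged sketch of a quadratic surrogate for spectral loss: precompute the
# Jacobian J of (spectral representation o synthesizer) at the ground-truth
# parameters, then use the quadratic form d.T @ (J.T @ J) @ d as a fast proxy.
# `synth` and `spectral_rep` are placeholders, not the paper's physical model
# or JTFS implementation.
import torch
from torch.autograd.functional import jacobian

def synth(theta):                       # toy "physical" decoder: damped sine
    t = torch.linspace(0, 1, 4000)
    return torch.exp(-theta[1] * t) * torch.sin(2 * torch.pi * theta[0] * t)

def spectral_rep(x):                    # toy stand-in for JTFS: magnitude rFFT
    return torch.fft.rfft(x).abs()

theta_true = torch.tensor([220.0, 3.0])
J = jacobian(lambda th: spectral_rep(synth(th)), theta_true)   # (n_bins, 2)
M = J.T @ J                             # metric defining the quadratic form

def pnp_like_loss(theta_pred, theta_true, M):
    d = theta_pred - theta_true
    return d @ M @ d                    # as cheap as P-loss once M is cached

print(pnp_like_loss(torch.tensor([221.0, 3.1]), theta_true, M))
```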
arXiv:2207.07911 [cs.SD, cs.LG, eess.AS] https://arxiv.org/abs/2207.07911
Title: Few-shot bioacoustic event detection at the DCASE 2022 challenge
Authors: I. Nolasco, S. Singh, E. Vidana-Villa, E. Grout, J. Morford, M. Emmerson, F. Jensens, H. Whitehead, I. Kiskin, A. Strandburg-Peshkin, L. Gill, H. Pamula, V. Lostanlen, V. Morfi, D. Stowell
Abstract: Few-shot sound event detection is the task of detecting sound events, despite having only a few labelled examples of the class of interest. This framework is particularly useful in bioacoustics, where often there is a need to annotate very long recordings but the expert annotator time is limited.
This paper presents an overview of the second edition of the few-shot bioacoustic sound event detection task included in the DCASE 2022 challenge. A detailed description of the task objectives, dataset, and baselines is presented, together with the main results obtained and characteristics of the submitted systems. This task received submissions from 15 different teams, of which 13 scored higher than the baselines. The highest F-score was 60% on the evaluation set, a large improvement over last year's edition. Highly-performing methods made use of prototypical networks, transductive learning, and addressed the variable length of events from all target classes. Furthermore, by analysing results on each of the subsets we can identify the main difficulties that the systems face, and conclude that few-shot bioacoustic sound event detection remains an open challenge.
Submitted 14 July, 2022; originally announced July 2022.
Comments: submitted to DCASE2022 workshop
arXiv:2204.08269 [cs.SD, cs.LG, eess.AS] https://arxiv.org/abs/2204.08269
Title: Differentiable Time-Frequency Scattering on GPU
Authors: John Muradeli, Cyrus Vahidi, Changhong Wang, Han Han, Vincent Lostanlen, Mathieu Lagrange, George Fazekas
Abstract: Joint time-frequency scattering (JTFS) is a convolutional operator in the time-frequency domain which extracts spectrotemporal modulations at various rates and scales. It offers an idealized model of spectrotemporal receptive fields (STRF) in the primary auditory cortex, and thus may serve as a biologically plausible surrogate for human perceptual judgments at the scale of isolated audio events. Yet, prior implementations of JTFS and STRF have remained outside of the standard toolkit of perceptual similarity measures and evaluation methods for audio generation. We trace this issue down to three limitations: differentiability, speed, and flexibility. In this paper, we present an implementation of time-frequency scattering in Python. Unlike prior implementations, ours accommodates NumPy, PyTorch, and TensorFlow as backends and is thus portable on both CPU and GPU. We demonstrate the usefulness of JTFS via three applications: unsupervised manifold learning of spectrotemporal modulations, supervised classification of musical instruments, and texture resynthesis of bioacoustic sounds.
Submitted 19 July, 2022; v1 submitted 18 April, 2022; originally announced April 2022.
Comments: 8 pages, 6 figures. Submitted to the International Conference on Digital Audio Effects (DAFX) 2022
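Because the implementation described here exposes PyTorch as one of its backends, scattering coefficients can sit directly inside a differentiable objective. The snippet below sketches that pattern with Kymatio's Scattering1D (assumed installed) as a stand-in for the joint time-frequency operator, whose own frontend may expose a different constructor.

```python
# Illustrative use of a differentiable scattering transform as an audio
# similarity measure on GPU. Kymatio's Scattering1D is used here as a
# convenient stand-in; the paper's operator is the *joint* time-frequency
# scattering, whose exact API may differ.
import torch
from kymatio.torch import Scattering1D

T = 2 ** 14
device = "cuda" if torch.cuda.is_available() else "cpu"
scattering = Scattering1D(J=8, shape=(T,), Q=12).to(device)

x = torch.randn(1, T, device=device)                        # reference signal
y = torch.randn(1, T, device=device, requires_grad=True)    # signal under optimization

Sx, Sy = scattering(x), scattering(y)
loss = torch.mean((Sx - Sy) ** 2)                           # scattering-domain distance
loss.backward()                                             # gradients flow back to y
print(loss.item(), y.grad.abs().mean().item())
```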
arXiv:2010.00673 [eess.AS, cs.SD] https://arxiv.org/abs/2010.00673
Title: Helicality: An Isomap-based Measure of Octave Equivalence in Audio Data
Authors: Sripathi Sridhar, Vincent Lostanlen
Abstract: Octave equivalence serves as domain-knowledge in MIR systems, including chromagram, spiral convolutional networks, and harmonic CQT. Prior work has applied the Isomap manifold learning algorithm to unlabeled audio data to embed frequency sub-bands in 3-D space where the Euclidean distances are inversely proportional to the strength of their Pearson correlations. However, discovering octave equivalence via Isomap requires visual inspection and is not scalable. To address this problem, we define "helicality" as the goodness of fit of the 3-D Isomap embedding to a Shepard-Risset helix. Our method is unsupervised and uses a custom Frank-Wolfe algorithm to minimize a least-squares objective inside a convex hull. Numerical experiments indicate that isolated musical notes have a higher helicality than speech, followed by drum hits.
Submitted 1 October, 2020; originally announced October 2020.
Comments: 3 pages, 3 figures. To be presented at the 21st International Society for Music Information Retrieval (ISMIR) Conference. Montreal, Canada, October 2020
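A stripped-down version of the helicality measurement can be prototyped by fitting a parametric helix to the 3-D embedding with ordinary nonlinear least squares, as sketched below. The paper's actual solver is a custom Frank-Wolfe algorithm constrained to a convex hull, so this is only a rough proxy.

```python
# Simplified sketch of a "helicality" score: fit a helix to a 3-D embedding of
# frequency sub-bands by nonlinear least squares and report the residual.
# scipy.optimize.least_squares is a simpler stand-in for the paper's solver.
import numpy as np
from scipy.optimize import least_squares

def helix(params, k):
    r, omega, phi, c, z0 = params
    return np.stack([r * np.cos(omega * k + phi),
                     r * np.sin(omega * k + phi),
                     c * k + z0], axis=1)

def helicality(embedding):
    """embedding: (n_subbands, 3) array, e.g. an Isomap output."""
    k = np.arange(len(embedding), dtype=float)
    res = least_squares(lambda p: (helix(p, k) - embedding).ravel(),
                        x0=[1.0, 0.5, 0.0, 0.1, 0.0])
    return -res.cost     # higher (less negative) means more helix-like

# Toy check: a noisy helix scores higher than Gaussian noise.
k = np.arange(36, dtype=float)
clean = helix([1.0, 2 * np.pi / 12, 0.0, 0.05, 0.0], k)
print(helicality(clean + 0.01 * np.random.randn(36, 3)),
      helicality(np.random.randn(36, 3)))
```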
arXiv:2009.09321 [cs.LG, cs.AI, cs.CV, cs.SD, stat.ML] https://arxiv.org/abs/2009.09321
Title: Learning a Lie Algebra from Unlabeled Data Pairs
Authors: Christopher Ick, Vincent Lostanlen
Abstract: Deep convolutional networks (convnets) show a remarkable ability to learn disentangled representations. In recent years, the generalization of deep learning to Lie groups beyond rigid motion in $\mathbb{R}^n$ has made it possible to build convnets over datasets with non-trivial symmetries, such as patterns over the surface of a sphere.
However, one limitation of this approach is the need to explicitly define the Lie group underlying the desired invariance property before training the convnet. Whereas rotations on the sphere have a well-known symmetry group ($\mathrm{SO}(3)$), the same cannot be said of many real-world factors of variability. For example, the disentanglement of pitch, intensity dynamics, and playing technique remains a challenging task in music information retrieval. This article proposes a machine learning method to discover a nonlinear transformation of the space $\mathbb{R}^n$ which maps a collection of $n$-dimensional vectors $(\boldsymbol{x}_i)_i$ onto a collection of target vectors $(\boldsymbol{y}_i)_i$. The key idea is to approximate every target $\boldsymbol{y}_i$ by a matrix--vector product of the form $\boldsymbol{\widetilde{y}}_i = \boldsymbol{\phi}(t_i) \boldsymbol{x}_i$, where the matrix $\boldsymbol{\phi}(t_i)$ belongs to a one-parameter subgroup of $\mathrm{GL}_n(\mathbb{R})$. Crucially, the value of the parameter $t_i \in \mathbb{R}$ may change between data pairs $(\boldsymbol{x}_i, \boldsymbol{y}_i)$ and does not need to be known in advance.
Submitted 12 November, 2020; v1 submitted 19 September, 2020; originally announced September 2020.
Comments: 2 pages, 1 figure. Presented at the first DeepMath conference, New York City, NY, USA, November 2020
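The formulation above translates directly into a small optimization problem: parameterize $\boldsymbol{\phi}(t) = \exp(t\boldsymbol{A})$ with a learnable generator $\boldsymbol{A}$ and learnable per-pair scalars $t_i$, then minimize the reconstruction error. The sketch below does this on synthetic pairs; it illustrates the formulation only and is not the authors' fitting procedure.

```python
# Hedged sketch of the one-parameter-subgroup idea: phi(t) = exp(t * A) with a
# learnable matrix A (an element of the Lie algebra) and a learnable scalar t_i
# per data pair, fit jointly by gradient descent on synthetic data.
import torch

n, n_pairs = 4, 64
x = torch.randn(n_pairs, n)
# Synthetic targets generated by a hidden one-parameter subgroup.
A_true = torch.randn(n, n) * 0.3
t_true = torch.rand(n_pairs)
y = torch.stack([torch.linalg.matrix_exp(ti * A_true) @ xi for ti, xi in zip(t_true, x)])

A = torch.zeros(n, n, requires_grad=True)          # Lie algebra generator
t = torch.zeros(n_pairs, requires_grad=True)       # per-pair parameter
opt = torch.optim.Adam([A, t], lr=1e-2)

for step in range(2000):
    phi = torch.linalg.matrix_exp(t[:, None, None] * A)   # (n_pairs, n, n)
    y_hat = torch.einsum("bij,bj->bi", phi, x)
    loss = torch.mean((y_hat - y) ** 2)
    opt.zero_grad(); loss.backward(); opt.step()
print(loss.item())
```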
arXiv:2009.05188 [cs.SD, cs.LG, eess.AS] https://arxiv.org/abs/2009.05188
Title: SONYC-UST-V2: An Urban Sound Tagging Dataset with Spatiotemporal Context
Authors: Mark Cartwright, Jason Cramer, Ana Elisa Mendez Mendez, Yu Wang, Ho-Hsiang Wu, Vincent Lostanlen, Magdalena Fuentes, Graham Dove, Charlie Mydlarz, Justin Salamon, Oded Nov, Juan Pablo Bello
Abstract: We present SONYC-UST-V2, a dataset for urban sound tagging with spatiotemporal information. This dataset is aimed at the development and evaluation of machine listening systems for real-world urban noise monitoring. While datasets of urban recordings are available, this dataset provides the opportunity to investigate how spatiotemporal metadata can aid in the prediction of urban sound tags. SONYC-UST-V2 consists of 18510 audio recordings from the "Sounds of New York City" (SONYC) acoustic sensor network, including the timestamp of audio acquisition and location of the sensor. The dataset contains annotations by volunteers from the Zooniverse citizen science platform, as well as a two-stage verification with our team. In this article, we describe our data collection procedure and propose evaluation metrics for multilabel classification of urban sound tags. We report the results of a simple baseline model that exploits spatiotemporal information.
Submitted 10 September, 2020; originally announced September 2020.
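For readers who want a quick baseline evaluation on a multilabel tagging dataset of this kind, a common starting point is per-tag average precision and its macro average, as computed below with scikit-learn. The paper defines its own evaluation protocol, which this generic metric does not reproduce.

```python
# Generic example of multilabel evaluation for urban sound tagging: per-tag
# average precision (area under the precision-recall curve) and its macro
# average. Data here are random placeholders.
import numpy as np
from sklearn.metrics import average_precision_score

rng = np.random.default_rng(0)
n_clips, n_tags = 200, 8
y_true = rng.integers(0, 2, size=(n_clips, n_tags))        # multilabel ground truth
y_score = np.clip(y_true + 0.4 * rng.standard_normal((n_clips, n_tags)), 0, 1)

per_tag_ap = [average_precision_score(y_true[:, k], y_score[:, k])
              for k in range(n_tags)]
print("macro AP:", float(np.mean(per_tag_ap)))
```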
arXiv:2007.10926 [cs.SD, cs.LG, eess.AS] https://arxiv.org/abs/2007.10926
Title: Time-Frequency Scattering Accurately Models Auditory Similarities Between Instrumental Playing Techniques
Authors: Vincent Lostanlen, Christian El-Hajj, Mathias Rossignol, Grégoire Lafay, Joakim Andén, Mathieu Lagrange
Abstract: Instrumental playing techniques such as vibratos, glissandos, and trills often denote musical expressivity, both in classical and folk contexts. However, most existing approaches to music similarity retrieval fail to describe timbre beyond the so-called "ordinary" technique, use instrument identity as a proxy for timbre quality, and do not allow for customization to the perceptual idiosyncrasies of a new subject. In this article, we ask 31 human subjects to organize 78 isolated notes into a set of timbre clusters. Analyzing their responses suggests that timbre perception operates within a more flexible taxonomy than those provided by instruments or playing techniques alone. In addition, we propose a machine listening model to recover the cluster graph of auditory similarities across instruments, mutes, and techniques. Our model relies on joint time--frequency scattering features to extract spectrotemporal modulations as acoustic features. Furthermore, it minimizes triplet loss in the cluster graph by means of the large-margin nearest neighbor (LMNN) metric learning algorithm. Over a dataset of 9346 isolated notes, we report a state-of-the-art average precision at rank five (AP@5) of $99.0\%\pm1$. An ablation study demonstrates that removing either the joint time--frequency scattering transform or the metric learning algorithm noticeably degrades performance.
Submitted 10 November, 2020; v1 submitted 21 July, 2020; originally announced July 2020.
Comments: 32 pages, 5 figures. To appear in EURASIP Journal on Audio, Speech, and Music Processing (JASMP)
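The metric-learning step can be approximated, for illustration, by a triplet-margin objective over precomputed scattering features: learn a linear map under which notes from the same perceptual cluster are closer than notes from different clusters. The sketch below shows that generic formulation; the paper itself uses the LMNN algorithm.

```python
# Simplified sketch of metric learning on precomputed scattering features with
# a triplet margin loss: learn a linear map W so that same-cluster notes are
# closer than different-cluster notes. Dimensions and data are placeholders.
import torch

d_in, d_out = 512, 64
W = torch.nn.Linear(d_in, d_out, bias=False)
criterion = torch.nn.TripletMarginLoss(margin=1.0)
opt = torch.optim.SGD(W.parameters(), lr=1e-3)

def training_step(anchor, positive, negative):
    """Each argument: (batch, d_in) scattering features; positive shares the
    anchor's timbre cluster, negative does not."""
    loss = criterion(W(anchor), W(positive), W(negative))
    opt.zero_grad(); loss.backward(); opt.step()
    return loss.item()

# Toy batch of random features standing in for real scattering coefficients.
a, p, n = (torch.randn(32, d_in) for _ in range(3))
print(training_step(a, p, n))
```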
arXiv:2007.10299 [cs.SD, cs.LG, eess.AS] https://arxiv.org/abs/2007.10299
Title: wav2shape: Hearing the Shape of a Drum Machine
Authors: Han Han, Vincent Lostanlen
Abstract: Disentangling and recovering physical attributes, such as shape and material, from a few waveform examples is a challenging inverse problem in audio signal processing, with numerous applications in musical acoustics as well as structural engineering. We propose to address this problem via a combination of time--frequency analysis and supervised machine learning.
We start by synthesizing a dataset of sounds using the functional transformation method. Then, we represent each percussive sound in terms of its time-invariant scattering transform coefficients and formulate the parametric estimation of the resonator as multidimensional regression with a deep convolutional neural network. We interpolate scattering coefficients over the surface of the drum as a surrogate for potentially missing data, and study the response of the neural network to interpolated samples. Lastly, we resynthesize drum sounds from scattering coefficients, therefore paving the way towards a deep generative model of drum sounds whose latent variables are physically interpretable.
Submitted 20 July, 2020; originally announced July 2020.
Comments: 11 pages, 7 figures. To appear in the Proceedings of Forum Acusticum, Lyon (France), December 2020
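The regression stage amounts to mapping a fixed-length vector of scattering coefficients to a handful of physical parameters. A minimal supervised sketch of that mapping is given below with placeholder dimensions and a fully connected network; the paper specifies its own convolutional architecture and synthetic dataset.

```python
# Illustrative regressor from time-invariant scattering coefficients to
# physical drum parameters (e.g., pitch, damping, stiffness). Architecture,
# dimensions, and data are placeholders.
import torch
from torch import nn

n_coeffs, n_params = 435, 5     # scattering vector size and number of physical parameters

model = nn.Sequential(
    nn.Linear(n_coeffs, 256), nn.ReLU(),
    nn.Linear(256, 128), nn.ReLU(),
    nn.Linear(128, n_params),
)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)

# Toy data standing in for (scattering coefficients, ground-truth parameters).
S = torch.randn(512, n_coeffs)
theta = torch.rand(512, n_params)

for epoch in range(10):
    pred = model(S)
    loss = nn.functional.mse_loss(pred, theta)
    opt.zero_grad(); loss.backward(); opt.step()
print(loss.item())
```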
arXiv:2007.00763 [cs.SD, eess.AS] https://arxiv.org/abs/2007.00763
Title: OrchideaSOL: a dataset of extended instrumental techniques for computer-aided orchestration
Authors: Carmine Emanuele Cella, Daniele Ghisi, Vincent Lostanlen, Fabien Lévy, Joshua Fineberg, Yan Maresz
Abstract: This paper introduces OrchideaSOL, a free dataset of samples of extended instrumental playing techniques, designed to be used as the default dataset for the Orchidea framework for target-based computer-aided orchestration. OrchideaSOL is a reduced and modified subset of Studio On Line, or SOL for short, a dataset developed at Ircam between 1996 and 1998. We motivate the reasons behind OrchideaSOL and describe the differences between the original SOL and our dataset. We will also show the work done in improving the dynamic ranges of orchestral families and other aspects of the data.
Submitted 1 July, 2020; originally announced July 2020.
Comments: 6 pages, 6 figures, in English. To appear in the proceedings of the International Computer Music Conference (ICMC 2020). Please visit: https://icmc2020.org/
arXiv:2003.01037 [cs.SD, cs.LG, eess.AS] https://arxiv.org/abs/2003.01037
Title: One or Two Components? The Scattering Transform Answers
Authors: Vincent Lostanlen, Alice Cohen-Hadria, Juan Pablo Bello
Abstract: With the aim of constructing a biologically plausible model of machine listening, we study the representation of a multicomponent stationary signal by a wavelet scattering network. First, we show that renormalizing second-order nodes by their first-order parents gives a simple numerical criterion to assess whether two neighboring components will interfere psychoacoustically. Secondly, we run a manifold learning algorithm (Isomap) on scattering coefficients to visualize the similarity space underlying parametric additive synthesis. Thirdly, we generalize the "one or two components" framework to three sine waves or more, and prove that the effective scattering depth of a Fourier series grows in logarithmic proportion to its bandwidth.
Submitted 25 June, 2020; v1 submitted 2 March, 2020; originally announced March 2020.
Comments: 5 pages, 4 figures, in English. Proceedings of the European Signal Processing Conference (EUSIPCO 2020)
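The renormalization criterion from the first contribution can be written in a few lines once first- and second-order scattering coefficients are available: divide each second-order node by its first-order parent and threshold the ratio. The sketch below assumes precomputed coefficient arrays and a hypothetical parent-index array; the indexing convention depends on the scattering implementation.

```python
# Sketch of the renormalization idea: divide each second-order scattering
# coefficient by its first-order "parent" to obtain a dimensionless ratio that
# can be thresholded as an interference criterion. The arrays below stand in
# for precomputed scattering outputs (e.g., from Kymatio); `parent[j2]` is a
# hypothetical index of the first-order node feeding second-order channel j2.
import numpy as np

def renormalize_second_order(S1, S2, parent, eps=1e-12):
    """S1: (n1,) first-order coefficients; S2: (n2,) second-order coefficients;
    parent: (n2,) index of each second-order node's first-order parent."""
    return S2 / (S1[parent] + eps)

# Toy values: a ratio close to 0 suggests a single quasi-stationary component,
# whereas a large ratio reveals amplitude modulation, i.e. beating between
# two neighboring components.
S1 = np.array([1.0, 0.8, 0.5])
S2 = np.array([0.02, 0.30, 0.01, 0.25])
parent = np.array([0, 0, 1, 2])
print(renormalize_second_order(S1, S2, parent))
```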
arXiv:1911.00417 [cs.SD, cs.LG, eess.AS] https://arxiv.org/abs/1911.00417
DOI: https://doi.org/10.33682/ts6e-sn53
Title: Long-distance Detection of Bioacoustic Events with Per-channel Energy Normalization
Authors: Vincent Lostanlen, Kaitlin Palmer, Elly Knight, Christopher Clark, Holger Klinck, Andrew Farnsworth, Tina Wong, Jason Cramer, Juan Pablo Bello
Abstract: This paper proposes to perform unsupervised detection of bioacoustic events by pooling the magnitudes of spectrogram frames after per-channel energy normalization (PCEN). Although PCEN was originally developed for speech recognition, it also has beneficial effects in enhancing animal vocalizations, despite the presence of atmospheric absorption and intermittent noise. We prove that PCEN generalizes logarithm-based spectral flux, yet with a tunable time scale for background noise estimation. In comparison with pointwise logarithm, PCEN reduces false alarm rate by 50x in the near field and 5x in the far field, both on avian and marine bioacoustic datasets. Such improvements come at moderate computational cost and require no human intervention, thus heralding a promising future for PCEN in bioacoustics.
Submitted 1 November, 2019; originally announced November 2019.
Comments: 5 pages, 3 figures. Presented at the 3rd International Workshop on Detection and Classification of Acoustic Scenes and Events (DCASE). 25--26 October 2019, New York, NY, USA
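A minimal version of this detection front end can be assembled with librosa (assumed installed): compute a mel spectrogram, apply librosa.pcen, pool over frequency, and threshold the resulting detection function. Hyperparameters below are illustrative, not the paper's settings.

```python
# Minimal sketch of unsupervised event detection by pooling PCEN magnitudes.
# The example file, mel bands, hop length, and threshold are placeholders and
# would need tuning on real field recordings.
import numpy as np
import librosa

y, sr = librosa.load(librosa.ex("trumpet"))          # stand-in for a field recording
S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, hop_length=512, power=1.0)
P = librosa.pcen(S * (2 ** 31), sr=sr, hop_length=512)   # PCEN over magnitude input

detection_function = P.max(axis=0)                   # pool over frequency channels
threshold = np.median(detection_function) + 3 * detection_function.std()
event_frames = np.flatnonzero(detection_function > threshold)
print(librosa.frames_to_time(event_frames, sr=sr, hop_length=512)[:10])
```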
arXiv:1910.10246 [cs.SD, cs.LG, eess.AS] https://arxiv.org/abs/1910.10246
Title: Learning the helix topology of musical pitch
Authors: Vincent Lostanlen, Sripathi Sridhar, Brian McFee, Andrew Farnsworth, Juan Pablo Bello
Abstract: To explain the consonance of octaves, music psychologists represent pitch as a helix where azimuth and axial coordinate correspond to pitch class and pitch height respectively. This article addresses the problem of discovering this helical structure from unlabeled audio data.
We measure Pearson correlations in the constant-Q transform (CQT) domain to build a K-nearest neighbor graph between frequency subbands. Then, we run the Isomap manifold learning algorithm to represent this graph in a three-dimensional space in which straight lines approximate graph geodesics. Experiments on isolated musical notes demonstrate that the resulting manifold resembles a helix which makes a full turn at every octave. A circular shape is also found in English speech, but not in urban noise. We discuss the impact of various design choices on the visualization: instrumentarium, loudness mapping function, and number of neighbors K.
Submitted 4 February, 2020; v1 submitted 22 October, 2019; originally announced October 2019.
Comments: 5 pages, 6 figures. To appear in the Proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP). Barcelona, Spain, May 2020
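The pipeline described above can be prototyped with librosa and scikit-learn: compute CQT magnitudes, compare sub-bands by correlation, and embed them with Isomap. In the sketch below, each sub-band is z-scored so that Euclidean nearest neighbors approximate correlation-based ones, a simplification of the paper's K-NN graph construction.

```python
# Sketch of the pipeline: CQT magnitudes -> correlation-like distances between
# frequency subbands -> Isomap embedding in 3-D. The example audio and the
# number of neighbors are placeholders.
import numpy as np
import librosa
from sklearn.manifold import Isomap

y, sr = librosa.load(librosa.ex("trumpet"))
C = np.abs(librosa.cqt(y, sr=sr, n_bins=84, bins_per_octave=12))   # (84, frames)

# Z-score each subband so Euclidean distance tracks Pearson correlation.
Z = (C - C.mean(axis=1, keepdims=True)) / (C.std(axis=1, keepdims=True) + 1e-12)
embedding = Isomap(n_neighbors=3, n_components=3).fit_transform(Z)  # (84, 3)
print(embedding.shape)
```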
arXiv:1906.09334 [cs.SD, cs.MM, eess.AS] https://arxiv.org/abs/1906.09334
Title: The Shape of RemiXXXes to Come: Audio Texture Synthesis with Time-frequency Scattering
Authors: Vincent Lostanlen, Florian Hecker
Abstract: This article explains how to apply time--frequency scattering, a convolutional operator extracting modulations in the time--frequency domain at different rates and scales, to the re-synthesis and manipulation of audio textures. After implementing phase retrieval in the scattering network by gradient backpropagation, we introduce scale--rate DAFx, a class of audio transformations expressed in the domain of time--frequency scattering coefficients. One example of scale--rate DAFx is chirp rate inversion, which causes each sonic event to be locally reversed in time while leaving the arrow of time globally unchanged. Over the past two years, our work has led to the creation of four electroacoustic pieces: "FAVN"; "Modulator (Scattering Transform)"; "Experimental Palimpsest"; "Inspection"; and a remix of Lorenzo Senni's "XAllegroX", released by Warp Records on a vinyl entitled "The Shape of RemiXXXes to Come". The source code to reproduce experiments and figures is made freely available at: https://github.com/lostanlen/scattering.m. A companion website containing demos is at: https://lostanlen.com/pubs/dafx2019
Submitted 29 June, 2019; v1 submitted 21 June, 2019; originally announced June 2019.
Comments: 8 pages, 3 figures. To appear in the proceedings of the International Conference on Digital Audio Effects (DAFX-19), Birmingham, UK
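Resynthesis by gradient backpropagation follows a generic recipe: fix the target's representation, then optimize a waveform initialized from noise so that its own representation matches it. The sketch below shows that loop in PyTorch with an STFT magnitude standing in for time-frequency scattering, which the paper implements in MATLAB (scattering.m).

```python
# Sketch of resynthesis by gradient backpropagation: optimize a waveform so
# that its representation matches that of a target texture. A differentiable
# STFT magnitude stands in for time-frequency scattering; the target here is
# random noise used only to show the optimization loop.
import torch

def representation(x):
    window = torch.hann_window(1024, device=x.device)
    return torch.stft(x, n_fft=1024, hop_length=256, window=window,
                      return_complex=True).abs()

target = torch.randn(2 ** 14)                          # stand-in for a recorded texture
S_target = representation(target).detach()

x = 1e-2 * torch.randn(2 ** 14, requires_grad=True)    # start from low-amplitude noise
opt = torch.optim.Adam([x], lr=1e-2)
for step in range(500):
    loss = torch.mean((representation(x) - S_target) ** 2)
    opt.zero_grad(); loss.backward(); opt.step()
print(loss.item())
```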
arXiv:1905.08601 [cs.SD, eess.AS] https://arxiv.org/abs/1905.08601
Title: Une ou deux composantes ? La réponse de la diffusion en ondelettes (One or two components? The answer of wavelet scattering)
Authors: Vincent Lostanlen
Abstract: With the aim of constructing a biologically plausible model of machine listening, we study the representation of a multicomponent stationary signal by a wavelet scattering network. First, we show that renormalizing second-order nodes by their first-order parents gives a simple numerical criterion to establish whether two neighboring components will interfere psychoacoustically. Secondly, we generalize the "one or two components" framework to three sine waves or more, and show that a network of depth $M = \log_2 N$ suffices to characterize the relative amplitudes of the first $N$ terms in a Fourier series, while enjoying properties of invariance to frequency transposition and component-wise phase shifts.
Submitted 1 July, 2019; v1 submitted 21 May, 2019; originally announced May 2019.
Comments: 4 pages, in French. Submitted to the GRETSI workshop
arXiv:1905.08352 [cs.SD, cs.AI, cs.LG, eess.AS] https://arxiv.org/abs/1905.08352
DOI: https://doi.org/10.1371/journal.pone.0214168
Title: Robust sound event detection in bioacoustic sensor networks
Authors: Vincent Lostanlen, Justin Salamon, Andrew Farnsworth, Steve Kelling, Juan Pablo Bello
Abstract: Bioacoustic sensors, sometimes known as autonomous recording units (ARUs), can record sounds of wildlife over long periods of time in scalable and minimally invasive ways. Deriving per-species abundance estimates from these sensors requires detection, classification, and quantification of animal vocalizations as individual acoustic events.
Abstract: Bioacoustic sensors, sometimes known as autonomous recording units (ARUs), can record sounds of wildlife over long periods of time in scalable and minimally invasive ways. Deriving per-species abundance estimates from these sensors requires detection, classification, and quantification of animal vocalizations as individual acoustic events. Yet, variability in ambient noise, both over time and across sensors, hinders the reliability of current automated systems for sound event detection (SED), such as convolutional neural networks (CNN) in the time-frequency domain. In this article, we develop, benchmark, and combine several machine listening techniques to improve the generalizability of SED models across heterogeneous acoustic environments. As a case study, we consider the problem of detecting avian flight calls from a ten-hour recording of nocturnal bird migration, recorded by a network of six ARUs in the presence of heterogeneous background noise. Starting from a CNN yielding state-of-the-art accuracy on this task, we introduce two noise adaptation techniques, respectively integrating short-term (60 milliseconds) and long-term (30 minutes) context. First, we apply per-channel energy normalization (PCEN) in the time-frequency domain, which applies short-term automatic gain control to every subband in the mel-frequency spectrogram. Secondly, we replace the last dense layer in the network by a context-adaptive neural network (CA-NN) layer. Combining them yields state-of-the-art results that are unmatched by artificial data augmentation alone. We release a pre-trained version of our best performing system under the name of BirdVoxDetect, a ready-to-use detector of avian flight calls in field recordings.
Submitted 29 October, 2019; v1 submitted 20 May, 2019; originally announced May 2019.
Comments: 32 pages, in English. Submitted to PLOS ONE journal in February 2019; revised August 2019; published October 2019.
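
The short-term adaptation step described above, PCEN, is available off the shelf in librosa. The sketch below applies it to a mel-frequency spectrogram; the gain, bias, power, and 60 ms time constant are plausible illustrative values rather than the settings tuned in the article, and librosa's bundled example clip stands in for a field recording.

    # Sketch of per-channel energy normalization (PCEN) on a mel spectrogram.
    # Parameter values are illustrative, not those tuned in the article.
    import librosa

    y, sr = librosa.load(librosa.ex('trumpet'))        # stand-in for a field recording
    hop_length = 512
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128,
                                       hop_length=hop_length, power=1.0)

    # PCEN = per-subband automatic gain control (smoothed over ~60 ms here)
    # followed by dynamic range compression.
    S_pcen = librosa.pcen(S * (2 ** 31), sr=sr, hop_length=hop_length,
                          time_constant=0.060, gain=0.98, bias=2.0, power=0.5)
    print(S_pcen.shape)                                 # (n_mels, n_frames)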

arXiv:1812.11214 [pdf, ps, other] (cs.LG, cs.CV, cs.SD, eess.AS, stat.ML)
Title: Kymatio: Scattering Transforms in Python
Authors: Mathieu Andreux, Tomás Angles, Georgios Exarchakis, Roberto Leonarduzzi, Gaspar Rochette, Louis Thiry, John Zarka, Stéphane Mallat, Joakim Andén, Eugene Belilovsky, Joan Bruna, Vincent Lostanlen, Muawiz Chaudhary, Matthew J. Hirn, Edouard Oyallon, Sixin Zhang, Carmine Cella, Michael Eickenberg
Abstract: The wavelet scattering transform is an invariant signal representation suitable for many signal processing and machine learning applications. We present the Kymatio software package, an easy-to-use, high-performance Python implementation of the scattering transform in 1D, 2D, and 3D that is compatible with modern deep learning frameworks. All transforms may be executed on a GPU (in addition to CPU), offering a considerable speedup over CPU implementations. The package also has a small memory footprint, resulting in efficient memory usage. The source code, documentation, and examples are available under a BSD license at https://www.kymat.io/
Submitted 31 May, 2022; v1 submitted 28 December, 2018; originally announced December 2018.
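
For a sense of the interface, here is a minimal 1D example with the package's PyTorch frontend; the values of J, Q, and the signal length are arbitrary, the input is random noise, and the CUDA branch merely assumes a GPU may be available.

    # Minimal usage sketch of Kymatio's 1D scattering with the PyTorch frontend.
    import torch
    from kymatio.torch import Scattering1D

    T = 2 ** 14
    x = torch.randn(1, T)                       # batch of one random signal
    scattering = Scattering1D(J=6, shape=T, Q=8)

    if torch.cuda.is_available():               # optional GPU execution
        scattering, x = scattering.cuda(), x.cuda()

    Sx = scattering(x)                          # (batch, n_paths, T / 2**J)
    print(Sx.shape)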

arXiv:1810.04506 [pdf, ps, other] (cs.SD, eess.AS)
Title: On Time-frequency Scattering and Computer Music
Authors: Vincent Lostanlen
Abstract: Time-frequency scattering is a mathematical transformation of sound waves. Its core purpose is to mimic the way the human auditory system extracts information from its environment. In the context of improving the artificial intelligence of sounds, it has found successful applications in automatic speech transcription as well as in the recognition of urban and musical sounds. In this article, we show that time-frequency scattering can also be useful for applications in contemporary music creation.
Submitted 20 May, 2019; v1 submitted 10 October, 2018; originally announced October 2018.
Comments: 5 pages. Published as a chapter in the book "Florian Hecker: Halluzination, Perspektive, Synthese", pp. 97-102. Nicolaus Schafhausen, Vanessa Joan Müller, editors. Sternberg Press, Berlin, 2019.

arXiv:1810.00790 [pdf, other] (cs.SD, eess.AS)
Title: Eigentriads and Eigenprogressions on the Tonnetz
Authors: Vincent Lostanlen
Abstract: We introduce a new multidimensional representation, named the eigenprogression transform, that characterizes some essential patterns of Western tonal harmony while being equivariant to time shifts and pitch transpositions. This representation is deep, multiscale, and convolutional in the piano-roll domain, yet incurs no prior training, and is thus suited to both supervised and unsupervised MIR tasks. The eigenprogression transform combines ideas from the spiral scattering transform, spectral graph theory, and wavelet shrinkage denoising. We report state-of-the-art results on a task of supervised composer recognition (Haydn vs. Mozart) from polyphonic music pieces in MIDI format.
Submitted 1 October, 2018; originally announced October 2018.
Comments: Proceedings of the Late-Breaking / Demo session (LBD) of the International Society for Music Information Retrieval (ISMIR), September 2018, Paris, France. Source code at github.com/lostanlen/ismir2018-lbd.
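
The spectral-graph-theory ingredient can be pictured on a toy Tonnetz: the sketch below links the 24 major and minor triads by the neo-Riemannian P, L, and R moves and takes Laplacian eigenvectors of the resulting graph. This is only an illustration of that single ingredient under simplifying assumptions, not the paper's eigentriad or eigenprogression construction.

    # Toy Tonnetz: 24 triads linked by parallel (P), leading-tone (L), and
    # relative (R) moves; Laplacian eigenvectors give smooth, transposition-
    # covariant coordinates on this graph. Illustration only.
    import numpy as np
    import networkx as nx

    def triad(root, quality):                  # quality: 'maj' or 'min'
        return (root % 12, quality)

    G = nx.Graph()
    for root in range(12):
        G.add_edge(triad(root, 'maj'), triad(root, 'min'))      # P: C maj <-> C min
        G.add_edge(triad(root, 'maj'), triad(root + 4, 'min'))  # L: C maj <-> E min
        G.add_edge(triad(root, 'maj'), triad(root + 9, 'min'))  # R: C maj <-> A min

    nodes = sorted(G.nodes())
    L = nx.laplacian_matrix(G, nodelist=nodes).toarray().astype(float)
    eigenvalues, eigenvectors = np.linalg.eigh(L)
    print(eigenvalues[:4])                     # 0 first, then the smoothest modes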

arXiv:1808.09730 [pdf, other] (cs.SD, eess.AS)
Title: Extended playing techniques: The next milestone in musical instrument recognition
Authors: Vincent Lostanlen, Joakim Andén, Mathieu Lagrange
Abstract: The expressive variability in producing a musical note conveys information essential to the modeling of orchestration and style. As such, it plays a crucial role in computer-assisted browsing of massive digital music corpora. Yet, although the automatic recognition of a musical instrument from the recording of a single "ordinary" note is considered a solved problem, automatic identification of instrumental playing technique (IPT) remains largely underdeveloped. We benchmark machine listening systems for query-by-example browsing among 143 extended IPTs for 16 instruments, amounting to 469 triplets of instrument, mute, and technique. We identify and discuss three necessary conditions for significantly outperforming the traditional mel-frequency cepstral coefficient (MFCC) baseline: the addition of second-order scattering coefficients to account for amplitude modulation, the incorporation of long-range temporal dependencies, and metric learning using large-margin nearest neighbors (LMNN) to reduce intra-class variability. Evaluating on the Studio On Line (SOL) dataset, we obtain a precision at rank 5 of 99.7% for instrument recognition (baseline at 89.0%) and of 61.0% for IPT recognition (baseline at 44.5%). We interpret this gain through a qualitative assessment of practical usability and visualization using nonlinear dimensionality reduction.
Submitted 29 August, 2018; originally announced August 2018.
Comments: 10 pages, 9 figures. The source code to reproduce the experiments of this paper is made available at: https://www.github.com/mathieulagrange/dlfm2018
Journal ref: Proceedings of the 5th International Workshop on Digital Libraries for Musicology (DLfM), Paris, France, September 2018. Published by ACM's International Conference Proceedings Series (ICPS).
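
For concreteness, precision at rank 5 in a query-by-example setting can be computed as below with scikit-learn nearest neighbors. The feature matrix X and label vector y are random placeholders for per-recording descriptors (such as averaged scattering coefficients) and their instrument or technique labels; the LMNN metric-learning stage reported above is omitted.

    # Sketch of precision-at-rank-5 for query-by-example retrieval.
    # X and y are random placeholders for real features and labels.
    import numpy as np
    from sklearn.neighbors import NearestNeighbors

    rng = np.random.default_rng(0)
    X = rng.normal(size=(500, 64))             # placeholder feature vectors
    y = rng.integers(0, 16, size=500)          # placeholder class labels

    nn = NearestNeighbors(n_neighbors=6).fit(X)    # 6 = the query itself + 5
    _, idx = nn.kneighbors(X)
    neighbors = idx[:, 1:]                         # drop the query itself
    p_at_5 = (y[neighbors] == y[:, None]).mean()
    print(f"precision at rank 5: {p_at_5:.3f}")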

arXiv:1807.08869 [pdf, other] (cs.SD, eess.AS); DOI: 10.1109/TSP.2019.2918992
Title: Joint Time-Frequency Scattering
Authors: Joakim Andén, Vincent Lostanlen, Stéphane Mallat
Abstract: In time series classification and regression, signals are typically mapped into some intermediate representation used for constructing models. Since the underlying task is often insensitive to time shifts, these representations are required to be time-shift invariant. We introduce the joint time-frequency scattering transform, a time-shift invariant representation which characterizes the multiscale energy distribution of a signal in time and frequency. It is computed through wavelet convolutions and modulus non-linearities and may therefore be implemented as a deep convolutional neural network whose filters are not learned but calculated from wavelets. We consider the progression from mel-spectrograms to time scattering and joint time-frequency scattering transforms, illustrating the relationship between increased discriminability and refinements of convolutional network architectures. The suitability of the joint time-frequency scattering transform for time-shift invariant characterization of time series is demonstrated through applications to chirp signals and audio synthesis experiments. The proposed transform also obtains state-of-the-art results on several audio classification tasks, outperforming time scattering transforms and achieving accuracies comparable to those of fully learned networks.
Submitted 12 July, 2019; v1 submitted 23 July, 2018; originally announced July 2018.
Comments: 14 pages, 10 figures.
Journal ref: IEEE Transactions on Signal Processing, vol. 67, no. 14, pp. 3704-3718, July 15, 2019.
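
The core computation described above can be sketched with a single two-dimensional wavelet: convolve a scalogram with one Gabor atom oriented in time and log-frequency, take the modulus, and average over time. A real implementation uses a full filter bank and proper subsampling (see, for instance, the Kymatio package); the constant-Q front end, atom parameters, and averaging window below are illustrative choices.

    # Single-atom sketch of joint time-frequency scattering: a 2D Gabor filter
    # applied to a constant-Q scalogram, then modulus and temporal averaging.
    import numpy as np
    import librosa
    from scipy.signal import fftconvolve

    y, sr = librosa.load(librosa.ex('trumpet'))
    U1 = np.abs(librosa.cqt(y, sr=sr, n_bins=96, bins_per_octave=12))

    # One Gabor atom with a temporal modulation rate (cycles per frame) and a
    # frequential modulation rate (cycles per bin); both values are arbitrary.
    t = np.arange(-32, 33)[None, :]
    f = np.arange(-12, 13)[:, None]
    gabor = (np.exp(2j * np.pi * (0.05 * t + 0.10 * f))
             * np.exp(-t ** 2 / (2 * 16 ** 2) - f ** 2 / (2 * 4 ** 2)))

    U2 = np.abs(fftconvolve(U1, gabor, mode='same'))          # second-order modulus
    n = (U2.shape[1] // 32) * 32                              # trim to a multiple of 32
    S2 = U2[:, :n].reshape(U2.shape[0], -1, 32).mean(axis=2)  # local time averaging
    print(S2.shape)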

arXiv:1605.06644 [pdf, other] (cs.SD)
Title: Deep convolutional networks on the pitch spiral for musical instrument recognition
Authors: Vincent Lostanlen, Carmine-Emanuele Cella
Abstract: Musical performance combines a wide range of pitches, nuances, and expressive techniques. Audio-based classification of musical instruments thus requires building signal representations that are invariant to such transformations. This article investigates the construction of learned convolutional architectures for instrument recognition, given a limited amount of annotated training data. In this context, we benchmark three different weight-sharing strategies for deep convolutional networks in the time-frequency domain: temporal kernels; time-frequency kernels; and a linear combination of time-frequency kernels which are one octave apart, akin to a Shepard pitch spiral. We provide an acoustical interpretation of these strategies within the source-filter framework of quasi-harmonic sounds with a fixed spectral envelope, which are archetypal of musical notes. The best classification accuracy is obtained by hybridizing all three convolutional layers into a single deep learning architecture.
Submitted 10 January, 2017; v1 submitted 21 May, 2016; originally announced May 2016.
Comments: 7 pages, 3 figures. Accepted at the International Society for Music Information Retrieval (ISMIR) conference in New York City, NY, USA, August 2016.
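
As an illustration of the three weight-sharing strategies (not the paper's exact architecture), the sketch below applies each kind of convolution to a constant-Q input with 12 bins per octave; the layer widths, kernel sizes, and input shape are arbitrary.

    # Illustrative PyTorch sketch of the three weight-sharing strategies on a
    # constant-Q input of shape (batch, 1, 96 bins = 8 octaves x 12 chroma, time).
    import torch
    import torch.nn as nn

    x = torch.randn(8, 1, 96, 128)

    # 1) Temporal kernels: convolve along time only.
    conv_time = nn.Conv2d(1, 16, kernel_size=(1, 9), padding=(0, 4))

    # 2) Time-frequency kernels: convolve jointly in time and log-frequency.
    conv_tf = nn.Conv2d(1, 16, kernel_size=(13, 9), padding=(6, 4))

    # 3) Spiral kernels: fold log-frequency into (octave, chroma) so that bins
    #    one octave apart become neighbors along a third axis.
    x_spiral = x.view(8, 1, 8, 12, 128)         # (batch, ch, octave, chroma, time)
    conv_spiral = nn.Conv3d(1, 16, kernel_size=(3, 5, 9), padding=(1, 2, 4))

    print(conv_time(x).shape, conv_tf(x).shape, conv_spiral(x_spiral).shape)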

arXiv:1601.00287 [pdf, other] (cs.SD, cs.AI)
Title: Wavelet Scattering on the Pitch Spiral
Authors: Vincent Lostanlen, Stéphane Mallat
Abstract: We present a new representation of harmonic sounds that linearizes the dynamics of pitch and spectral envelope, while remaining stable to deformations in the time-frequency plane. It is an instance of the scattering transform, a generic operator which cascades wavelet convolutions and modulus nonlinearities. It is derived from the pitch spiral, in that convolutions are successively performed in time, log-frequency, and octave index. We give a closed-form approximation of spiral scattering coefficients for a nonstationary generalization of the harmonic source-filter model.
Submitted 3 January, 2016; originally announced January 2016.
Comments: 4 pages, 3 figures.
MSC Class: 65T60
Journal ref: Proceedings of the 18th International Conference on Digital Audio Effects (DAFx-15), Trondheim, Norway, Nov 30 - Dec 3, 2015, pp. 429-432.

arXiv:1512.02125 [pdf, other] (cs.SD); DOI: 10.1109/MLSP.2015.7324385
Title: Joint Time-Frequency Scattering for Audio Classification
Authors: Joakim Andén, Vincent Lostanlen, Stéphane Mallat
Abstract: We introduce the joint time-frequency scattering transform, a time-shift invariant descriptor of time-frequency structure for audio classification. It is obtained by applying a two-dimensional wavelet transform in time and log-frequency to a time-frequency wavelet scalogram. We show that this descriptor successfully characterizes complex time-frequency phenomena such as time-varying filters and frequency-modulated excitations. State-of-the-art results are achieved for signal reconstruction and phone segment classification on the TIMIT dataset.
Submitted 7 December, 2015; originally announced December 2015.
Comments: 6 pages, 2 figures. In the IEEE 25th International Workshop on Machine Learning for Signal Processing (MLSP), Boston, USA, Sept. 17-20, 2015.

arXiv:1509.00334 [pdf, other] (cs.SD)
Title: Transformée en scattering sur la spirale temps-chroma-octave [Scattering transform on the time-chroma-octave spiral]
Authors: Vincent Lostanlen, Stéphane Mallat
Abstract: We introduce a scattering representation for the analysis and classification of sounds. It is locally translation-invariant, stable to deformations in time and frequency, and has the ability to capture harmonic structures. The scattering representation can be interpreted as a convolutional neural network which cascades a wavelet transform in time and along a harmonic spiral. We study its application for the analysis of the deformations of the source-filter model.
Submitted 1 September, 2015; originally announced September 2015.
Comments: In French, 4 pages, 3 figures. Presented at GRETSI 2015 in Lyon, France.
href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>