Search | arXiv e-print repository

Showing 1–39 of 39 results for author: Oord, A v d

Searching in archive cs.
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.07009">arXiv:2408.07009</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.07009">pdf</a>, <a href="https://arxiv.org/format/2408.07009">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Imagen 3 </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Imagen-Team-Google"> Imagen-Team-Google</a>, <a href="/search/cs?searchtype=author&amp;query=%3A"> :</a>, <a href="/search/cs?searchtype=author&amp;query=Baldridge%2C+J">Jason Baldridge</a>, <a href="/search/cs?searchtype=author&amp;query=Bauer%2C+J">Jakob Bauer</a>, <a href="/search/cs?searchtype=author&amp;query=Bhutani%2C+M">Mukul Bhutani</a>, <a href="/search/cs?searchtype=author&amp;query=Brichtova%2C+N">Nicole Brichtova</a>, <a href="/search/cs?searchtype=author&amp;query=Bunner%2C+A">Andrew Bunner</a>, <a href="/search/cs?searchtype=author&amp;query=Castrejon%2C+L">Lluis Castrejon</a>, <a href="/search/cs?searchtype=author&amp;query=Chan%2C+K">Kelvin Chan</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yichang Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Dieleman%2C+S">Sander Dieleman</a>, <a href="/search/cs?searchtype=author&amp;query=Du%2C+Y">Yuqing Du</a>, <a href="/search/cs?searchtype=author&amp;query=Eaton-Rosen%2C+Z">Zach Eaton-Rosen</a>, <a href="/search/cs?searchtype=author&amp;query=Fei%2C+H">Hongliang Fei</a>, <a href="/search/cs?searchtype=author&amp;query=de+Freitas%2C+N">Nando de Freitas</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+Y">Yilin Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Gladchenko%2C+E">Evgeny Gladchenko</a>, <a href="/search/cs?searchtype=author&amp;query=Colmenarejo%2C+S+G">Sergio G贸mez Colmenarejo</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+M">Mandy Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Haig%2C+A">Alex Haig</a>, <a href="/search/cs?searchtype=author&amp;query=Hawkins%2C+W">Will Hawkins</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+H">Hexiang Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+H">Huilian Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Igwe%2C+T+P">Tobenna Peter Igwe</a>, <a href="/search/cs?searchtype=author&amp;query=Kaplanis%2C+C">Christos Kaplanis</a> , et al. 
(237 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.07009v3-abstract-short" style="display: inline;"> We introduce Imagen 3, a latent diffusion model that generates high quality images from text prompts. We describe our quality and responsibility evaluations. Imagen 3 is preferred over other state-of-the-art (SOTA) models at the time of evaluation. In addition, we discuss issues around safety and representation, as well as methods we used to minimize the potential harm of our models. </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.07009v3-abstract-full" style="display: none;"> We introduce Imagen 3, a latent diffusion model that generates high quality images from text prompts. We describe our quality and responsibility evaluations. Imagen 3 is preferred over other state-of-the-art (SOTA) models at the time of evaluation. In addition, we discuss issues around safety and representation, as well as methods we used to minimize the potential harm of our models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.07009v3-abstract-full').style.display = 'none'; document.getElementById('2408.07009v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.03526">arXiv:2308.03526</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2308.03526">pdf</a>, <a href="https://arxiv.org/format/2308.03526">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> AlphaStar Unplugged: Large-Scale Offline Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Mathieu%2C+M">Micha毛l Mathieu</a>, <a href="/search/cs?searchtype=author&amp;query=Ozair%2C+S">Sherjil Ozair</a>, <a href="/search/cs?searchtype=author&amp;query=Srinivasan%2C+S">Srivatsan Srinivasan</a>, <a href="/search/cs?searchtype=author&amp;query=Gulcehre%2C+C">Caglar Gulcehre</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+S">Shangtong Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+R">Ray Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Paine%2C+T+L">Tom Le Paine</a>, <a href="/search/cs?searchtype=author&amp;query=Powell%2C+R">Richard Powell</a>, <a href="/search/cs?searchtype=author&amp;query=%C5%BBo%C5%82na%2C+K">Konrad 呕o艂na</a>, <a href="/search/cs?searchtype=author&amp;query=Schrittwieser%2C+J">Julian Schrittwieser</a>, <a href="/search/cs?searchtype=author&amp;query=Choi%2C+D">David Choi</a>, <a href="/search/cs?searchtype=author&amp;query=Georgiev%2C+P">Petko Georgiev</a>, <a href="/search/cs?searchtype=author&amp;query=Toyama%2C+D">Daniel Toyama</a>, <a 
href="/search/cs?searchtype=author&amp;query=Huang%2C+A">Aja Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Ring%2C+R">Roman Ring</a>, <a href="/search/cs?searchtype=author&amp;query=Babuschkin%2C+I">Igor Babuschkin</a>, <a href="/search/cs?searchtype=author&amp;query=Ewalds%2C+T">Timo Ewalds</a>, <a href="/search/cs?searchtype=author&amp;query=Bordbar%2C+M">Mahyar Bordbar</a>, <a href="/search/cs?searchtype=author&amp;query=Henderson%2C+S">Sarah Henderson</a>, <a href="/search/cs?searchtype=author&amp;query=Colmenarejo%2C+S+G">Sergio G贸mez Colmenarejo</a>, <a href="/search/cs?searchtype=author&amp;query=Oord%2C+A+v+d">A盲ron van den Oord</a>, <a href="/search/cs?searchtype=author&amp;query=Czarnecki%2C+W+M">Wojciech Marian Czarnecki</a>, <a href="/search/cs?searchtype=author&amp;query=de+Freitas%2C+N">Nando de Freitas</a>, <a href="/search/cs?searchtype=author&amp;query=Vinyals%2C+O">Oriol Vinyals</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.03526v1-abstract-short" style="display: inline;"> StarCraft II is one of the most challenging simulated reinforcement learning environments; it is partially observable, stochastic, multi-agent, and mastering StarCraft II requires strategic planning over long time horizons with real-time low-level execution. It also has an active professional competitive scene. StarCraft II is uniquely suited for advancing offline RL algorithms, both because of it&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.03526v1-abstract-full').style.display = 'inline'; document.getElementById('2308.03526v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.03526v1-abstract-full" style="display: none;"> StarCraft II is one of the most challenging simulated reinforcement learning environments; it is partially observable, stochastic, multi-agent, and mastering StarCraft II requires strategic planning over long time horizons with real-time low-level execution. It also has an active professional competitive scene. StarCraft II is uniquely suited for advancing offline RL algorithms, both because of its challenging nature and because Blizzard has released a massive dataset of millions of StarCraft II games played by human players. This paper leverages that and establishes a benchmark, called AlphaStar Unplugged, introducing unprecedented challenges for offline reinforcement learning. We define a dataset (a subset of Blizzard&#39;s release), tools standardizing an API for machine learning methods, and an evaluation protocol. We also present baseline agents, including behavior cloning, offline variants of actor-critic and MuZero. We improve the state of the art of agents using only offline data, and we achieve 90% win rate against previously published AlphaStar behavior cloning agent. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.03526v1-abstract-full').style.display = 'none'; document.getElementById('2308.03526v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">32 pages, 13 figures, previous version published as a NeurIPS 2021 workshop: https://openreview.net/forum?id=Np8Pumfoty</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2112.06749">arXiv:2112.06749</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2112.06749">pdf</a>, <a href="https://arxiv.org/format/2112.06749">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Step-unrolled Denoising Autoencoders for Text Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Savinov%2C+N">Nikolay Savinov</a>, <a href="/search/cs?searchtype=author&amp;query=Chung%2C+J">Junyoung Chung</a>, <a href="/search/cs?searchtype=author&amp;query=Binkowski%2C+M">Mikolaj Binkowski</a>, <a href="/search/cs?searchtype=author&amp;query=Elsen%2C+E">Erich Elsen</a>, <a href="/search/cs?searchtype=author&amp;query=Oord%2C+A+v+d">Aaron van den Oord</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2112.06749v3-abstract-short" style="display: inline;"> In this paper we propose a new generative model of text, Step-unrolled Denoising Autoencoder (SUNDAE), that does not rely on autoregressive models. Similarly to denoising diffusion techniques, SUNDAE is repeatedly applied on a sequence of tokens, starting from random inputs and improving them each time until convergence. We present a simple new improvement operator that converges in fewer iteratio&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2112.06749v3-abstract-full').style.display = 'inline'; document.getElementById('2112.06749v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2112.06749v3-abstract-full" style="display: none;"> In this paper we propose a new generative model of text, Step-unrolled Denoising Autoencoder (SUNDAE), that does not rely on autoregressive models. Similarly to denoising diffusion techniques, SUNDAE is repeatedly applied on a sequence of tokens, starting from random inputs and improving them each time until convergence. We present a simple new improvement operator that converges in fewer iterations than diffusion methods, while qualitatively producing better samples on natural language datasets. SUNDAE achieves state-of-the-art results (among non-autoregressive methods) on the WMT&#39;14 English-to-German translation task and good qualitative results on unconditional language modeling on the Colossal Cleaned Common Crawl dataset and a dataset of Python code from GitHub. The non-autoregressive nature of SUNDAE opens up possibilities beyond left-to-right prompted generation, by filling in arbitrary blank patterns in a template. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2112.06749v3-abstract-full').style.display = 'none'; document.getElementById('2112.06749v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 April, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 December, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ICLR 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2111.12124">arXiv:2111.12124</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2111.12124">pdf</a>, <a href="https://arxiv.org/ps/2111.12124">ps</a>, <a href="https://arxiv.org/format/2111.12124">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Towards Learning Universal Audio Representations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Luyu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Luc%2C+P">Pauline Luc</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Y">Yan Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Recasens%2C+A">Adria Recasens</a>, <a href="/search/cs?searchtype=author&amp;query=Smaira%2C+L">Lucas Smaira</a>, <a href="/search/cs?searchtype=author&amp;query=Brock%2C+A">Andrew Brock</a>, <a href="/search/cs?searchtype=author&amp;query=Jaegle%2C+A">Andrew Jaegle</a>, <a href="/search/cs?searchtype=author&amp;query=Alayrac%2C+J">Jean-Baptiste Alayrac</a>, <a href="/search/cs?searchtype=author&amp;query=Dieleman%2C+S">Sander Dieleman</a>, <a href="/search/cs?searchtype=author&amp;query=Carreira%2C+J">Joao Carreira</a>, <a href="/search/cs?searchtype=author&amp;query=Oord%2C+A+v+d">Aaron van den Oord</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2111.12124v3-abstract-short" style="display: inline;"> The ability to learn universal audio representations that can solve diverse speech, music, and environment tasks can spur many applications that require general sound content understanding. 
In this work, we introduce a holistic audio representation evaluation suite (HARES) spanning 12 downstream tasks across audio domains and provide a thorough empirical study of recent sound representation learni&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.12124v3-abstract-full').style.display = 'inline'; document.getElementById('2111.12124v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2111.12124v3-abstract-full" style="display: none;"> The ability to learn universal audio representations that can solve diverse speech, music, and environment tasks can spur many applications that require general sound content understanding. In this work, we introduce a holistic audio representation evaluation suite (HARES) spanning 12 downstream tasks across audio domains and provide a thorough empirical study of recent sound representation learning systems on that benchmark. We discover that previous sound event classification or speech models do not generalize outside of their domains. We observe that more robust audio representations can be learned with the SimCLR objective; however, the model&#39;s transferability depends heavily on the model architecture. We find the Slowfast architecture is good at learning rich representations required by different domains, but its performance is affected by the normalization scheme. Based on these findings, we propose a novel normalizer-free Slowfast NFNet and achieve state-of-the-art performance across all domains. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.12124v3-abstract-full').style.display = 'none'; document.getElementById('2111.12124v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 June, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 November, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2021. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2106.04615">arXiv:2106.04615</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2106.04615">pdf</a>, <a href="https://arxiv.org/format/2106.04615">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Vector Quantized Models for Planning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ozair%2C+S">Sherjil Ozair</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yazhe Li</a>, <a href="/search/cs?searchtype=author&amp;query=Razavi%2C+A">Ali Razavi</a>, <a href="/search/cs?searchtype=author&amp;query=Antonoglou%2C+I">Ioannis Antonoglou</a>, <a href="/search/cs?searchtype=author&amp;query=Oord%2C+A+v+d">A盲ron van den Oord</a>, <a href="/search/cs?searchtype=author&amp;query=Vinyals%2C+O">Oriol Vinyals</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2106.04615v2-abstract-short" style="display: inline;"> Recent developments in the field of model-based RL have proven successful in a range of environments, especially ones where planning is essential. However, such successes have been limited to deterministic fully-observed environments. We present a new approach that handles stochastic and partially-observable environments. Our key insight is to use discrete autoencoders to capture the multiple poss&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.04615v2-abstract-full').style.display = 'inline'; document.getElementById('2106.04615v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2106.04615v2-abstract-full" style="display: none;"> Recent developments in the field of model-based RL have proven successful in a range of environments, especially ones where planning is essential. However, such successes have been limited to deterministic fully-observed environments. We present a new approach that handles stochastic and partially-observable environments. Our key insight is to use discrete autoencoders to capture the multiple possible effects of an action in a stochastic environment. We use a stochastic variant of Monte Carlo tree search to plan over both the agent&#39;s actions and the discrete latent variables representing the environment&#39;s response. Our approach significantly outperforms an offline version of MuZero on a stochastic interpretation of chess where the opponent is considered part of the environment. We also show that our approach scales to DeepMind Lab, a first-person 3D environment with large visual observations and partial observability. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.04615v2-abstract-full').style.display = 'none'; document.getElementById('2106.04615v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICML 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2105.08054">arXiv:2105.08054</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2105.08054">pdf</a>, <a href="https://arxiv.org/format/2105.08054">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Divide and Contrast: Self-supervised Learning from Uncurated Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tian%2C+Y">Yonglong Tian</a>, <a href="/search/cs?searchtype=author&amp;query=Henaff%2C+O+J">Olivier J. Henaff</a>, <a href="/search/cs?searchtype=author&amp;query=Oord%2C+A+v+d">Aaron van den Oord</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2105.08054v1-abstract-short" style="display: inline;"> Self-supervised learning holds promise in leveraging large amounts of unlabeled data, however much of its progress has thus far been limited to highly curated pre-training data such as ImageNet. We explore the effects of contrastive learning from larger, less-curated image datasets such as YFCC, and find there is indeed a large difference in the resulting representation quality. We hypothesize tha&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2105.08054v1-abstract-full').style.display = 'inline'; document.getElementById('2105.08054v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2105.08054v1-abstract-full" style="display: none;"> Self-supervised learning holds promise in leveraging large amounts of unlabeled data, however much of its progress has thus far been limited to highly curated pre-training data such as ImageNet. We explore the effects of contrastive learning from larger, less-curated image datasets such as YFCC, and find there is indeed a large difference in the resulting representation quality. We hypothesize that this curation gap is due to a shift in the distribution of image classes -- which is more diverse and heavy-tailed -- resulting in less relevant negative samples to learn from. We test this hypothesis with a new approach, Divide and Contrast (DnC), which alternates between contrastive learning and clustering-based hard negative mining. When pretrained on less curated datasets, DnC greatly improves the performance of self-supervised learning on downstream tasks, while remaining competitive with the current state-of-the-art on curated datasets. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2105.08054v1-abstract-full').style.display = 'none'; document.getElementById('2105.08054v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 May, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2104.12807">arXiv:2104.12807</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2104.12807">pdf</a>, <a href="https://arxiv.org/format/2104.12807">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Multimodal Self-Supervised Learning of General Audio Representations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Luyu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Luc%2C+P">Pauline Luc</a>, <a href="/search/cs?searchtype=author&amp;query=Recasens%2C+A">Adria Recasens</a>, <a href="/search/cs?searchtype=author&amp;query=Alayrac%2C+J">Jean-Baptiste Alayrac</a>, <a href="/search/cs?searchtype=author&amp;query=Oord%2C+A+v+d">Aaron van den Oord</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2104.12807v2-abstract-short" style="display: inline;"> We present a multimodal framework to learn general audio representations from videos. Existing contrastive audio representation learning methods mainly focus on using the audio modality alone during training. In this work, we show that additional information contained in video can be utilized to greatly improve the learned features. First, we demonstrate that our contrastive framework does not req&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.12807v2-abstract-full').style.display = 'inline'; document.getElementById('2104.12807v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2104.12807v2-abstract-full" style="display: none;"> We present a multimodal framework to learn general audio representations from videos. Existing contrastive audio representation learning methods mainly focus on using the audio modality alone during training. In this work, we show that additional information contained in video can be utilized to greatly improve the learned features. First, we demonstrate that our contrastive framework does not require high resolution images to learn good audio features. This allows us to scale up the training batch size, while keeping the computational load incurred by the additional video modality to a reasonable level. Second, we use augmentations that mix together different samples. We show that this is effective to make the proxy task harder, which leads to substantial performance improvements when increasing the batch size. 
7. arXiv:2104.12807 [pdf, other]  cs.SD, eess.AS
Multimodal Self-Supervised Learning of General Audio Representations
Authors: Luyu Wang, Pauline Luc, Adria Recasens, Jean-Baptiste Alayrac, Aaron van den Oord
Abstract: We present a multimodal framework to learn general audio representations from videos. Existing contrastive audio representation learning methods mainly focus on using the audio modality alone during training. In this work, we show that additional information contained in video can be utilized to greatly improve the learned features. First, we demonstrate that our contrastive framework does not require high resolution images to learn good audio features. This allows us to scale up the training batch size, while keeping the computational load incurred by the additional video modality to a reasonable level. Second, we use augmentations that mix together different samples. We show that this is effective to make the proxy task harder, which leads to substantial performance improvements when increasing the batch size. As a result, our audio model achieves a state-of-the-art of 42.4 mAP on the AudioSet classification downstream task, closing the gap between supervised and self-supervised methods trained on the same dataset. Moreover, we show that our method is advantageous on a broad range of non-semantic audio tasks, including speaker identification, keyword spotting, language identification, and music instrument classification.
Submitted 28 April, 2021; v1 submitted 26 April, 2021; originally announced April 2021.

8. arXiv:2103.16559 [pdf, other]  cs.CV
Broaden Your Views for Self-Supervised Video Learning
Authors: Adrià Recasens, Pauline Luc, Jean-Baptiste Alayrac, Luyu Wang, Ross Hemsley, Florian Strub, Corentin Tallec, Mateusz Malinowski, Viorica Patraucean, Florent Altché, Michal Valko, Jean-Bastien Grill, Aäron van den Oord, Andrew Zisserman
Abstract: Most successful self-supervised learning methods are trained to align the representations of two independent views from the data. State-of-the-art methods in video are inspired by image techniques, where these two views are similarly extracted by cropping and augmenting the resulting crop. However, these methods miss a crucial element in the video domain: time. We introduce BraVe, a self-supervised learning framework for video. In BraVe, one of the views has access to a narrow temporal window of the video while the other view has a broad access to the video content. Our models learn to generalise from the narrow view to the general content of the video. Furthermore, BraVe processes the views with different backbones, enabling the use of alternative augmentations or modalities into the broad view such as optical flow, randomly convolved RGB frames, audio or their combinations. We demonstrate that BraVe achieves state-of-the-art results in self-supervised representation learning on standard video and audio classification benchmarks including UCF101, HMDB51, Kinetics, ESC-50 and AudioSet.
Submitted 19 October, 2021; v1 submitted 30 March, 2021; originally announced March 2021.
Comments: This paper is an extended version of our ICCV-21 paper. It includes more results as well as a minor architectural variation which improves results
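The narrow/broad view construction described in the BraVe abstract amounts to pairing a short clip with a much longer temporal context from the same video, with each view going to its own backbone. A minimal sampling sketch, with made-up window lengths and random frame features standing in for a real video and encoders:

```python
# Toy sketch of narrow vs. broad view sampling from one video (illustrative
# only). The narrow view sees a short clip, the broad view a long context;
# in BraVe each would be processed by a different backbone.
import numpy as np

rng = np.random.default_rng(0)
video = rng.normal(size=(300, 8))   # stand-in: 300 frames of 8-dim features

NARROW_LEN, BROAD_LEN = 16, 128     # illustrative window lengths


def sample_view(frames, length):
    start = rng.integers(0, len(frames) - length + 1)
    return frames[start:start + length]


narrow_view = sample_view(video, NARROW_LEN)   # input to the narrow backbone
broad_view = sample_view(video, BROAD_LEN)     # input to the broad backbone
print(narrow_view.shape, broad_view.shape)
```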
9. arXiv:2103.10957 [pdf, other]  cs.CV
Efficient Visual Pretraining with Contrastive Detection
Authors: Olivier J. Hénaff, Skanda Koppula, Jean-Baptiste Alayrac, Aaron van den Oord, Oriol Vinyals, João Carreira
Abstract: Self-supervised pretraining has been shown to yield powerful representations for transfer learning. These performance gains come at a large computational cost however, with state-of-the-art methods requiring an order of magnitude more computation than supervised pretraining. We tackle this computational bottleneck by introducing a new self-supervised objective, contrastive detection, which tasks representations with identifying object-level features across augmentations. This objective extracts a rich learning signal per image, leading to state-of-the-art transfer accuracy on a variety of downstream tasks, while requiring up to 10x less pretraining. In particular, our strongest ImageNet-pretrained model performs on par with SEER, one of the largest self-supervised systems to date, which uses 1000x more pretraining data. Finally, our objective seamlessly handles pretraining on more complex images such as those in COCO, closing the gap with supervised transfer learning from COCO to PASCAL.
Submitted 5 August, 2021; v1 submitted 19 March, 2021; originally announced March 2021.
Comments: Technical report

10. arXiv:2103.06508 [pdf, other]  cs.SD, cs.LG, eess.AS
Multi-Format Contrastive Learning of Audio Representations
Authors: Luyu Wang, Aaron van den Oord
Abstract: Recent advances suggest the advantage of multi-modal training in comparison with single-modal methods. In contrast to this view, in our work we find that similar gain can be obtained from training with different formats of a single modality. In particular, we investigate the use of the contrastive learning framework to learn audio representations by maximizing the agreement between the raw audio and its spectral representation. We find a significant gain using this multi-format strategy against the single-format counterparts. Moreover, on the downstream AudioSet and ESC-50 classification task, our audio-only approach achieves new state-of-the-art results with a mean average precision of 0.376 and an accuracy of 90.5%, respectively.
Submitted 23 March, 2021; v1 submitted 11 March, 2021; originally announced March 2021.
11. arXiv:2103.01950 [pdf, other]  cs.CV, cs.LG
Predicting Video with VQVAE
Authors: Jacob Walker, Ali Razavi, Aäron van den Oord
Abstract: In recent years, the task of video prediction (forecasting future video given past video frames) has attracted attention in the research community. In this paper we propose a novel approach to this problem with Vector Quantized Variational AutoEncoders (VQ-VAE). With VQ-VAE we compress high-resolution videos into a hierarchical set of multi-scale discrete latent variables. Compared to pixels, this compressed latent space has dramatically reduced dimensionality, allowing us to apply scalable autoregressive generative models to predict video. In contrast to previous work that has largely emphasized highly constrained datasets, we focus on very diverse, large-scale datasets such as Kinetics-600. We predict video at a higher resolution on unconstrained videos, 256x256, than any other previous method to our knowledge. We further validate our approach against prior work via a crowdsourced human evaluation.
Submitted 2 March, 2021; originally announced March 2021.
Comments: 13 Pages
ACM Class: I.2.6; I.2.10
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2006.07159v1-abstract-full').style.display = 'none'; document.getElementById('2006.07159v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 June, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">All five authors contributed equally. New labels at https://github.com/google-research/reassessed-imagenet</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2001.11128">arXiv:2001.11128</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2001.11128">pdf</a>, <a href="https://arxiv.org/format/2001.11128">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Learning Robust and Multilingual Speech Representations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kawakami%2C+K">Kazuya Kawakami</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Luyu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Dyer%2C+C">Chris Dyer</a>, <a href="/search/cs?searchtype=author&amp;query=Blunsom%2C+P">Phil Blunsom</a>, <a href="/search/cs?searchtype=author&amp;query=Oord%2C+A+v+d">Aaron van den Oord</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2001.11128v1-abstract-short" style="display: inline;"> Unsupervised speech representation learning has shown remarkable success at finding representations that correlate with phonetic structures and improve downstream speech recognition performance. However, most research has been focused on evaluating the representations in terms of their ability to improve the performance of speech recognition systems on read English (e.g. Wall Street Journal and Li&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2001.11128v1-abstract-full').style.display = 'inline'; document.getElementById('2001.11128v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2001.11128v1-abstract-full" style="display: none;"> Unsupervised speech representation learning has shown remarkable success at finding representations that correlate with phonetic structures and improve downstream speech recognition performance. However, most research has been focused on evaluating the representations in terms of their ability to improve the performance of speech recognition systems on read English (e.g. Wall Street Journal and LibriSpeech). This evaluation methodology overlooks two important desiderata that speech representations should have: robustness to domain shifts and transferability to other languages. 
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1910.06464">arXiv:1910.06464</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1910.06464">pdf</a>, <a href="https://arxiv.org/format/1910.06464">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/ICASSP.2019.8683277">10.1109/ICASSP.2019.8683277 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Low Bit-Rate Speech Coding with VQ-VAE and a WaveNet Decoder </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=G%C3%A2rbacea%2C+C">Cristina Gârbacea</a>, <a href="/search/cs?searchtype=author&amp;query=Oord%2C+A+v+d">Aäron van den Oord</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yazhe Li</a>, <a href="/search/cs?searchtype=author&amp;query=Lim%2C+F+S+C">Felicia S C Lim</a>, <a href="/search/cs?searchtype=author&amp;query=Luebs%2C+A">Alejandro Luebs</a>, <a href="/search/cs?searchtype=author&amp;query=Vinyals%2C+O">Oriol Vinyals</a>, <a href="/search/cs?searchtype=author&amp;query=Walters%2C+T+C">Thomas C Walters</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="1910.06464v1-abstract-full"> In order to efficiently transmit and store speech signals, speech codecs create a minimally redundant representation of the input signal which is then decoded at the receiver with the best possible perceptual quality. In this work we demonstrate that a neural network architecture based on VQ-VAE with a WaveNet decoder can be used to perform very low bit-rate speech coding with high reconstruction quality. A prosody-transparent and speaker-independent model trained on the LibriSpeech corpus coding audio at 1.6 kbps exhibits perceptual quality which is around halfway between the MELP codec at 2.4 kbps and AMR-WB codec at 23.05 kbps. In addition, when training on high-quality recorded speech with the test speaker included in the training set, a model coding speech at 1.6 kbps produces output of similar perceptual quality to that generated by AMR-WB at 23.05 kbps. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICASSP 2019</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> ICASSP 2019-2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 735-739. IEEE, 2019 </p> </li>
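<li> <p class="is-size-7">The 1.6 kbps figure quoted above is the kind of rate a discrete-latent codec gets by construction: the bit rate is simply bits per code times codes per second. The codebook size and code rate below are illustrative assumptions used only to show the arithmetic, not values reported in this listing.</p>
<pre><code class="language-python">import math

codebook_size = 256        # assumed: one byte of information per discrete code
codes_per_second = 200     # assumed: rate at which the encoder emits codes
bits_per_code = math.log2(codebook_size)
print(bits_per_code * codes_per_second / 1000, "kbps")   # -> 1.6 kbps
</code></pre> </li>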
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1906.09237">arXiv:1906.09237</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1906.09237">pdf</a>, <a href="https://arxiv.org/format/1906.09237">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Shaping Belief States with Generative Environment Models for RL </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gregor%2C+K">Karol Gregor</a>, <a href="/search/cs?searchtype=author&amp;query=Rezende%2C+D+J">Danilo Jimenez Rezende</a>, <a href="/search/cs?searchtype=author&amp;query=Besse%2C+F">Frederic Besse</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Y">Yan Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Merzic%2C+H">Hamza Merzic</a>, <a href="/search/cs?searchtype=author&amp;query=Oord%2C+A+v+d">Aaron van den Oord</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="1906.09237v2-abstract-full"> When agents interact with a complex environment, they must form and maintain beliefs about the relevant aspects of that environment. We propose a way to efficiently train expressive generative models in complex environments. We show that a predictive algorithm with an expressive generative model can form stable belief-states in visually rich and dynamic 3D environments. More precisely, we show that the learned representation captures the layout of the environment as well as the position and orientation of the agent. Our experiments show that the model substantially improves data-efficiency on a number of reinforcement learning (RL) tasks compared with strong model-free baseline agents. We find that predicting multiple steps into the future (overshooting), in combination with an expressive generative model, is critical for stable representations to emerge. In practice, using expressive generative models in RL is computationally expensive and we propose a scheme to reduce this computational burden, allowing us to build agents that are competitive with model-free baselines. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 June, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 June, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">pre-print</span> </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1906.00446">arXiv:1906.00446</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1906.00446">pdf</a>, <a href="https://arxiv.org/format/1906.00446">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Generating Diverse High-Fidelity Images with VQ-VAE-2 </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Razavi%2C+A">Ali Razavi</a>, <a href="/search/cs?searchtype=author&amp;query=Oord%2C+A+v+d">Aaron van den Oord</a>, <a href="/search/cs?searchtype=author&amp;query=Vinyals%2C+O">Oriol Vinyals</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="1906.00446v1-abstract-full"> We explore the use of Vector Quantized Variational AutoEncoder (VQ-VAE) models for large scale image generation. To this end, we scale and enhance the autoregressive priors used in VQ-VAE to generate synthetic samples of much higher coherence and fidelity than possible before. We use simple feed-forward encoder and decoder networks, making our model an attractive candidate for applications where the encoding and/or decoding speed is critical. Additionally, VQ-VAE requires sampling an autoregressive model only in the compressed latent space, which is an order of magnitude faster than sampling in the pixel space, especially for large images. We demonstrate that a multi-scale hierarchical organization of VQ-VAE, augmented with powerful priors over the latent codes, is able to generate samples with quality that rivals that of state of the art Generative Adversarial Networks on multifaceted datasets such as ImageNet, while not suffering from GAN&#39;s known shortcomings such as mode collapse and lack of diversity. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 June, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2019. </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1905.09272">arXiv:1905.09272</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1905.09272">pdf</a>, <a href="https://arxiv.org/format/1905.09272">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Data-Efficient Image Recognition with Contrastive Predictive Coding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=H%C3%A9naff%2C+O+J">Olivier J. Hénaff</a>, <a href="/search/cs?searchtype=author&amp;query=Srinivas%2C+A">Aravind Srinivas</a>, <a href="/search/cs?searchtype=author&amp;query=De+Fauw%2C+J">Jeffrey De Fauw</a>, <a href="/search/cs?searchtype=author&amp;query=Razavi%2C+A">Ali Razavi</a>, <a href="/search/cs?searchtype=author&amp;query=Doersch%2C+C">Carl Doersch</a>, <a href="/search/cs?searchtype=author&amp;query=Eslami%2C+S+M+A">S. M. Ali Eslami</a>, <a href="/search/cs?searchtype=author&amp;query=Oord%2C+A+v+d">Aaron van den Oord</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="1905.09272v3-abstract-full"> Human observers can learn to recognize new categories of images from a handful of examples, yet doing so with artificial ones remains an open challenge. We hypothesize that data-efficient recognition is enabled by representations which make the variability in natural signals more predictable. We therefore revisit and improve Contrastive Predictive Coding, an unsupervised objective for learning such representations. This new implementation produces features which support state-of-the-art linear classification accuracy on the ImageNet dataset. When used as input for non-linear classification with deep neural networks, this representation allows us to use 2-5x less labels than classifiers trained directly on image pixels. Finally, this unsupervised representation substantially improves transfer learning to object detection on the PASCAL VOC dataset, surpassing fully supervised pre-trained ImageNet classifiers. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 July, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 May, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2019. </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1905.06922">arXiv:1905.06922</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1905.06922">pdf</a>, <a href="https://arxiv.org/format/1905.06922">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> On Variational Bounds of Mutual Information </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Poole%2C+B">Ben Poole</a>, <a href="/search/cs?searchtype=author&amp;query=Ozair%2C+S">Sherjil Ozair</a>, <a href="/search/cs?searchtype=author&amp;query=Oord%2C+A+v+d">Aaron van den Oord</a>, <a href="/search/cs?searchtype=author&amp;query=Alemi%2C+A+A">Alexander A. Alemi</a>, <a href="/search/cs?searchtype=author&amp;query=Tucker%2C+G">George Tucker</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="1905.06922v1-abstract-full"> Estimating and optimizing Mutual Information (MI) is core to many problems in machine learning; however, bounding MI in high dimensions is challenging. To establish tractable and scalable objectives, recent work has turned to variational bounds parameterized by neural networks, but the relationships and tradeoffs between these bounds remains unclear. In this work, we unify these recent developments in a single framework. We find that the existing variational lower bounds degrade when the MI is large, exhibiting either high bias or high variance. To address this problem, we introduce a continuum of lower bounds that encompasses previous bounds and flexibly trades off bias and variance. On high-dimensional, controlled problems, we empirically characterize the bias and variance of the bounds and their gradients and demonstrate the effectiveness of our new bounds for estimation and representation learning. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 May, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICML 2019</span> </p> </li>
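<li> <p class="is-size-7 mathjax">For context, one representative member of the family of bounds studied in this line of work is the InfoNCE-style lower bound (notation assumed here: $f$ is a learned critic and $N$ the number of samples per batch):
$$ I(X;Y) \;\ge\; I_{\mathrm{NCE}} = \mathbb{E}\left[\frac{1}{N}\sum_{i=1}^{N}\log\frac{e^{f(x_i,y_i)}}{\frac{1}{N}\sum_{j=1}^{N}e^{f(x_i,y_j)}}\right], \qquad I_{\mathrm{NCE}} \le \log N. $$
Because the estimator is capped at $\log N$, it cannot track large values of mutual information without very large batches, which is one concrete form of the bias/variance trade-off the abstract refers to.</p> </li>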
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1903.11780">arXiv:1903.11780</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1903.11780">pdf</a>, <a href="https://arxiv.org/format/1903.11780">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Wasserstein Dependency Measure for Representation Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ozair%2C+S">Sherjil Ozair</a>, <a href="/search/cs?searchtype=author&amp;query=Lynch%2C+C">Corey Lynch</a>, <a href="/search/cs?searchtype=author&amp;query=Bengio%2C+Y">Yoshua Bengio</a>, <a href="/search/cs?searchtype=author&amp;query=Oord%2C+A+v+d">Aaron van den Oord</a>, <a href="/search/cs?searchtype=author&amp;query=Levine%2C+S">Sergey Levine</a>, <a href="/search/cs?searchtype=author&amp;query=Sermanet%2C+P">Pierre Sermanet</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="1903.11780v1-abstract-full"> Mutual information maximization has emerged as a powerful learning objective for unsupervised representation learning obtaining state-of-the-art performance in applications such as object recognition, speech recognition, and reinforcement learning. However, such approaches are fundamentally limited since a tight lower bound of mutual information requires sample size exponential in the mutual information. This limits the applicability of these approaches for prediction tasks with high mutual information, such as in video understanding or reinforcement learning. In these settings, such techniques are prone to overfit, both in theory and in practice, and capture only a few of the relevant factors of variation. This leads to incomplete representations that are not optimal for downstream tasks. In this work, we empirically demonstrate that mutual information-based representation learning approaches do fail to learn complete representations on a number of designed and real-world tasks. To mitigate these problems we introduce the Wasserstein dependency measure, which learns more complete representations by using the Wasserstein distance instead of the KL divergence in the mutual information estimator. We show that a practical approximation to this theoretically motivated solution, constructed using Lipschitz constraint techniques from the GAN literature, achieves substantially improved results on tasks where incomplete representations are a major challenge. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 March, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2019. </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1901.08810">arXiv:1901.08810</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1901.08810">pdf</a>, <a href="https://arxiv.org/format/1901.08810">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TASLP.2019.2938863">10.1109/TASLP.2019.2938863 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Unsupervised speech representation learning using WaveNet autoencoders </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chorowski%2C+J">Jan Chorowski</a>, <a href="/search/cs?searchtype=author&amp;query=Weiss%2C+R+J">Ron J. Weiss</a>, <a href="/search/cs?searchtype=author&amp;query=Bengio%2C+S">Samy Bengio</a>, <a href="/search/cs?searchtype=author&amp;query=Oord%2C+A+v+d">Aäron van den Oord</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="1901.08810v2-abstract-full"> We consider the task of unsupervised extraction of meaningful latent representations of speech by applying autoencoding neural networks to speech waveforms. The goal is to learn a representation able to capture high level semantic content from the signal, e.g. phoneme identities, while being invariant to confounding low level details in the signal such as the underlying pitch contour or background noise. Since the learned representation is tuned to contain only phonetic content, we resort to using a high capacity WaveNet decoder to infer information discarded by the encoder from previous samples. Moreover, the behavior of autoencoder models depends on the kind of constraint that is applied to the latent representation. We compare three variants: a simple dimensionality reduction bottleneck, a Gaussian Variational Autoencoder (VAE), and a discrete Vector Quantized VAE (VQ-VAE). We analyze the quality of learned representations in terms of speaker independence, the ability to predict phonetic content, and the ability to accurately reconstruct individual spectrogram frames. Moreover, for discrete encodings extracted using the VQ-VAE, we measure the ease of mapping them to phonemes. We introduce a regularization scheme that forces the representations to focus on the phonetic content of the utterance and report performance comparable with the top entries in the ZeroSpeech 2017 unsupervised acoustic unit discovery task. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 September, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 January, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to IEEE TASLP, final version available at http://dx.doi.org/10.1109/TASLP.2019.2938863</span> </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1901.03416">arXiv:1901.03416</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1901.03416">pdf</a>, <a href="https://arxiv.org/format/1901.03416">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Preventing Posterior Collapse with delta-VAEs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Razavi%2C+A">Ali Razavi</a>, <a href="/search/cs?searchtype=author&amp;query=Oord%2C+A+v+d">Aäron van den Oord</a>, <a href="/search/cs?searchtype=author&amp;query=Poole%2C+B">Ben Poole</a>, <a href="/search/cs?searchtype=author&amp;query=Vinyals%2C+O">Oriol Vinyals</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="1901.03416v1-abstract-full"> Due to the phenomenon of &#34;posterior collapse,&#34; current latent variable generative models pose a challenging design choice that either weakens the capacity of the decoder or requires augmenting the objective so it does not only maximize the likelihood of the data. In this paper, we propose an alternative that utilizes the most powerful generative models as decoders, whilst optimising the variational lower bound all while ensuring that the latent variables preserve and encode useful information. Our proposed $δ$-VAEs achieve this by constraining the variational family for the posterior to have a minimum distance to the prior. For sequential latent variable models, our approach resembles the classic representation learning approach of slow feature analysis. We demonstrate the efficacy of our approach at modeling text on LM1B and modeling images: learning representations, improving sample quality, and achieving state of the art log-likelihood on CIFAR-10 and ImageNet $32\times 32$. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 January, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2019. </p> </li>
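<li> <p class="is-size-7">A simplified numerical illustration of the committed-rate idea described above (not the paper's exact construction): with a standard normal prior and a posterior whose scale is fixed away from 1, the KL term has a strictly positive floor for every posterior mean, so the latent code cannot collapse onto the prior.</p>
<pre><code class="language-python">import numpy as np

def kl_gauss_vs_std_normal(mu, sigma):
    """KL( N(mu, sigma^2) || N(0, 1) ) per latent dimension."""
    return 0.5 * (sigma**2 + mu**2 - 1.0 - np.log(sigma**2))

sigma = 0.7                                        # assumed fixed posterior scale
mus = np.linspace(-3.0, 3.0, 7)
print(kl_gauss_vs_std_normal(mus, sigma))          # every value stays above the floor below
print(0.5 * (sigma**2 - 1.0 - np.log(sigma**2)))   # guaranteed minimum rate (the "delta")
</code></pre> </li>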
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1809.10460">arXiv:1809.10460</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1809.10460">pdf</a>, <a href="https://arxiv.org/format/1809.10460">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Sample Efficient Adaptive Text-to-Speech </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yutian Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Assael%2C+Y">Yannis Assael</a>, <a href="/search/cs?searchtype=author&amp;query=Shillingford%2C+B">Brendan Shillingford</a>, <a href="/search/cs?searchtype=author&amp;query=Budden%2C+D">David Budden</a>, <a href="/search/cs?searchtype=author&amp;query=Reed%2C+S">Scott Reed</a>, <a href="/search/cs?searchtype=author&amp;query=Zen%2C+H">Heiga Zen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Q">Quan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Cobo%2C+L+C">Luis C. Cobo</a>, <a href="/search/cs?searchtype=author&amp;query=Trask%2C+A">Andrew Trask</a>, <a href="/search/cs?searchtype=author&amp;query=Laurie%2C+B">Ben Laurie</a>, <a href="/search/cs?searchtype=author&amp;query=Gulcehre%2C+C">Caglar Gulcehre</a>, <a href="/search/cs?searchtype=author&amp;query=Oord%2C+A+v+d">Aäron van den Oord</a>, <a href="/search/cs?searchtype=author&amp;query=Vinyals%2C+O">Oriol Vinyals</a>, <a href="/search/cs?searchtype=author&amp;query=de+Freitas%2C+N">Nando de Freitas</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="1809.10460v3-abstract-full"> We present a meta-learning approach for adaptive text-to-speech (TTS) with few data. During training, we learn a multi-speaker model using a shared conditional WaveNet core and independent learned embeddings for each speaker. The aim of training is not to produce a neural network with fixed weights, which is then deployed as a TTS system. Instead, the aim is to produce a network that requires few data at deployment time to rapidly adapt to new speakers. We introduce and benchmark three strategies: (i) learning the speaker embedding while keeping the WaveNet core fixed, (ii) fine-tuning the entire architecture with stochastic gradient descent, and (iii) predicting the speaker embedding with a trained neural network encoder. The experiments show that these approaches are successful at adapting the multi-speaker neural network to new speakers, obtaining state-of-the-art results in both sample naturalness and voice similarity with merely a few minutes of audio data from new speakers. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 January, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 September, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2018. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICLR 2019</span> </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1807.03748">arXiv:1807.03748</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1807.03748">pdf</a>, <a href="https://arxiv.org/format/1807.03748">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Representation Learning with Contrastive Predictive Coding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Oord%2C+A+v+d">Aaron van den Oord</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yazhe Li</a>, <a href="/search/cs?searchtype=author&amp;query=Vinyals%2C+O">Oriol Vinyals</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="1807.03748v2-abstract-full"> While supervised learning has enabled great progress in many applications, unsupervised learning has not seen such widespread adoption, and remains an important and challenging endeavor for artificial intelligence. In this work, we propose a universal unsupervised learning approach to extract useful representations from high-dimensional data, which we call Contrastive Predictive Coding. The key insight of our model is to learn such representations by predicting the future in latent space by using powerful autoregressive models. We use a probabilistic contrastive loss which induces the latent space to capture information that is maximally useful to predict future samples. It also makes the model tractable by using negative sampling. While most prior work has focused on evaluating representations for a particular modality, we demonstrate that our approach is able to learn useful representations achieving strong performance on four distinct domains: speech, images, text and reinforcement learning in 3D environments. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 January, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 July, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2018. </p> </li>
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1807.03748v2-abstract-full').style.display = 'none'; document.getElementById('1807.03748v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 January, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 July, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2018. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1806.10474">arXiv:1806.10474</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1806.10474">pdf</a>, <a href="https://arxiv.org/format/1806.10474">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> The challenge of realistic music generation: modelling raw audio at scale </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Dieleman%2C+S">Sander Dieleman</a>, <a href="/search/cs?searchtype=author&amp;query=Oord%2C+A+v+d">A盲ron van den Oord</a>, <a href="/search/cs?searchtype=author&amp;query=Simonyan%2C+K">Karen Simonyan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1806.10474v1-abstract-short" style="display: inline;"> Realistic music generation is a challenging task. When building generative models of music that are learnt from data, typically high-level representations such as scores or MIDI are used that abstract away the idiosyncrasies of a particular performance. But these nuances are very important for our perception of musicality and realism, so in this work we embark on modelling music in the raw audio d&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1806.10474v1-abstract-full').style.display = 'inline'; document.getElementById('1806.10474v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1806.10474v1-abstract-full" style="display: none;"> Realistic music generation is a challenging task. When building generative models of music that are learnt from data, typically high-level representations such as scores or MIDI are used that abstract away the idiosyncrasies of a particular performance. But these nuances are very important for our perception of musicality and realism, so in this work we embark on modelling music in the raw audio domain. It has been shown that autoregressive models excel at generating raw audio waveforms of speech, but when applied to music, we find them biased towards capturing local signal structure at the expense of modelling long-range correlations. This is problematic because music exhibits structure at many different timescales. 
In this work, we explore autoregressive discrete autoencoders (ADAs) as a means to enable autoregressive models to capture long-range correlations in waveforms. We find that they allow us to unconditionally generate piano music directly in the raw audio domain, which shows stylistic consistency across tens of seconds. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1806.10474v1-abstract-full').style.display = 'none'; document.getElementById('1806.10474v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 June, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2018. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 2 figures, submitted to NIPS 2018</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1804.02476">arXiv:1804.02476</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1804.02476">pdf</a>, <a href="https://arxiv.org/format/1804.02476">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Associative Compression Networks for Representation Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Graves%2C+A">Alex Graves</a>, <a href="/search/cs?searchtype=author&amp;query=Menick%2C+J">Jacob Menick</a>, <a href="/search/cs?searchtype=author&amp;query=Oord%2C+A+v+d">Aaron van den Oord</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1804.02476v2-abstract-short" style="display: inline;"> This paper introduces Associative Compression Networks (ACNs), a new framework for variational autoencoding with neural networks. The system differs from existing variational autoencoders (VAEs) in that the prior distribution used to model each code is conditioned on a similar code from the dataset. In compression terms this equates to sequentially transmitting the dataset using an ordering determ&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1804.02476v2-abstract-full').style.display = 'inline'; document.getElementById('1804.02476v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1804.02476v2-abstract-full" style="display: none;"> This paper introduces Associative Compression Networks (ACNs), a new framework for variational autoencoding with neural networks. The system differs from existing variational autoencoders (VAEs) in that the prior distribution used to model each code is conditioned on a similar code from the dataset. In compression terms this equates to sequentially transmitting the dataset using an ordering determined by proximity in latent space. 
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1802.08435">arXiv:1802.08435</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1802.08435">pdf</a>, <a href="https://arxiv.org/format/1802.08435">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Efficient Neural Audio Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kalchbrenner%2C+N">Nal Kalchbrenner</a>, <a href="/search/cs?searchtype=author&amp;query=Elsen%2C+E">Erich Elsen</a>, <a href="/search/cs?searchtype=author&amp;query=Simonyan%2C+K">Karen Simonyan</a>, <a href="/search/cs?searchtype=author&amp;query=Noury%2C+S">Seb Noury</a>, <a href="/search/cs?searchtype=author&amp;query=Casagrande%2C+N">Norman Casagrande</a>, <a href="/search/cs?searchtype=author&amp;query=Lockhart%2C+E">Edward Lockhart</a>, <a href="/search/cs?searchtype=author&amp;query=Stimberg%2C+F">Florian Stimberg</a>, <a href="/search/cs?searchtype=author&amp;query=Oord%2C+A+v+d">Aaron van den Oord</a>, <a href="/search/cs?searchtype=author&amp;query=Dieleman%2C+S">Sander Dieleman</a>, <a href="/search/cs?searchtype=author&amp;query=Kavukcuoglu%2C+K">Koray Kavukcuoglu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="1802.08435v2-abstract-full"> Sequential models achieve state-of-the-art results in audio, visual and textual domains with respect to both estimating the data distribution and generating high-quality samples. Efficient sampling for this class of models has however remained an elusive problem. With a focus on text-to-speech synthesis, we describe a set of general techniques for reducing sampling time while maintaining high output quality. We first describe a single-layer recurrent neural network, the WaveRNN, with a dual softmax layer that matches the quality of the state-of-the-art WaveNet model. The compact form of the network makes it possible to generate 24kHz 16-bit audio 4x faster than real time on a GPU. Second, we apply a weight pruning technique to reduce the number of weights in the WaveRNN. We find that, for a constant number of parameters, large sparse networks perform better than small dense networks and this relationship holds for sparsity levels beyond 96%. The small number of weights in a Sparse WaveRNN makes it possible to sample high-fidelity audio on a mobile CPU in real time. Finally, we propose a new generation scheme based on subscaling that folds a long sequence into a batch of shorter sequences and allows one to generate multiple samples at once. The Subscale WaveRNN produces 16 samples per step without loss of quality and offers an orthogonal method for increasing sampling efficiency. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 June, 2018; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 February, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2018. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages</span> </p> </li>
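<li> <p class="is-size-7">The "dual softmax" mentioned above becomes concrete once a 16-bit sample is factorized into two 8-bit parts, so the network predicts two 256-way distributions instead of a single 65,536-way one. The snippet shows the obvious coarse/fine byte split; treat it as an illustration rather than the paper's exact formulation.</p>
<pre><code class="language-python">import numpy as np

samples = np.array([0, 1, 255, 256, 65535], dtype=np.uint16)   # unsigned 16-bit audio values
coarse = samples // 256            # top 8 bits: target of the first 256-way softmax
fine = samples % 256               # bottom 8 bits: target of the second 256-way softmax
restored = coarse.astype(np.uint32) * 256 + fine
assert np.array_equal(restored, samples)
print(coarse, fine)
</code></pre> </li>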
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1802.05666">arXiv:1802.05666</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1802.05666">pdf</a>, <a href="https://arxiv.org/format/1802.05666">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Adversarial Risk and the Dangers of Evaluating Against Weak Attacks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Uesato%2C+J">Jonathan Uesato</a>, <a href="/search/cs?searchtype=author&amp;query=O%27Donoghue%2C+B">Brendan O&#39;Donoghue</a>, <a href="/search/cs?searchtype=author&amp;query=Oord%2C+A+v+d">Aaron van den Oord</a>, <a href="/search/cs?searchtype=author&amp;query=Kohli%2C+P">Pushmeet Kohli</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1802.05666v2-abstract-short" style="display: inline;"> This paper investigates recently proposed approaches for defending against adversarial examples and evaluating adversarial robustness. We motivate &#39;adversarial risk&#39; as an objective for achieving models robust to worst-case inputs. We then frame commonly used attacks and evaluation metrics as defining a tractable surrogate objective to the true adversarial risk. This suggests that models may optim&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1802.05666v2-abstract-full').style.display = 'inline'; document.getElementById('1802.05666v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1802.05666v2-abstract-full" style="display: none;"> This paper investigates recently proposed approaches for defending against adversarial examples and evaluating adversarial robustness. We motivate &#39;adversarial risk&#39; as an objective for achieving models robust to worst-case inputs. We then frame commonly used attacks and evaluation metrics as defining a tractable surrogate objective to the true adversarial risk. This suggests that models may optimize this surrogate rather than the true adversarial risk. We formalize this notion as &#39;obscurity to an adversary,&#39; and develop tools and heuristics for identifying obscured models and designing transparent models. We demonstrate that this is a significant problem in practice by repurposing gradient-free optimization techniques into adversarial attacks, which we use to decrease the accuracy of several recently proposed defenses to near zero. Our hope is that our formulations and results will help researchers to develop more powerful defenses. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1802.05666v2-abstract-full').style.display = 'none'; document.getElementById('1802.05666v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 June, 2018; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 February, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2018. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1711.10433">arXiv:1711.10433</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1711.10433">pdf</a>, <a href="https://arxiv.org/format/1711.10433">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Parallel WaveNet: Fast High-Fidelity Speech Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Oord%2C+A+v+d">Aaron van den Oord</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yazhe Li</a>, <a href="/search/cs?searchtype=author&amp;query=Babuschkin%2C+I">Igor Babuschkin</a>, <a href="/search/cs?searchtype=author&amp;query=Simonyan%2C+K">Karen Simonyan</a>, <a href="/search/cs?searchtype=author&amp;query=Vinyals%2C+O">Oriol Vinyals</a>, <a href="/search/cs?searchtype=author&amp;query=Kavukcuoglu%2C+K">Koray Kavukcuoglu</a>, <a href="/search/cs?searchtype=author&amp;query=Driessche%2C+G+v+d">George van den Driessche</a>, <a href="/search/cs?searchtype=author&amp;query=Lockhart%2C+E">Edward Lockhart</a>, <a href="/search/cs?searchtype=author&amp;query=Cobo%2C+L+C">Luis C. Cobo</a>, <a href="/search/cs?searchtype=author&amp;query=Stimberg%2C+F">Florian Stimberg</a>, <a href="/search/cs?searchtype=author&amp;query=Casagrande%2C+N">Norman Casagrande</a>, <a href="/search/cs?searchtype=author&amp;query=Grewe%2C+D">Dominik Grewe</a>, <a href="/search/cs?searchtype=author&amp;query=Noury%2C+S">Seb Noury</a>, <a href="/search/cs?searchtype=author&amp;query=Dieleman%2C+S">Sander Dieleman</a>, <a href="/search/cs?searchtype=author&amp;query=Elsen%2C+E">Erich Elsen</a>, <a href="/search/cs?searchtype=author&amp;query=Kalchbrenner%2C+N">Nal Kalchbrenner</a>, <a href="/search/cs?searchtype=author&amp;query=Zen%2C+H">Heiga Zen</a>, <a href="/search/cs?searchtype=author&amp;query=Graves%2C+A">Alex Graves</a>, <a href="/search/cs?searchtype=author&amp;query=King%2C+H">Helen King</a>, <a href="/search/cs?searchtype=author&amp;query=Walters%2C+T">Tom Walters</a>, <a href="/search/cs?searchtype=author&amp;query=Belov%2C+D">Dan Belov</a>, <a href="/search/cs?searchtype=author&amp;query=Hassabis%2C+D">Demis Hassabis</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1711.10433v1-abstract-short" style="display: inline;"> The recently-developed WaveNet architecture is the current state of the art in realistic speech synthesis, consistently rated as more natural sounding for many different languages than any previous system. 
Abstract: The recently-developed WaveNet architecture is the current state of the art in realistic speech synthesis, consistently rated as more natural sounding for many different languages than any previous system. However, because WaveNet relies on sequential generation of one audio sample at a time, it is poorly suited to today's massively parallel computers, and therefore hard to deploy in a real-time production setting. This paper introduces Probability Density Distillation, a new method for training a parallel feed-forward network from a trained WaveNet with no significant difference in quality. The resulting system is capable of generating high-fidelity speech samples more than 20 times faster than real time, and is deployed online by Google Assistant, including serving multiple English and Japanese voices.
Submitted 28 November, 2017; originally announced November 2017.
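
Probability Density Distillation fits the parallel student to the autoregressive teacher by minimizing a KL divergence estimated from the student's own samples. The toy sketch below is ours: Gaussians stand in for the real WaveNet teacher and feed-forward student, and only the Monte Carlo KL estimate that such training would drive toward zero is shown.

```python
import numpy as np

def gaussian_logpdf(x, mean, std):
    return -0.5 * np.log(2 * np.pi * std**2) - 0.5 * ((x - mean) / std) ** 2

def distillation_kl(student_mean, student_std, teacher_mean, teacher_std,
                    n_samples=10000, rng=None):
    """Monte Carlo estimate of KL(student || teacher):
    E_{x ~ student}[log q_student(x) - log p_teacher(x)]."""
    rng = np.random.default_rng(rng)
    x = rng.normal(student_mean, student_std, size=n_samples)  # sample the student
    log_q = gaussian_logpdf(x, student_mean, student_std)
    log_p = gaussian_logpdf(x, teacher_mean, teacher_std)
    return np.mean(log_q - log_p)

if __name__ == "__main__":
    # a well-distilled student (left) vs. a poorly matched one (right)
    print(distillation_kl(0.0, 1.0, 0.0, 1.0, rng=0),
          distillation_kl(0.5, 2.0, 0.0, 1.0, rng=0))
```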

arXiv:1711.00937 (https://arxiv.org/abs/1711.00937) [pdf, other]
Subjects: cs.LG (Machine Learning)
Title: Neural Discrete Representation Learning
Authors: Aaron van den Oord, Oriol Vinyals, Koray Kavukcuoglu
Abstract: Learning useful representations without supervision remains a key challenge in machine learning. In this paper, we propose a simple yet powerful generative model that learns such discrete representations. Our model, the Vector Quantised-Variational AutoEncoder (VQ-VAE), differs from VAEs in two key ways: the encoder network outputs discrete, rather than continuous, codes; and the prior is learnt rather than static. In order to learn a discrete latent representation, we incorporate ideas from vector quantisation (VQ). Using the VQ method allows the model to circumvent issues of "posterior collapse" -- where the latents are ignored when they are paired with a powerful autoregressive decoder -- typically observed in the VAE framework. Pairing these representations with an autoregressive prior, the model can generate high quality images, videos, and speech as well as doing high quality speaker conversion and unsupervised learning of phonemes, providing further evidence of the utility of the learnt representations.
Submitted 30 May, 2018; v1 submitted 2 November, 2017; originally announced November 2017.
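
The vector-quantisation step at the heart of the VQ-VAE maps each continuous encoder output to its nearest codebook vector, and the decoder sees only those discrete codes. Below is a minimal numpy sketch of the lookup (forward pass only; shapes and names are ours, and the straight-through gradient copy used in training is only noted in the comments).

```python
import numpy as np

def vector_quantize(z_e, codebook):
    """Map each encoder output z_e[i] (D-dim) to the nearest codebook row.

    z_e:      (N, D) continuous encoder outputs
    codebook: (K, D) learned embedding vectors
    returns:  (indices with shape (N,), quantised vectors with shape (N, D))
    """
    # squared L2 distance between every encoder output and every code
    d = ((z_e[:, None, :] - codebook[None, :, :]) ** 2).sum(-1)   # (N, K)
    idx = d.argmin(axis=1)        # discrete codes
    z_q = codebook[idx]           # quantised latents fed to the decoder
    # In training, gradients are copied straight through from z_q to z_e,
    # while the codebook is pulled toward the encoder outputs; numpy alone
    # cannot express that part.
    return idx, z_q

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    codebook = rng.normal(size=(8, 4))    # K=8 codes of dimension 4
    z_e = rng.normal(size=(5, 4))         # 5 encoder outputs
    idx, z_q = vector_quantize(z_e, codebook)
    print(idx, np.allclose(z_q, codebook[idx]))
```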

arXiv:1710.10304 (https://arxiv.org/abs/1710.10304) [pdf, other]
Subjects: cs.NE (Neural and Evolutionary Computing); cs.CV (Computer Vision and Pattern Recognition)
Title: Few-shot Autoregressive Density Estimation: Towards Learning to Learn Distributions
Authors: Scott Reed, Yutian Chen, Thomas Paine, Aäron van den Oord, S. M. Ali Eslami, Danilo Rezende, Oriol Vinyals, Nando de Freitas
Abstract: Deep autoregressive models have shown state-of-the-art performance in density estimation for natural images on large-scale datasets such as ImageNet. However, such models require many thousands of gradient-based weight updates and unique image examples for training. Ideally, the models would rapidly learn visual concepts from only a handful of examples, similar to the manner in which humans learn across many vision tasks. In this paper, we show how 1) neural attention and 2) meta learning techniques can be used in combination with autoregressive models to enable effective few-shot density estimation. Our proposed modifications to PixelCNN result in state-of-the-art few-shot density estimation on the Omniglot dataset. Furthermore, we visualize the learned attention policy and find that it learns intuitive algorithms for simple tasks such as image mirroring on ImageNet and handwriting on Omniglot without supervision. Finally, we extend the model to natural images and demonstrate few-shot image generation on the Stanford Online Products dataset.
Submitted 28 February, 2018; v1 submitted 27 October, 2017; originally announced October 2017.

arXiv:1703.03664 (https://arxiv.org/abs/1703.03664) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.NE (Neural and Evolutionary Computing)
Title: Parallel Multiscale Autoregressive Density Estimation
Authors: Scott Reed, Aäron van den Oord, Nal Kalchbrenner, Sergio Gómez Colmenarejo, Ziyu Wang, Dan Belov, Nando de Freitas
Abstract: PixelCNN achieves state-of-the-art results in density estimation for natural images. Although training is fast, inference is costly, requiring one network evaluation per pixel; O(N) for N pixels. This can be sped up by caching activations, but still involves generating each pixel sequentially. In this work, we propose a parallelized PixelCNN that allows more efficient inference by modeling certain pixel groups as conditionally independent. Our new PixelCNN model achieves competitive density estimation and orders of magnitude speedup - O(log N) sampling instead of O(N) - enabling the practical generation of 512x512 images. We evaluate the model on class-conditional image generation, text-to-image synthesis, and action-conditional video generation, showing that our model achieves the best results among non-pixel-autoregressive density models that allow efficient sampling.
Submitted 10 March, 2017; originally announced March 2017.
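
One way to picture how conditionally independent pixel groups give O(log N) sampling is a coarse-to-fine schedule in which each step doubles the resolution and every pixel added at that step is sampled in parallel. The grouping below is a generic schedule of our own for illustration, not the paper's exact network or pixel ordering.

```python
def multiscale_groups(size):
    """Group the pixels of a size x size image so that group g contains the
    pixels newly introduced when the resolution doubles at step g. All pixels
    within a group may be sampled in parallel, so the number of sequential
    steps grows logarithmically with the pixel count."""
    assert size & (size - 1) == 0, "size must be a power of two"
    seen, groups, res = set(), [], 1
    while res <= size:
        stride = size // res
        group = [(r, c) for r in range(0, size, stride)
                         for c in range(0, size, stride)
                         if (r, c) not in seen]
        seen.update(group)
        groups.append(group)
        res *= 2
    return groups

if __name__ == "__main__":
    for g, pixels in enumerate(multiscale_groups(8)):
        print(f"step {g}: {len(pixels)} pixels sampled in parallel")
    # an 8x8 image: 64 pixels generated in 4 steps (1 + 3 + 12 + 48)
```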
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1703.03664v1-abstract-full').style.display = 'none'; document.getElementById('1703.03664v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 March, 2017; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2017. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1703.01310">arXiv:1703.01310</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1703.01310">pdf</a>, <a href="https://arxiv.org/format/1703.01310">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Count-Based Exploration with Neural Density Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ostrovski%2C+G">Georg Ostrovski</a>, <a href="/search/cs?searchtype=author&amp;query=Bellemare%2C+M+G">Marc G. Bellemare</a>, <a href="/search/cs?searchtype=author&amp;query=Oord%2C+A+v+d">Aaron van den Oord</a>, <a href="/search/cs?searchtype=author&amp;query=Munos%2C+R">Remi Munos</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1703.01310v2-abstract-short" style="display: inline;"> Bellemare et al. (2016) introduced the notion of a pseudo-count, derived from a density model, to generalize count-based exploration to non-tabular reinforcement learning. This pseudo-count was used to generate an exploration bonus for a DQN agent and combined with a mixed Monte Carlo update was sufficient to achieve state of the art on the Atari 2600 game Montezuma&#39;s Revenge. We consider two ques&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1703.01310v2-abstract-full').style.display = 'inline'; document.getElementById('1703.01310v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1703.01310v2-abstract-full" style="display: none;"> Bellemare et al. (2016) introduced the notion of a pseudo-count, derived from a density model, to generalize count-based exploration to non-tabular reinforcement learning. This pseudo-count was used to generate an exploration bonus for a DQN agent and combined with a mixed Monte Carlo update was sufficient to achieve state of the art on the Atari 2600 game Montezuma&#39;s Revenge. We consider two questions left open by their work: First, how important is the quality of the density model for exploration? Second, what role does the Monte Carlo update play in exploration? We answer the first question by demonstrating the use of PixelCNN, an advanced neural density model for images, to supply a pseudo-count. In particular, we examine the intrinsic difficulties in adapting Bellemare et al.&#39;s approach when assumptions about the model are violated. The result is a more practical and general algorithm requiring no special apparatus. We combine PixelCNN pseudo-counts with different agent architectures to dramatically improve the state of the art on several hard Atari games. 
One surprising finding is that the mixed Monte Carlo update is a powerful facilitator of exploration in the sparsest of settings, including Montezuma's Revenge.
Submitted 14 June, 2017; v1 submitted 3 March, 2017; originally announced March 2017.
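
The pseudo-count of Bellemare et al. (2016), which this paper supplies with a PixelCNN, is computed from the density model's probability of a state before and after the model is updated on that state; the exploration bonus then decays with the count. A minimal sketch, assuming a `density_model` with `prob` and `update` methods (a placeholder interface of ours, with an illustrative constant in place of a tuned bonus scale):

```python
import math

def pseudo_count_bonus(density_model, state, scale=0.05):
    """Pseudo-count N = rho * (1 - rho') / (rho' - rho), where rho is the
    model's probability of `state` before updating on it and rho' the
    'recoding' probability afterwards; the bonus decays as 1/sqrt(N)."""
    rho = density_model.prob(state)         # prediction probability
    density_model.update(state)             # one learning step on this state
    rho_prime = density_model.prob(state)   # recoding probability
    if rho_prime <= rho:                    # model gained nothing: no bonus
        return 0.0
    n_hat = rho * (1.0 - rho_prime) / (rho_prime - rho)
    return scale / math.sqrt(n_hat + 0.01)

class CountingModel:
    """Toy tabular density model used only to exercise the function above."""
    def __init__(self):
        self.counts, self.total = {}, 0
    def prob(self, s):
        return (self.counts.get(s, 0) + 1) / (self.total + 2)   # smoothed
    def update(self, s):
        self.counts[s] = self.counts.get(s, 0) + 1
        self.total += 1

if __name__ == "__main__":
    m = CountingModel()
    print([round(pseudo_count_bonus(m, "room_1"), 3) for _ in range(5)])
```

With the toy counting model, repeated visits to the same state drive the bonus down, mirroring an ordinary visit count.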

arXiv:1610.10099 (https://arxiv.org/abs/1610.10099) [pdf, other]
Subjects: cs.CL (Computation and Language); cs.LG (Machine Learning)
Title: Neural Machine Translation in Linear Time
Authors: Nal Kalchbrenner, Lasse Espeholt, Karen Simonyan, Aaron van den Oord, Alex Graves, Koray Kavukcuoglu
Abstract: We present a novel neural network for processing sequences. The ByteNet is a one-dimensional convolutional neural network that is composed of two parts, one to encode the source sequence and the other to decode the target sequence. The two network parts are connected by stacking the decoder on top of the encoder and preserving the temporal resolution of the sequences. To address the differing lengths of the source and the target, we introduce an efficient mechanism by which the decoder is dynamically unfolded over the representation of the encoder. The ByteNet uses dilation in the convolutional layers to increase its receptive field. The resulting network has two core properties: it runs in time that is linear in the length of the sequences and it sidesteps the need for excessive memorization. The ByteNet decoder attains state-of-the-art performance on character-level language modelling and outperforms the previous best results obtained with recurrent networks. The ByteNet also achieves state-of-the-art performance on character-to-character machine translation on the English-to-German WMT translation task, surpassing comparable neural translation models that are based on recurrent networks with attentional pooling and run in quadratic time. We find that the latent alignment structure contained in the representations reflects the expected alignment between the tokens.
Submitted 15 March, 2017; v1 submitted 31 October, 2016; originally announced October 2016.
Comments: 9 pages
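
Dilation is what lets a convolutional sequence model such as the ByteNet cover long contexts in linear time: stacking layers whose dilation doubles at each level grows the receptive field exponentially with depth. A small numpy sketch of a 1-D causal dilated convolution and the resulting receptive field (our own illustration, not the ByteNet implementation):

```python
import numpy as np

def causal_dilated_conv1d(x, weights, dilation):
    """y[t] = sum_k weights[k] * x[t - k * dilation], zero-padded on the
    left so that no output depends on future inputs."""
    k = len(weights)
    pad = (k - 1) * dilation
    xp = np.concatenate([np.zeros(pad), np.asarray(x, dtype=float)])
    return np.array([
        sum(weights[j] * xp[pad + t - j * dilation] for j in range(k))
        for t in range(len(x))
    ])

def receptive_field(kernel_size, dilations):
    """Receptive field of a stack of causal dilated convolutions."""
    return 1 + sum((kernel_size - 1) * d for d in dilations)

if __name__ == "__main__":
    x = np.arange(8, dtype=float)
    print(causal_dilated_conv1d(x, weights=[0.5, 0.5], dilation=2))
    print(receptive_field(kernel_size=2, dilations=[1, 2, 4, 8, 16]))  # 32
```

With kernel size 2 and dilations 1, 2, 4, 8, 16, five layers already see the last 32 positions.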

arXiv:1610.00527 (https://arxiv.org/abs/1610.00527) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.LG (Machine Learning)
Title: Video Pixel Networks
Authors: Nal Kalchbrenner, Aaron van den Oord, Karen Simonyan, Ivo Danihelka, Oriol Vinyals, Alex Graves, Koray Kavukcuoglu
Abstract: We propose a probabilistic video model, the Video Pixel Network (VPN), that estimates the discrete joint distribution of the raw pixel values in a video. The model and the neural architecture reflect the time, space and color structure of video tensors and encode it as a four-dimensional dependency chain. The VPN approaches the best possible performance on the Moving MNIST benchmark, a leap over the previous state of the art, and the generated videos show only minor deviations from the ground truth. The VPN also produces detailed samples on the action-conditional Robotic Pushing benchmark and generalizes to the motion of novel objects.
Submitted 3 October, 2016; originally announced October 2016.
Comments: 16 pages

arXiv:1609.03499 (https://arxiv.org/abs/1609.03499) [pdf, other]
Subjects: cs.SD (Sound); cs.LG (Machine Learning)
Title: WaveNet: A Generative Model for Raw Audio
Authors: Aaron van den Oord, Sander Dieleman, Heiga Zen, Karen Simonyan, Oriol Vinyals, Alex Graves, Nal Kalchbrenner, Andrew Senior, Koray Kavukcuoglu
Abstract: This paper introduces WaveNet, a deep neural network for generating raw audio waveforms. The model is fully probabilistic and autoregressive, with the predictive distribution for each audio sample conditioned on all previous ones; nonetheless we show that it can be efficiently trained on data with tens of thousands of samples per second of audio. When applied to text-to-speech, it yields state-of-the-art performance, with human listeners rating it as significantly more natural sounding than the best parametric and concatenative systems for both English and Mandarin. A single WaveNet can capture the characteristics of many different speakers with equal fidelity, and can switch between them by conditioning on the speaker identity. When trained to model music, we find that it generates novel and often highly realistic musical fragments. We also show that it can be employed as a discriminative model, returning promising results for phoneme recognition.
Submitted 19 September, 2016; v1 submitted 12 September, 2016; originally announced September 2016.
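
The published WaveNet models 8-bit mu-law companded audio with a categorical output, drawing each sample from a distribution conditioned on everything generated so far. The sketch below shows mu-law quantisation and a generic autoregressive sampling loop; `predict_distribution` is a stand-in of ours for the real network.

```python
import numpy as np

def mu_law_encode(x, mu=255):
    """Map waveform values in [-1, 1] to 256 discrete levels (mu-law)."""
    y = np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)
    return ((y + 1) / 2 * mu + 0.5).astype(np.int64)    # 0..255

def mu_law_decode(q, mu=255):
    y = 2 * (q.astype(float) / mu) - 1
    return np.sign(y) * ((1 + mu) ** np.abs(y) - 1) / mu

def sample_autoregressively(predict_distribution, n_samples, rng=None):
    """Draw each sample from a categorical distribution conditioned on all
    previously generated samples."""
    rng = np.random.default_rng(rng)
    history = []
    for _ in range(n_samples):
        probs = predict_distribution(history)            # shape (256,)
        history.append(rng.choice(256, p=probs))
    return np.array(history)

if __name__ == "__main__":
    x = np.sin(np.linspace(0, 2 * np.pi, 16))
    q = mu_law_encode(x)
    print(np.max(np.abs(mu_law_decode(q) - x)) < 0.05)   # coarse reconstruction
    uniform = lambda hist: np.full(256, 1 / 256)         # stand-in for the network
    print(sample_autoregressively(uniform, 5, rng=0))
```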

arXiv:1606.05328 (https://arxiv.org/abs/1606.05328) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.LG (Machine Learning)
Title: Conditional Image Generation with PixelCNN Decoders
Authors: Aaron van den Oord, Nal Kalchbrenner, Oriol Vinyals, Lasse Espeholt, Alex Graves, Koray Kavukcuoglu
Abstract: This work explores conditional image generation with a new image density model based on the PixelCNN architecture. The model can be conditioned on any vector, including descriptive labels or tags, or latent embeddings created by other networks. When conditioned on class labels from the ImageNet database, the model is able to generate diverse, realistic scenes representing distinct animals, objects, landscapes and structures. When conditioned on an embedding produced by a convolutional network given a single image of an unseen face, it generates a variety of new portraits of the same person with different facial expressions, poses and lighting conditions. We also show that conditional PixelCNN can serve as a powerful decoder in an image autoencoder. Additionally, the gated convolutional layers in the proposed model improve the log-likelihood of PixelCNN to match the state-of-the-art performance of PixelRNN on ImageNet, with greatly reduced computational cost.
Submitted 18 June, 2016; v1 submitted 16 June, 2016; originally announced June 2016.
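
The gated convolutional layers mentioned above replace the rectifier of the original PixelCNN with a multiplicative gate, and the conditioning vector enters both halves of the gate: y = tanh(W_f * x + V_f h) * sigma(W_g * x + V_g h). A small numpy sketch of that activation, taking the masked-convolution outputs as given (a simplification of ours):

```python
import numpy as np

def sigmoid(a):
    return 1.0 / (1.0 + np.exp(-a))

def gated_activation(conv_f, conv_g, h=None, V_f=None, V_g=None):
    """Gated unit of the conditional PixelCNN:
       y = tanh(conv_f + V_f h) * sigmoid(conv_g + V_g h),
    where conv_f / conv_g are outputs of the masked convolutions and h is an
    optional conditioning vector (e.g. a class-label embedding)."""
    if h is not None:
        conv_f = conv_f + V_f @ h
        conv_g = conv_g + V_g @ h
    return np.tanh(conv_f) * sigmoid(conv_g)

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    conv_f, conv_g = rng.normal(size=16), rng.normal(size=16)  # flattened feature maps
    h = rng.normal(size=4)                                     # conditioning vector
    V_f, V_g = rng.normal(size=(16, 4)), rng.normal(size=(16, 4))
    print(gated_activation(conv_f, conv_g, h, V_f, V_g).shape)
```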
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1606.05328v2-abstract-full').style.display = 'none'; document.getElementById('1606.05328v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 June, 2016; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 June, 2016; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2016. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1601.06759">arXiv:1601.06759</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1601.06759">pdf</a>, <a href="https://arxiv.org/format/1601.06759">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> Pixel Recurrent Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Oord%2C+A+v+d">Aaron van den Oord</a>, <a href="/search/cs?searchtype=author&amp;query=Kalchbrenner%2C+N">Nal Kalchbrenner</a>, <a href="/search/cs?searchtype=author&amp;query=Kavukcuoglu%2C+K">Koray Kavukcuoglu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1601.06759v3-abstract-short" style="display: inline;"> Modeling the distribution of natural images is a landmark problem in unsupervised learning. This task requires an image model that is at once expressive, tractable and scalable. We present a deep neural network that sequentially predicts the pixels in an image along the two spatial dimensions. Our method models the discrete probability of the raw pixel values and encodes the complete set of depend&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1601.06759v3-abstract-full').style.display = 'inline'; document.getElementById('1601.06759v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1601.06759v3-abstract-full" style="display: none;"> Modeling the distribution of natural images is a landmark problem in unsupervised learning. This task requires an image model that is at once expressive, tractable and scalable. We present a deep neural network that sequentially predicts the pixels in an image along the two spatial dimensions. Our method models the discrete probability of the raw pixel values and encodes the complete set of dependencies in the image. Architectural novelties include fast two-dimensional recurrent layers and an effective use of residual connections in deep recurrent networks. We achieve log-likelihood scores on natural images that are considerably better than the previous state of the art. Our main results also provide benchmarks on the diverse ImageNet dataset. Samples generated from the model appear crisp, varied and globally coherent. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1601.06759v3-abstract-full').style.display = 'none'; document.getElementById('1601.06759v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 August, 2016; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 January, 2016; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2016. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1511.01844">arXiv:1511.01844</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1511.01844">pdf</a>, <a href="https://arxiv.org/format/1511.01844">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> A note on the evaluation of generative models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Theis%2C+L">Lucas Theis</a>, <a href="/search/cs?searchtype=author&amp;query=Oord%2C+A+v+d">A盲ron van den Oord</a>, <a href="/search/cs?searchtype=author&amp;query=Bethge%2C+M">Matthias Bethge</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1511.01844v3-abstract-short" style="display: inline;"> Probabilistic generative models can be used for compression, denoising, inpainting, texture synthesis, semi-supervised learning, unsupervised feature learning, and other tasks. Given this wide range of applications, it is not surprising that a lot of heterogeneity exists in the way these models are formulated, trained, and evaluated. As a consequence, direct comparison between models is often diff&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1511.01844v3-abstract-full').style.display = 'inline'; document.getElementById('1511.01844v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1511.01844v3-abstract-full" style="display: none;"> Probabilistic generative models can be used for compression, denoising, inpainting, texture synthesis, semi-supervised learning, unsupervised feature learning, and other tasks. Given this wide range of applications, it is not surprising that a lot of heterogeneity exists in the way these models are formulated, trained, and evaluated. As a consequence, direct comparison between models is often difficult. This article reviews mostly known but often underappreciated properties relating to the evaluation and interpretation of generative models with a focus on image models. In particular, we show that three of the currently most commonly used criteria---average log-likelihood, Parzen window estimates, and visual fidelity of samples---are largely independent of each other when the data is high-dimensional. Good performance with respect to one criterion therefore need not imply good performance with respect to the other criteria. 
Our results show that extrapolation from one criterion to another is not warranted and generative models need to be evaluated directly with respect to the application(s) they were intended for. In addition, we provide examples demonstrating that Parzen window estimates should generally be avoided.
Submitted 24 April, 2016; v1 submitted 5 November, 2015; originally announced November 2015.
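
A Parzen window estimate, one of the three criteria discussed (and the one the note recommends avoiding), scores a test point under a kernel density estimate built from model samples. A minimal numpy version of our own, for illustration:

```python
import numpy as np

def parzen_log_likelihood(test_points, model_samples, sigma):
    """log p_hat(x) = logsumexp_i N(x; s_i, sigma^2 I) - log(num samples),
    i.e. an isotropic-Gaussian kernel density estimate built from samples."""
    x = np.asarray(test_points)[:, None, :]      # (T, 1, D)
    s = np.asarray(model_samples)[None, :, :]    # (1, M, D)
    d2 = ((x - s) ** 2).sum(-1)                  # squared distances, (T, M)
    D = x.shape[-1]
    log_kernel = -0.5 * d2 / sigma**2 - 0.5 * D * np.log(2 * np.pi * sigma**2)
    # numerically stable log-sum-exp over the samples
    m = log_kernel.max(axis=1, keepdims=True)
    log_p = m.squeeze(1) + np.log(np.exp(log_kernel - m).sum(axis=1))
    return log_p - np.log(s.shape[1])

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    samples = rng.normal(size=(500, 2))          # stand-in 'model' samples
    test = rng.normal(size=(5, 2))
    print(parzen_log_likelihood(test, samples, sigma=0.2))
```

The estimate depends strongly on the kernel width `sigma` and on the number of samples, one reason it can be a misleading proxy for the true log-likelihood.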

arXiv:1506.01911 (https://arxiv.org/abs/1506.01911) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence); cs.LG (Machine Learning); cs.NE (Neural and Evolutionary Computing); stat.ML (Machine Learning)
Title: Beyond Temporal Pooling: Recurrence and Temporal Convolutions for Gesture Recognition in Video
Authors: Lionel Pigou, Aäron van den Oord, Sander Dieleman, Mieke Van Herreweghe, Joni Dambre
Abstract: Recent studies have demonstrated the power of recurrent neural networks for machine translation, image captioning and speech recognition. For the task of capturing temporal structure in video, however, there still remain numerous open research questions. Current research suggests using a simple temporal feature pooling strategy to take into account the temporal aspect of video. We demonstrate that this method is not sufficient for gesture recognition, where temporal information is more discriminative compared to general video classification tasks. We explore deep architectures for gesture recognition in video and propose a new end-to-end trainable neural network architecture incorporating temporal convolutions and bidirectional recurrence. Our main contributions are twofold; first, we show that recurrence is crucial for this task; second, we show that adding temporal convolutions leads to significant improvements. We evaluate the different approaches on the Montalbano gesture recognition dataset, where we achieve state-of-the-art results.
Submitted 10 February, 2016; v1 submitted 5 June, 2015; originally announced June 2015.
href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

Pages: 1 2 3 4 5 6 7 8 9 10