Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–27 of 27 results for author: <span class="mathjax">Kalchbrenner, N</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Kalchbrenner%2C+N">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Kalchbrenner, N"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Kalchbrenner%2C+N&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Kalchbrenner, N"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.06079">arXiv:2306.06079</a> <span> [<a href="https://arxiv.org/pdf/2306.06079">pdf</a>, <a href="https://arxiv.org/format/2306.06079">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Atmospheric and Oceanic Physics">physics.ao-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Deep Learning for Day Forecasts from Sparse Observations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Andrychowicz%2C+M">Marcin Andrychowicz</a>, <a href="/search/cs?searchtype=author&query=Espeholt%2C+L">Lasse Espeholt</a>, <a href="/search/cs?searchtype=author&query=Li%2C+D">Di Li</a>, <a href="/search/cs?searchtype=author&query=Merchant%2C+S">Samier Merchant</a>, <a href="/search/cs?searchtype=author&query=Merose%2C+A">Alexander Merose</a>, <a href="/search/cs?searchtype=author&query=Zyda%2C+F">Fred Zyda</a>, <a href="/search/cs?searchtype=author&query=Agrawal%2C+S">Shreya Agrawal</a>, <a href="/search/cs?searchtype=author&query=Kalchbrenner%2C+N">Nal Kalchbrenner</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.06079v3-abstract-short" style="display: inline;"> Deep neural networks offer an alternative paradigm for modeling weather conditions. The ability of neural models to make a prediction in less than a second once the data is available and to do so with very high temporal and spatial resolution, and the ability to learn directly from atmospheric observations, are just some of these models' unique advantages. Neural models trained using atmospheric o… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.06079v3-abstract-full').style.display = 'inline'; document.getElementById('2306.06079v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.06079v3-abstract-full" style="display: none;"> Deep neural networks offer an alternative paradigm for modeling weather conditions. The ability of neural models to make a prediction in less than a second once the data is available and to do so with very high temporal and spatial resolution, and the ability to learn directly from atmospheric observations, are just some of these models' unique advantages. Neural models trained using atmospheric observations, the highest fidelity and lowest latency data, have to date achieved good performance only up to twelve hours of lead time when compared with state-of-the-art probabilistic Numerical Weather Prediction models and only for the sole variable of precipitation. In this paper, we present MetNet-3 that extends significantly both the lead time range and the variables that an observation based neural model can predict well. MetNet-3 learns from both dense and sparse data sensors and makes predictions up to 24 hours ahead for precipitation, wind, temperature and dew point. MetNet-3 introduces a key densification technique that implicitly captures data assimilation and produces spatially dense forecasts in spite of the network training on extremely sparse targets. MetNet-3 has a high temporal and spatial resolution of, respectively, up to 2 minutes and 1 km as well as a low operational latency. We find that MetNet-3 is able to outperform the best single- and multi-member NWPs such as HRRR and ENS over the CONUS region for up to 24 hours ahead setting a new performance milestone for observation based neural models. MetNet-3 is operational and its forecasts are served in Google Search in conjunction with other models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.06079v3-abstract-full').style.display = 'none'; document.getElementById('2306.06079v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2203.04946">arXiv:2203.04946</a> <span> [<a href="https://arxiv.org/pdf/2203.04946">pdf</a>, <a href="https://arxiv.org/format/2203.04946">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Do better ImageNet classifiers assess perceptual similarity better? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kumar%2C+M">Manoj Kumar</a>, <a href="/search/cs?searchtype=author&query=Houlsby%2C+N">Neil Houlsby</a>, <a href="/search/cs?searchtype=author&query=Kalchbrenner%2C+N">Nal Kalchbrenner</a>, <a href="/search/cs?searchtype=author&query=Cubuk%2C+E+D">Ekin D. Cubuk</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2203.04946v3-abstract-short" style="display: inline;"> Perceptual distances between images, as measured in the space of pre-trained deep features, have outperformed prior low-level, pixel-based metrics on assessing perceptual similarity. While the capabilities of older and less accurate models such as AlexNet and VGG to capture perceptual similarity are well known, modern and more accurate models are less studied. In this paper, we present a large-sca… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.04946v3-abstract-full').style.display = 'inline'; document.getElementById('2203.04946v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2203.04946v3-abstract-full" style="display: none;"> Perceptual distances between images, as measured in the space of pre-trained deep features, have outperformed prior low-level, pixel-based metrics on assessing perceptual similarity. While the capabilities of older and less accurate models such as AlexNet and VGG to capture perceptual similarity are well known, modern and more accurate models are less studied. In this paper, we present a large-scale empirical study to assess how well ImageNet classifiers perform on perceptual similarity. First, we observe a inverse correlation between ImageNet accuracy and Perceptual Scores of modern networks such as ResNets, EfficientNets, and Vision Transformers: that is better classifiers achieve worse Perceptual Scores. Then, we examine the ImageNet accuracy/Perceptual Score relationship on varying the depth, width, number of training steps, weight decay, label smoothing, and dropout. Higher accuracy improves Perceptual Score up to a certain point, but we uncover a Pareto frontier between accuracies and Perceptual Score in the mid-to-high accuracy regime. We explore this relationship further using a number of plausible hypotheses such as distortion invariance, spatial frequency sensitivity, and alternative perceptual functions. Interestingly we discover shallow ResNets and ResNets trained for less than 5 epochs only on ImageNet, whose emergent Perceptual Score matches the prior best networks trained directly on supervised human perceptual judgements. The checkpoints for the models in our study are available at https://console.cloud.google.com/storage/browser/gresearch/perceptual_similarity. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.04946v3-abstract-full').style.display = 'none'; document.getElementById('2203.04946v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">TMLR 2022 (https://openreview.net/forum?id=qrGKGZZvH0)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2111.07470">arXiv:2111.07470</a> <span> [<a href="https://arxiv.org/pdf/2111.07470">pdf</a>, <a href="https://arxiv.org/format/2111.07470">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Atmospheric and Oceanic Physics">physics.ao-ph</span> </div> </div> <p class="title is-5 mathjax"> Skillful Twelve Hour Precipitation Forecasts using Large Context Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Espeholt%2C+L">Lasse Espeholt</a>, <a href="/search/cs?searchtype=author&query=Agrawal%2C+S">Shreya Agrawal</a>, <a href="/search/cs?searchtype=author&query=S%C3%B8nderby%2C+C">Casper S酶nderby</a>, <a href="/search/cs?searchtype=author&query=Kumar%2C+M">Manoj Kumar</a>, <a href="/search/cs?searchtype=author&query=Heek%2C+J">Jonathan Heek</a>, <a href="/search/cs?searchtype=author&query=Bromberg%2C+C">Carla Bromberg</a>, <a href="/search/cs?searchtype=author&query=Gazen%2C+C">Cenk Gazen</a>, <a href="/search/cs?searchtype=author&query=Hickey%2C+J">Jason Hickey</a>, <a href="/search/cs?searchtype=author&query=Bell%2C+A">Aaron Bell</a>, <a href="/search/cs?searchtype=author&query=Kalchbrenner%2C+N">Nal Kalchbrenner</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2111.07470v1-abstract-short" style="display: inline;"> The problem of forecasting weather has been scientifically studied for centuries due to its high impact on human lives, transportation, food production and energy management, among others. Current operational forecasting models are based on physics and use supercomputers to simulate the atmosphere to make forecasts hours and days in advance. Better physics-based forecasts require improvements in t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.07470v1-abstract-full').style.display = 'inline'; document.getElementById('2111.07470v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2111.07470v1-abstract-full" style="display: none;"> The problem of forecasting weather has been scientifically studied for centuries due to its high impact on human lives, transportation, food production and energy management, among others. Current operational forecasting models are based on physics and use supercomputers to simulate the atmosphere to make forecasts hours and days in advance. Better physics-based forecasts require improvements in the models themselves, which can be a substantial scientific challenge, as well as improvements in the underlying resolution, which can be computationally prohibitive. An emerging class of weather models based on neural networks represents a paradigm shift in weather forecasting: the models learn the required transformations from data instead of relying on hand-coded physics and are computationally efficient. For neural models, however, each additional hour of lead time poses a substantial challenge as it requires capturing ever larger spatial contexts and increases the uncertainty of the prediction. In this work, we present a neural network that is capable of large-scale precipitation forecasting up to twelve hours ahead and, starting from the same atmospheric state, the model achieves greater skill than the state-of-the-art physics-based models HRRR and HREF that currently operate in the Continental United States. Interpretability analyses reinforce the observation that the model learns to emulate advanced physics principles. These results represent a substantial step towards establishing a new paradigm of efficient forecasting with neural networks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.07470v1-abstract-full').style.display = 'none'; document.getElementById('2111.07470v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">34 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2106.06080">arXiv:2106.06080</a> <span> [<a href="https://arxiv.org/pdf/2106.06080">pdf</a>, <a href="https://arxiv.org/format/2106.06080">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Gradual Domain Adaptation in the Wild:When Intermediate Distributions are Absent </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Abnar%2C+S">Samira Abnar</a>, <a href="/search/cs?searchtype=author&query=Berg%2C+R+v+d">Rianne van den Berg</a>, <a href="/search/cs?searchtype=author&query=Ghiasi%2C+G">Golnaz Ghiasi</a>, <a href="/search/cs?searchtype=author&query=Dehghani%2C+M">Mostafa Dehghani</a>, <a href="/search/cs?searchtype=author&query=Kalchbrenner%2C+N">Nal Kalchbrenner</a>, <a href="/search/cs?searchtype=author&query=Sedghi%2C+H">Hanie Sedghi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2106.06080v2-abstract-short" style="display: inline;"> We focus on the problem of domain adaptation when the goal is shifting the model towards the target distribution, rather than learning domain invariant representations. It has been shown that under the following two assumptions: (a) access to samples from intermediate distributions, and (b) samples being annotated with the amount of change from the source distribution, self-training can be success… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.06080v2-abstract-full').style.display = 'inline'; document.getElementById('2106.06080v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2106.06080v2-abstract-full" style="display: none;"> We focus on the problem of domain adaptation when the goal is shifting the model towards the target distribution, rather than learning domain invariant representations. It has been shown that under the following two assumptions: (a) access to samples from intermediate distributions, and (b) samples being annotated with the amount of change from the source distribution, self-training can be successfully applied on gradually shifted samples to adapt the model toward the target distribution. We hypothesize having (a) is enough to enable iterative self-training to slowly adapt the model to the target distribution, by making use of an implicit curriculum. In the case where (a) does not hold, we observe that iterative self-training falls short. We propose GIFT, a method that creates virtual samples from intermediate distributions by interpolating representations of examples from source and target domains. We evaluate an iterative-self-training method on datasets with natural distribution shifts, and show that when applied on top of other domain adaptation methods, it improves the performance of the model on the target dataset. We run an analysis on a synthetic dataset to show that in the presence of (a) iterative-self-training naturally forms a curriculum of samples. Furthermore, we show that when (a) does not hold, GIFT performs better than iterative self-training. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.06080v2-abstract-full').style.display = 'none'; document.getElementById('2106.06080v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 July, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2102.11107">arXiv:2102.11107</a> <span> [<a href="https://arxiv.org/pdf/2102.11107">pdf</a>, <a href="https://arxiv.org/format/2102.11107">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Towards Causal Representation Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sch%C3%B6lkopf%2C+B">Bernhard Sch枚lkopf</a>, <a href="/search/cs?searchtype=author&query=Locatello%2C+F">Francesco Locatello</a>, <a href="/search/cs?searchtype=author&query=Bauer%2C+S">Stefan Bauer</a>, <a href="/search/cs?searchtype=author&query=Ke%2C+N+R">Nan Rosemary Ke</a>, <a href="/search/cs?searchtype=author&query=Kalchbrenner%2C+N">Nal Kalchbrenner</a>, <a href="/search/cs?searchtype=author&query=Goyal%2C+A">Anirudh Goyal</a>, <a href="/search/cs?searchtype=author&query=Bengio%2C+Y">Yoshua Bengio</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2102.11107v1-abstract-short" style="display: inline;"> The two fields of machine learning and graphical causality arose and developed separately. However, there is now cross-pollination and increasing interest in both fields to benefit from the advances of the other. In the present paper, we review fundamental concepts of causal inference and relate them to crucial open problems of machine learning, including transfer and generalization, thereby assay… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.11107v1-abstract-full').style.display = 'inline'; document.getElementById('2102.11107v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2102.11107v1-abstract-full" style="display: none;"> The two fields of machine learning and graphical causality arose and developed separately. However, there is now cross-pollination and increasing interest in both fields to benefit from the advances of the other. In the present paper, we review fundamental concepts of causal inference and relate them to crucial open problems of machine learning, including transfer and generalization, thereby assaying how causality can contribute to modern machine learning research. This also applies in the opposite direction: we note that most work in causality starts from the premise that the causal variables are given. A central problem for AI and causality is, thus, causal representation learning, the discovery of high-level causal variables from low-level observations. Finally, we delineate some implications of causality for machine learning and propose key research areas at the intersection of both communities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.11107v1-abstract-full').style.display = 'none'; document.getElementById('2102.11107v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 February, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Special Issue of Proceedings of the IEEE - Advances in Machine Learning and Deep Neural Networks</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2102.04432">arXiv:2102.04432</a> <span> [<a href="https://arxiv.org/pdf/2102.04432">pdf</a>, <a href="https://arxiv.org/format/2102.04432">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Colorization Transformer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kumar%2C+M">Manoj Kumar</a>, <a href="/search/cs?searchtype=author&query=Weissenborn%2C+D">Dirk Weissenborn</a>, <a href="/search/cs?searchtype=author&query=Kalchbrenner%2C+N">Nal Kalchbrenner</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2102.04432v2-abstract-short" style="display: inline;"> We present the Colorization Transformer, a novel approach for diverse high fidelity image colorization based on self-attention. Given a grayscale image, the colorization proceeds in three steps. We first use a conditional autoregressive transformer to produce a low resolution coarse coloring of the grayscale image. Our architecture adopts conditional transformer layers to effectively condition gra… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.04432v2-abstract-full').style.display = 'inline'; document.getElementById('2102.04432v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2102.04432v2-abstract-full" style="display: none;"> We present the Colorization Transformer, a novel approach for diverse high fidelity image colorization based on self-attention. Given a grayscale image, the colorization proceeds in three steps. We first use a conditional autoregressive transformer to produce a low resolution coarse coloring of the grayscale image. Our architecture adopts conditional transformer layers to effectively condition grayscale input. Two subsequent fully parallel networks upsample the coarse colored low resolution image into a finely colored high resolution image. Sampling from the Colorization Transformer produces diverse colorings whose fidelity outperforms the previous state-of-the-art on colorising ImageNet based on FID results and based on a human evaluation in a Mechanical Turk test. Remarkably, in more than 60% of cases human evaluators prefer the highest rated among three generated colorings over the ground truth. The code and pre-trained checkpoints for Colorization Transformer are publicly available at https://github.com/google-research/google-research/tree/master/coltran <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.04432v2-abstract-full').style.display = 'none'; document.getElementById('2102.04432v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 March, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 February, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICLR 2021 Camera Ready. See https://openreview.net/forum?id=5NA1PinlGFu for more details</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2008.01160">arXiv:2008.01160</a> <span> [<a href="https://arxiv.org/pdf/2008.01160">pdf</a>, <a href="https://arxiv.org/format/2008.01160">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> A Spectral Energy Distance for Parallel Speech Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gritsenko%2C+A+A">Alexey A. Gritsenko</a>, <a href="/search/cs?searchtype=author&query=Salimans%2C+T">Tim Salimans</a>, <a href="/search/cs?searchtype=author&query=Berg%2C+R+v+d">Rianne van den Berg</a>, <a href="/search/cs?searchtype=author&query=Snoek%2C+J">Jasper Snoek</a>, <a href="/search/cs?searchtype=author&query=Kalchbrenner%2C+N">Nal Kalchbrenner</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2008.01160v2-abstract-short" style="display: inline;"> Speech synthesis is an important practical generative modeling problem that has seen great progress over the last few years, with likelihood-based autoregressive neural models now outperforming traditional concatenative systems. A downside of such autoregressive models is that they require executing tens of thousands of sequential operations per second of generated audio, making them ill-suited fo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2008.01160v2-abstract-full').style.display = 'inline'; document.getElementById('2008.01160v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2008.01160v2-abstract-full" style="display: none;"> Speech synthesis is an important practical generative modeling problem that has seen great progress over the last few years, with likelihood-based autoregressive neural models now outperforming traditional concatenative systems. A downside of such autoregressive models is that they require executing tens of thousands of sequential operations per second of generated audio, making them ill-suited for deployment on specialized deep learning hardware. Here, we propose a new learning method that allows us to train highly parallel models of speech, without requiring access to an analytical likelihood function. Our approach is based on a generalized energy distance between the distributions of the generated and real audio. This spectral energy distance is a proper scoring rule with respect to the distribution over magnitude-spectrograms of the generated waveform audio and offers statistical consistency guarantees. The distance can be calculated from minibatches without bias, and does not involve adversarial learning, yielding a stable and consistent method for training implicit generative models. Empirically, we achieve state-of-the-art generation quality among implicit generative models, as judged by the recently-proposed cFDSD metric. When combining our method with adversarial techniques, we also improve upon the recently-proposed GAN-TTS model in terms of Mean Opinion Score as judged by trained human evaluators. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2008.01160v2-abstract-full').style.display = 'none'; document.getElementById('2008.01160v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 October, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 August, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2020. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2004.03705">arXiv:2004.03705</a> <span> [<a href="https://arxiv.org/pdf/2004.03705">pdf</a>, <a href="https://arxiv.org/format/2004.03705">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Deep Learning Based Text Classification: A Comprehensive Review </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Minaee%2C+S">Shervin Minaee</a>, <a href="/search/cs?searchtype=author&query=Kalchbrenner%2C+N">Nal Kalchbrenner</a>, <a href="/search/cs?searchtype=author&query=Cambria%2C+E">Erik Cambria</a>, <a href="/search/cs?searchtype=author&query=Nikzad%2C+N">Narjes Nikzad</a>, <a href="/search/cs?searchtype=author&query=Chenaghlu%2C+M">Meysam Chenaghlu</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+J">Jianfeng Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2004.03705v3-abstract-short" style="display: inline;"> Deep learning based models have surpassed classical machine learning based approaches in various text classification tasks, including sentiment analysis, news categorization, question answering, and natural language inference. In this paper, we provide a comprehensive review of more than 150 deep learning based models for text classification developed in recent years, and discuss their technical c… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2004.03705v3-abstract-full').style.display = 'inline'; document.getElementById('2004.03705v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2004.03705v3-abstract-full" style="display: none;"> Deep learning based models have surpassed classical machine learning based approaches in various text classification tasks, including sentiment analysis, news categorization, question answering, and natural language inference. In this paper, we provide a comprehensive review of more than 150 deep learning based models for text classification developed in recent years, and discuss their technical contributions, similarities, and strengths. We also provide a summary of more than 40 popular datasets widely used for text classification. Finally, we provide a quantitative analysis of the performance of different deep learning models on popular benchmarks, and discuss future research directions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2004.03705v3-abstract-full').style.display = 'none'; document.getElementById('2004.03705v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 January, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 April, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2020. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2003.12140">arXiv:2003.12140</a> <span> [<a href="https://arxiv.org/pdf/2003.12140">pdf</a>, <a href="https://arxiv.org/format/2003.12140">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Atmospheric and Oceanic Physics">physics.ao-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> MetNet: A Neural Weather Model for Precipitation Forecasting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=S%C3%B8nderby%2C+C+K">Casper Kaae S酶nderby</a>, <a href="/search/cs?searchtype=author&query=Espeholt%2C+L">Lasse Espeholt</a>, <a href="/search/cs?searchtype=author&query=Heek%2C+J">Jonathan Heek</a>, <a href="/search/cs?searchtype=author&query=Dehghani%2C+M">Mostafa Dehghani</a>, <a href="/search/cs?searchtype=author&query=Oliver%2C+A">Avital Oliver</a>, <a href="/search/cs?searchtype=author&query=Salimans%2C+T">Tim Salimans</a>, <a href="/search/cs?searchtype=author&query=Agrawal%2C+S">Shreya Agrawal</a>, <a href="/search/cs?searchtype=author&query=Hickey%2C+J">Jason Hickey</a>, <a href="/search/cs?searchtype=author&query=Kalchbrenner%2C+N">Nal Kalchbrenner</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2003.12140v2-abstract-short" style="display: inline;"> Weather forecasting is a long standing scientific challenge with direct social and economic impact. The task is suitable for deep neural networks due to vast amounts of continuously collected data and a rich spatial and temporal structure that presents long range dependencies. We introduce MetNet, a neural network that forecasts precipitation up to 8 hours into the future at the high spatial resol… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2003.12140v2-abstract-full').style.display = 'inline'; document.getElementById('2003.12140v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2003.12140v2-abstract-full" style="display: none;"> Weather forecasting is a long standing scientific challenge with direct social and economic impact. The task is suitable for deep neural networks due to vast amounts of continuously collected data and a rich spatial and temporal structure that presents long range dependencies. We introduce MetNet, a neural network that forecasts precipitation up to 8 hours into the future at the high spatial resolution of 1 km$^2$ and at the temporal resolution of 2 minutes with a latency in the order of seconds. MetNet takes as input radar and satellite data and forecast lead time and produces a probabilistic precipitation map. The architecture uses axial self-attention to aggregate the global context from a large input patch corresponding to a million square kilometers. We evaluate the performance of MetNet at various precipitation thresholds and find that MetNet outperforms Numerical Weather Prediction at forecasts of up to 7 to 8 hours on the scale of the continental United States. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2003.12140v2-abstract-full').style.display = 'none'; document.getElementById('2003.12140v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 March, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 March, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2020. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1912.12180">arXiv:1912.12180</a> <span> [<a href="https://arxiv.org/pdf/1912.12180">pdf</a>, <a href="https://arxiv.org/format/1912.12180">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Axial Attention in Multidimensional Transformers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ho%2C+J">Jonathan Ho</a>, <a href="/search/cs?searchtype=author&query=Kalchbrenner%2C+N">Nal Kalchbrenner</a>, <a href="/search/cs?searchtype=author&query=Weissenborn%2C+D">Dirk Weissenborn</a>, <a href="/search/cs?searchtype=author&query=Salimans%2C+T">Tim Salimans</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1912.12180v1-abstract-short" style="display: inline;"> We propose Axial Transformers, a self-attention-based autoregressive model for images and other data organized as high dimensional tensors. Existing autoregressive models either suffer from excessively large computational resource requirements for high dimensional data, or make compromises in terms of distribution expressiveness or ease of implementation in order to decrease resource requirements.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1912.12180v1-abstract-full').style.display = 'inline'; document.getElementById('1912.12180v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1912.12180v1-abstract-full" style="display: none;"> We propose Axial Transformers, a self-attention-based autoregressive model for images and other data organized as high dimensional tensors. Existing autoregressive models either suffer from excessively large computational resource requirements for high dimensional data, or make compromises in terms of distribution expressiveness or ease of implementation in order to decrease resource requirements. Our architecture, by contrast, maintains both full expressiveness over joint distributions over data and ease of implementation with standard deep learning frameworks, while requiring reasonable memory and computation and achieving state-of-the-art results on standard generative modeling benchmarks. Our models are based on axial attention, a simple generalization of self-attention that naturally aligns with the multiple dimensions of the tensors in both the encoding and the decoding settings. Notably the proposed structure of the layers allows for the vast majority of the context to be computed in parallel during decoding without introducing any independence assumptions. This semi-parallel structure goes a long way to making decoding from even a very large Axial Transformer broadly applicable. We demonstrate state-of-the-art results for the Axial Transformer on the ImageNet-32 and ImageNet-64 image benchmarks as well as on the BAIR Robotic Pushing video benchmark. We open source the implementation of Axial Transformers. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1912.12180v1-abstract-full').style.display = 'none'; document.getElementById('1912.12180v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 December, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1908.03491">arXiv:1908.03491</a> <span> [<a href="https://arxiv.org/pdf/1908.03491">pdf</a>, <a href="https://arxiv.org/format/1908.03491">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Bayesian Inference for Large Scale Image Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Heek%2C+J">Jonathan Heek</a>, <a href="/search/cs?searchtype=author&query=Kalchbrenner%2C+N">Nal Kalchbrenner</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1908.03491v1-abstract-short" style="display: inline;"> Bayesian inference promises to ground and improve the performance of deep neural networks. It promises to be robust to overfitting, to simplify the training procedure and the space of hyperparameters, and to provide a calibrated measure of uncertainty that can enhance decision making, agent exploration and prediction fairness. Markov Chain Monte Carlo (MCMC) methods enable Bayesian inference by ge… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1908.03491v1-abstract-full').style.display = 'inline'; document.getElementById('1908.03491v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1908.03491v1-abstract-full" style="display: none;"> Bayesian inference promises to ground and improve the performance of deep neural networks. It promises to be robust to overfitting, to simplify the training procedure and the space of hyperparameters, and to provide a calibrated measure of uncertainty that can enhance decision making, agent exploration and prediction fairness. Markov Chain Monte Carlo (MCMC) methods enable Bayesian inference by generating samples from the posterior distribution over model parameters. Despite the theoretical advantages of Bayesian inference and the similarity between MCMC and optimization methods, the performance of sampling methods has so far lagged behind optimization methods for large scale deep learning tasks. We aim to fill this gap and introduce ATMC, an adaptive noise MCMC algorithm that estimates and is able to sample from the posterior of a neural network. ATMC dynamically adjusts the amount of momentum and noise applied to each parameter update in order to compensate for the use of stochastic gradients. We use a ResNet architecture without batch normalization to test ATMC on the Cifar10 benchmark and the large scale ImageNet benchmark and show that, despite the absence of batch normalization, ATMC outperforms a strong optimization baseline in terms of both classification accuracy and test log-likelihood. We show that ATMC is intrinsically robust to overfitting on the training data and that ATMC provides a better calibrated measure of uncertainty compared to the optimization baseline. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1908.03491v1-abstract-full').style.display = 'none'; document.getElementById('1908.03491v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 August, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2019. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1812.01608">arXiv:1812.01608</a> <span> [<a href="https://arxiv.org/pdf/1812.01608">pdf</a>, <a href="https://arxiv.org/format/1812.01608">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Generating High Fidelity Images with Subscale Pixel Networks and Multidimensional Upscaling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Menick%2C+J">Jacob Menick</a>, <a href="/search/cs?searchtype=author&query=Kalchbrenner%2C+N">Nal Kalchbrenner</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1812.01608v1-abstract-short" style="display: inline;"> The unconditional generation of high fidelity images is a longstanding benchmark for testing the performance of image decoders. Autoregressive image models have been able to generate small images unconditionally, but the extension of these methods to large images where fidelity can be more readily assessed has remained an open problem. Among the major challenges are the capacity to encode the vast… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1812.01608v1-abstract-full').style.display = 'inline'; document.getElementById('1812.01608v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1812.01608v1-abstract-full" style="display: none;"> The unconditional generation of high fidelity images is a longstanding benchmark for testing the performance of image decoders. Autoregressive image models have been able to generate small images unconditionally, but the extension of these methods to large images where fidelity can be more readily assessed has remained an open problem. Among the major challenges are the capacity to encode the vast previous context and the sheer difficulty of learning a distribution that preserves both global semantic coherence and exactness of detail. To address the former challenge, we propose the Subscale Pixel Network (SPN), a conditional decoder architecture that generates an image as a sequence of sub-images of equal size. The SPN compactly captures image-wide spatial dependencies and requires a fraction of the memory and the computation required by other fully autoregressive models. To address the latter challenge, we propose to use Multidimensional Upscaling to grow an image in both size and depth via intermediate stages utilising distinct SPNs. We evaluate SPNs on the unconditional generation of CelebAHQ of size 256 and of ImageNet from size 32 to 256. We achieve state-of-the-art likelihood results in multiple settings, set up new benchmark results in previously unexplored settings and are able to generate very high fidelity large scale samples on the basis of both datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1812.01608v1-abstract-full').style.display = 'none'; document.getElementById('1812.01608v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 December, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2018. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1803.07416">arXiv:1803.07416</a> <span> [<a href="https://arxiv.org/pdf/1803.07416">pdf</a>, <a href="https://arxiv.org/format/1803.07416">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Tensor2Tensor for Neural Machine Translation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Vaswani%2C+A">Ashish Vaswani</a>, <a href="/search/cs?searchtype=author&query=Bengio%2C+S">Samy Bengio</a>, <a href="/search/cs?searchtype=author&query=Brevdo%2C+E">Eugene Brevdo</a>, <a href="/search/cs?searchtype=author&query=Chollet%2C+F">Francois Chollet</a>, <a href="/search/cs?searchtype=author&query=Gomez%2C+A+N">Aidan N. Gomez</a>, <a href="/search/cs?searchtype=author&query=Gouws%2C+S">Stephan Gouws</a>, <a href="/search/cs?searchtype=author&query=Jones%2C+L">Llion Jones</a>, <a href="/search/cs?searchtype=author&query=Kaiser%2C+%C5%81">艁ukasz Kaiser</a>, <a href="/search/cs?searchtype=author&query=Kalchbrenner%2C+N">Nal Kalchbrenner</a>, <a href="/search/cs?searchtype=author&query=Parmar%2C+N">Niki Parmar</a>, <a href="/search/cs?searchtype=author&query=Sepassi%2C+R">Ryan Sepassi</a>, <a href="/search/cs?searchtype=author&query=Shazeer%2C+N">Noam Shazeer</a>, <a href="/search/cs?searchtype=author&query=Uszkoreit%2C+J">Jakob Uszkoreit</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1803.07416v1-abstract-short" style="display: inline;"> Tensor2Tensor is a library for deep learning models that is well-suited for neural machine translation and includes the reference implementation of the state-of-the-art Transformer model. </span> <span class="abstract-full has-text-grey-dark mathjax" id="1803.07416v1-abstract-full" style="display: none;"> Tensor2Tensor is a library for deep learning models that is well-suited for neural machine translation and includes the reference implementation of the state-of-the-art Transformer model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1803.07416v1-abstract-full').style.display = 'none'; document.getElementById('1803.07416v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 March, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2018. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: text overlap with arXiv:1706.03762</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1802.08435">arXiv:1802.08435</a> <span> [<a href="https://arxiv.org/pdf/1802.08435">pdf</a>, <a href="https://arxiv.org/format/1802.08435">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Efficient Neural Audio Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kalchbrenner%2C+N">Nal Kalchbrenner</a>, <a href="/search/cs?searchtype=author&query=Elsen%2C+E">Erich Elsen</a>, <a href="/search/cs?searchtype=author&query=Simonyan%2C+K">Karen Simonyan</a>, <a href="/search/cs?searchtype=author&query=Noury%2C+S">Seb Noury</a>, <a href="/search/cs?searchtype=author&query=Casagrande%2C+N">Norman Casagrande</a>, <a href="/search/cs?searchtype=author&query=Lockhart%2C+E">Edward Lockhart</a>, <a href="/search/cs?searchtype=author&query=Stimberg%2C+F">Florian Stimberg</a>, <a href="/search/cs?searchtype=author&query=Oord%2C+A+v+d">Aaron van den Oord</a>, <a href="/search/cs?searchtype=author&query=Dieleman%2C+S">Sander Dieleman</a>, <a href="/search/cs?searchtype=author&query=Kavukcuoglu%2C+K">Koray Kavukcuoglu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1802.08435v2-abstract-short" style="display: inline;"> Sequential models achieve state-of-the-art results in audio, visual and textual domains with respect to both estimating the data distribution and generating high-quality samples. Efficient sampling for this class of models has however remained an elusive problem. With a focus on text-to-speech synthesis, we describe a set of general techniques for reducing sampling time while maintaining high outp… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1802.08435v2-abstract-full').style.display = 'inline'; document.getElementById('1802.08435v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1802.08435v2-abstract-full" style="display: none;"> Sequential models achieve state-of-the-art results in audio, visual and textual domains with respect to both estimating the data distribution and generating high-quality samples. Efficient sampling for this class of models has however remained an elusive problem. With a focus on text-to-speech synthesis, we describe a set of general techniques for reducing sampling time while maintaining high output quality. We first describe a single-layer recurrent neural network, the WaveRNN, with a dual softmax layer that matches the quality of the state-of-the-art WaveNet model. The compact form of the network makes it possible to generate 24kHz 16-bit audio 4x faster than real time on a GPU. Second, we apply a weight pruning technique to reduce the number of weights in the WaveRNN. We find that, for a constant number of parameters, large sparse networks perform better than small dense networks and this relationship holds for sparsity levels beyond 96%. The small number of weights in a Sparse WaveRNN makes it possible to sample high-fidelity audio on a mobile CPU in real time. Finally, we propose a new generation scheme based on subscaling that folds a long sequence into a batch of shorter sequences and allows one to generate multiple samples at once. The Subscale WaveRNN produces 16 samples per step without loss of quality and offers an orthogonal method for increasing sampling efficiency. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1802.08435v2-abstract-full').style.display = 'none'; document.getElementById('1802.08435v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 June, 2018; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 February, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2018. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1711.10433">arXiv:1711.10433</a> <span> [<a href="https://arxiv.org/pdf/1711.10433">pdf</a>, <a href="https://arxiv.org/format/1711.10433">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Parallel WaveNet: Fast High-Fidelity Speech Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Oord%2C+A+v+d">Aaron van den Oord</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yazhe Li</a>, <a href="/search/cs?searchtype=author&query=Babuschkin%2C+I">Igor Babuschkin</a>, <a href="/search/cs?searchtype=author&query=Simonyan%2C+K">Karen Simonyan</a>, <a href="/search/cs?searchtype=author&query=Vinyals%2C+O">Oriol Vinyals</a>, <a href="/search/cs?searchtype=author&query=Kavukcuoglu%2C+K">Koray Kavukcuoglu</a>, <a href="/search/cs?searchtype=author&query=Driessche%2C+G+v+d">George van den Driessche</a>, <a href="/search/cs?searchtype=author&query=Lockhart%2C+E">Edward Lockhart</a>, <a href="/search/cs?searchtype=author&query=Cobo%2C+L+C">Luis C. Cobo</a>, <a href="/search/cs?searchtype=author&query=Stimberg%2C+F">Florian Stimberg</a>, <a href="/search/cs?searchtype=author&query=Casagrande%2C+N">Norman Casagrande</a>, <a href="/search/cs?searchtype=author&query=Grewe%2C+D">Dominik Grewe</a>, <a href="/search/cs?searchtype=author&query=Noury%2C+S">Seb Noury</a>, <a href="/search/cs?searchtype=author&query=Dieleman%2C+S">Sander Dieleman</a>, <a href="/search/cs?searchtype=author&query=Elsen%2C+E">Erich Elsen</a>, <a href="/search/cs?searchtype=author&query=Kalchbrenner%2C+N">Nal Kalchbrenner</a>, <a href="/search/cs?searchtype=author&query=Zen%2C+H">Heiga Zen</a>, <a href="/search/cs?searchtype=author&query=Graves%2C+A">Alex Graves</a>, <a href="/search/cs?searchtype=author&query=King%2C+H">Helen King</a>, <a href="/search/cs?searchtype=author&query=Walters%2C+T">Tom Walters</a>, <a href="/search/cs?searchtype=author&query=Belov%2C+D">Dan Belov</a>, <a href="/search/cs?searchtype=author&query=Hassabis%2C+D">Demis Hassabis</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1711.10433v1-abstract-short" style="display: inline;"> The recently-developed WaveNet architecture is the current state of the art in realistic speech synthesis, consistently rated as more natural sounding for many different languages than any previous system. However, because WaveNet relies on sequential generation of one audio sample at a time, it is poorly suited to today's massively parallel computers, and therefore hard to deploy in a real-time p… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1711.10433v1-abstract-full').style.display = 'inline'; document.getElementById('1711.10433v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1711.10433v1-abstract-full" style="display: none;"> The recently-developed WaveNet architecture is the current state of the art in realistic speech synthesis, consistently rated as more natural sounding for many different languages than any previous system. However, because WaveNet relies on sequential generation of one audio sample at a time, it is poorly suited to today's massively parallel computers, and therefore hard to deploy in a real-time production setting. This paper introduces Probability Density Distillation, a new method for training a parallel feed-forward network from a trained WaveNet with no significant difference in quality. The resulting system is capable of generating high-fidelity speech samples at more than 20 times faster than real-time, and is deployed online by Google Assistant, including serving multiple English and Japanese voices. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1711.10433v1-abstract-full').style.display = 'none'; document.getElementById('1711.10433v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 November, 2017; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2017. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1703.03664">arXiv:1703.03664</a> <span> [<a href="https://arxiv.org/pdf/1703.03664">pdf</a>, <a href="https://arxiv.org/format/1703.03664">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> Parallel Multiscale Autoregressive Density Estimation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Reed%2C+S">Scott Reed</a>, <a href="/search/cs?searchtype=author&query=Oord%2C+A+v+d">A盲ron van den Oord</a>, <a href="/search/cs?searchtype=author&query=Kalchbrenner%2C+N">Nal Kalchbrenner</a>, <a href="/search/cs?searchtype=author&query=Colmenarejo%2C+S+G">Sergio G贸mez Colmenarejo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Ziyu Wang</a>, <a href="/search/cs?searchtype=author&query=Belov%2C+D">Dan Belov</a>, <a href="/search/cs?searchtype=author&query=de+Freitas%2C+N">Nando de Freitas</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1703.03664v1-abstract-short" style="display: inline;"> PixelCNN achieves state-of-the-art results in density estimation for natural images. Although training is fast, inference is costly, requiring one network evaluation per pixel; O(N) for N pixels. This can be sped up by caching activations, but still involves generating each pixel sequentially. In this work, we propose a parallelized PixelCNN that allows more efficient inference by modeling certain… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1703.03664v1-abstract-full').style.display = 'inline'; document.getElementById('1703.03664v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1703.03664v1-abstract-full" style="display: none;"> PixelCNN achieves state-of-the-art results in density estimation for natural images. Although training is fast, inference is costly, requiring one network evaluation per pixel; O(N) for N pixels. This can be sped up by caching activations, but still involves generating each pixel sequentially. In this work, we propose a parallelized PixelCNN that allows more efficient inference by modeling certain pixel groups as conditionally independent. Our new PixelCNN model achieves competitive density estimation and orders of magnitude speedup - O(log N) sampling instead of O(N) - enabling the practical generation of 512x512 images. We evaluate the model on class-conditional image generation, text-to-image synthesis, and action-conditional video generation, showing that our model achieves the best results among non-pixel-autoregressive density models that allow efficient sampling. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1703.03664v1-abstract-full').style.display = 'none'; document.getElementById('1703.03664v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 March, 2017; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2017. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1610.10099">arXiv:1610.10099</a> <span> [<a href="https://arxiv.org/pdf/1610.10099">pdf</a>, <a href="https://arxiv.org/format/1610.10099">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Neural Machine Translation in Linear Time </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kalchbrenner%2C+N">Nal Kalchbrenner</a>, <a href="/search/cs?searchtype=author&query=Espeholt%2C+L">Lasse Espeholt</a>, <a href="/search/cs?searchtype=author&query=Simonyan%2C+K">Karen Simonyan</a>, <a href="/search/cs?searchtype=author&query=Oord%2C+A+v+d">Aaron van den Oord</a>, <a href="/search/cs?searchtype=author&query=Graves%2C+A">Alex Graves</a>, <a href="/search/cs?searchtype=author&query=Kavukcuoglu%2C+K">Koray Kavukcuoglu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1610.10099v2-abstract-short" style="display: inline;"> We present a novel neural network for processing sequences. The ByteNet is a one-dimensional convolutional neural network that is composed of two parts, one to encode the source sequence and the other to decode the target sequence. The two network parts are connected by stacking the decoder on top of the encoder and preserving the temporal resolution of the sequences. To address the differing leng… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1610.10099v2-abstract-full').style.display = 'inline'; document.getElementById('1610.10099v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1610.10099v2-abstract-full" style="display: none;"> We present a novel neural network for processing sequences. The ByteNet is a one-dimensional convolutional neural network that is composed of two parts, one to encode the source sequence and the other to decode the target sequence. The two network parts are connected by stacking the decoder on top of the encoder and preserving the temporal resolution of the sequences. To address the differing lengths of the source and the target, we introduce an efficient mechanism by which the decoder is dynamically unfolded over the representation of the encoder. The ByteNet uses dilation in the convolutional layers to increase its receptive field. The resulting network has two core properties: it runs in time that is linear in the length of the sequences and it sidesteps the need for excessive memorization. The ByteNet decoder attains state-of-the-art performance on character-level language modelling and outperforms the previous best results obtained with recurrent networks. The ByteNet also achieves state-of-the-art performance on character-to-character machine translation on the English-to-German WMT translation task, surpassing comparable neural translation models that are based on recurrent networks with attentional pooling and run in quadratic time. We find that the latent alignment structure contained in the representations reflects the expected alignment between the tokens. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1610.10099v2-abstract-full').style.display = 'none'; document.getElementById('1610.10099v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 March, 2017; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 October, 2016; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2016. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1610.00527">arXiv:1610.00527</a> <span> [<a href="https://arxiv.org/pdf/1610.00527">pdf</a>, <a href="https://arxiv.org/format/1610.00527">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Video Pixel Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kalchbrenner%2C+N">Nal Kalchbrenner</a>, <a href="/search/cs?searchtype=author&query=Oord%2C+A+v+d">Aaron van den Oord</a>, <a href="/search/cs?searchtype=author&query=Simonyan%2C+K">Karen Simonyan</a>, <a href="/search/cs?searchtype=author&query=Danihelka%2C+I">Ivo Danihelka</a>, <a href="/search/cs?searchtype=author&query=Vinyals%2C+O">Oriol Vinyals</a>, <a href="/search/cs?searchtype=author&query=Graves%2C+A">Alex Graves</a>, <a href="/search/cs?searchtype=author&query=Kavukcuoglu%2C+K">Koray Kavukcuoglu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1610.00527v1-abstract-short" style="display: inline;"> We propose a probabilistic video model, the Video Pixel Network (VPN), that estimates the discrete joint distribution of the raw pixel values in a video. The model and the neural architecture reflect the time, space and color structure of video tensors and encode it as a four-dimensional dependency chain. The VPN approaches the best possible performance on the Moving MNIST benchmark, a leap over t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1610.00527v1-abstract-full').style.display = 'inline'; document.getElementById('1610.00527v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1610.00527v1-abstract-full" style="display: none;"> We propose a probabilistic video model, the Video Pixel Network (VPN), that estimates the discrete joint distribution of the raw pixel values in a video. The model and the neural architecture reflect the time, space and color structure of video tensors and encode it as a four-dimensional dependency chain. The VPN approaches the best possible performance on the Moving MNIST benchmark, a leap over the previous state of the art, and the generated videos show only minor deviations from the ground truth. The VPN also produces detailed samples on the action-conditional Robotic Pushing benchmark and generalizes to the motion of novel objects. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1610.00527v1-abstract-full').style.display = 'none'; document.getElementById('1610.00527v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 October, 2016; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2016. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">16 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1609.03499">arXiv:1609.03499</a> <span> [<a href="https://arxiv.org/pdf/1609.03499">pdf</a>, <a href="https://arxiv.org/format/1609.03499">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> WaveNet: A Generative Model for Raw Audio </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Oord%2C+A+v+d">Aaron van den Oord</a>, <a href="/search/cs?searchtype=author&query=Dieleman%2C+S">Sander Dieleman</a>, <a href="/search/cs?searchtype=author&query=Zen%2C+H">Heiga Zen</a>, <a href="/search/cs?searchtype=author&query=Simonyan%2C+K">Karen Simonyan</a>, <a href="/search/cs?searchtype=author&query=Vinyals%2C+O">Oriol Vinyals</a>, <a href="/search/cs?searchtype=author&query=Graves%2C+A">Alex Graves</a>, <a href="/search/cs?searchtype=author&query=Kalchbrenner%2C+N">Nal Kalchbrenner</a>, <a href="/search/cs?searchtype=author&query=Senior%2C+A">Andrew Senior</a>, <a href="/search/cs?searchtype=author&query=Kavukcuoglu%2C+K">Koray Kavukcuoglu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1609.03499v2-abstract-short" style="display: inline;"> This paper introduces WaveNet, a deep neural network for generating raw audio waveforms. The model is fully probabilistic and autoregressive, with the predictive distribution for each audio sample conditioned on all previous ones; nonetheless we show that it can be efficiently trained on data with tens of thousands of samples per second of audio. When applied to text-to-speech, it yields state-of-… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1609.03499v2-abstract-full').style.display = 'inline'; document.getElementById('1609.03499v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1609.03499v2-abstract-full" style="display: none;"> This paper introduces WaveNet, a deep neural network for generating raw audio waveforms. The model is fully probabilistic and autoregressive, with the predictive distribution for each audio sample conditioned on all previous ones; nonetheless we show that it can be efficiently trained on data with tens of thousands of samples per second of audio. When applied to text-to-speech, it yields state-of-the-art performance, with human listeners rating it as significantly more natural sounding than the best parametric and concatenative systems for both English and Mandarin. A single WaveNet can capture the characteristics of many different speakers with equal fidelity, and can switch between them by conditioning on the speaker identity. When trained to model music, we find that it generates novel and often highly realistic musical fragments. We also show that it can be employed as a discriminative model, returning promising results for phoneme recognition. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1609.03499v2-abstract-full').style.display = 'none'; document.getElementById('1609.03499v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 September, 2016; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 September, 2016; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2016. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1606.05328">arXiv:1606.05328</a> <span> [<a href="https://arxiv.org/pdf/1606.05328">pdf</a>, <a href="https://arxiv.org/format/1606.05328">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Conditional Image Generation with PixelCNN Decoders </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Oord%2C+A+v+d">Aaron van den Oord</a>, <a href="/search/cs?searchtype=author&query=Kalchbrenner%2C+N">Nal Kalchbrenner</a>, <a href="/search/cs?searchtype=author&query=Vinyals%2C+O">Oriol Vinyals</a>, <a href="/search/cs?searchtype=author&query=Espeholt%2C+L">Lasse Espeholt</a>, <a href="/search/cs?searchtype=author&query=Graves%2C+A">Alex Graves</a>, <a href="/search/cs?searchtype=author&query=Kavukcuoglu%2C+K">Koray Kavukcuoglu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1606.05328v2-abstract-short" style="display: inline;"> This work explores conditional image generation with a new image density model based on the PixelCNN architecture. The model can be conditioned on any vector, including descriptive labels or tags, or latent embeddings created by other networks. When conditioned on class labels from the ImageNet database, the model is able to generate diverse, realistic scenes representing distinct animals, objects… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1606.05328v2-abstract-full').style.display = 'inline'; document.getElementById('1606.05328v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1606.05328v2-abstract-full" style="display: none;"> This work explores conditional image generation with a new image density model based on the PixelCNN architecture. The model can be conditioned on any vector, including descriptive labels or tags, or latent embeddings created by other networks. When conditioned on class labels from the ImageNet database, the model is able to generate diverse, realistic scenes representing distinct animals, objects, landscapes and structures. When conditioned on an embedding produced by a convolutional network given a single image of an unseen face, it generates a variety of new portraits of the same person with different facial expressions, poses and lighting conditions. We also show that conditional PixelCNN can serve as a powerful decoder in an image autoencoder. Additionally, the gated convolutional layers in the proposed model improve the log-likelihood of PixelCNN to match the state-of-the-art performance of PixelRNN on ImageNet, with greatly reduced computational cost. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1606.05328v2-abstract-full').style.display = 'none'; document.getElementById('1606.05328v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 June, 2016; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 June, 2016; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2016. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1602.03032">arXiv:1602.03032</a> <span> [<a href="https://arxiv.org/pdf/1602.03032">pdf</a>, <a href="https://arxiv.org/format/1602.03032">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> Associative Long Short-Term Memory </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Danihelka%2C+I">Ivo Danihelka</a>, <a href="/search/cs?searchtype=author&query=Wayne%2C+G">Greg Wayne</a>, <a href="/search/cs?searchtype=author&query=Uria%2C+B">Benigno Uria</a>, <a href="/search/cs?searchtype=author&query=Kalchbrenner%2C+N">Nal Kalchbrenner</a>, <a href="/search/cs?searchtype=author&query=Graves%2C+A">Alex Graves</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1602.03032v2-abstract-short" style="display: inline;"> We investigate a new method to augment recurrent neural networks with extra memory without increasing the number of network parameters. The system has an associative memory based on complex-valued vectors and is closely related to Holographic Reduced Representations and Long Short-Term Memory networks. Holographic Reduced Representations have limited capacity: as they store more information, each… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1602.03032v2-abstract-full').style.display = 'inline'; document.getElementById('1602.03032v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1602.03032v2-abstract-full" style="display: none;"> We investigate a new method to augment recurrent neural networks with extra memory without increasing the number of network parameters. The system has an associative memory based on complex-valued vectors and is closely related to Holographic Reduced Representations and Long Short-Term Memory networks. Holographic Reduced Representations have limited capacity: as they store more information, each retrieval becomes noisier due to interference. Our system in contrast creates redundant copies of stored information, which enables retrieval with reduced noise. Experiments demonstrate faster learning on multiple memorization tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1602.03032v2-abstract-full').style.display = 'none'; document.getElementById('1602.03032v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 May, 2016; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 February, 2016; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2016. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICML-2016</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1601.06759">arXiv:1601.06759</a> <span> [<a href="https://arxiv.org/pdf/1601.06759">pdf</a>, <a href="https://arxiv.org/format/1601.06759">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> Pixel Recurrent Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Oord%2C+A+v+d">Aaron van den Oord</a>, <a href="/search/cs?searchtype=author&query=Kalchbrenner%2C+N">Nal Kalchbrenner</a>, <a href="/search/cs?searchtype=author&query=Kavukcuoglu%2C+K">Koray Kavukcuoglu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1601.06759v3-abstract-short" style="display: inline;"> Modeling the distribution of natural images is a landmark problem in unsupervised learning. This task requires an image model that is at once expressive, tractable and scalable. We present a deep neural network that sequentially predicts the pixels in an image along the two spatial dimensions. Our method models the discrete probability of the raw pixel values and encodes the complete set of depend… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1601.06759v3-abstract-full').style.display = 'inline'; document.getElementById('1601.06759v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1601.06759v3-abstract-full" style="display: none;"> Modeling the distribution of natural images is a landmark problem in unsupervised learning. This task requires an image model that is at once expressive, tractable and scalable. We present a deep neural network that sequentially predicts the pixels in an image along the two spatial dimensions. Our method models the discrete probability of the raw pixel values and encodes the complete set of dependencies in the image. Architectural novelties include fast two-dimensional recurrent layers and an effective use of residual connections in deep recurrent networks. We achieve log-likelihood scores on natural images that are considerably better than the previous state of the art. Our main results also provide benchmarks on the diverse ImageNet dataset. Samples generated from the model appear crisp, varied and globally coherent. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1601.06759v3-abstract-full').style.display = 'none'; document.getElementById('1601.06759v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 August, 2016; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 January, 2016; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2016. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1507.01526">arXiv:1507.01526</a> <span> [<a href="https://arxiv.org/pdf/1507.01526">pdf</a>, <a href="https://arxiv.org/format/1507.01526">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Grid Long Short-Term Memory </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kalchbrenner%2C+N">Nal Kalchbrenner</a>, <a href="/search/cs?searchtype=author&query=Danihelka%2C+I">Ivo Danihelka</a>, <a href="/search/cs?searchtype=author&query=Graves%2C+A">Alex Graves</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1507.01526v3-abstract-short" style="display: inline;"> This paper introduces Grid Long Short-Term Memory, a network of LSTM cells arranged in a multidimensional grid that can be applied to vectors, sequences or higher dimensional data such as images. The network differs from existing deep LSTM architectures in that the cells are connected between network layers as well as along the spatiotemporal dimensions of the data. The network provides a unified… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1507.01526v3-abstract-full').style.display = 'inline'; document.getElementById('1507.01526v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1507.01526v3-abstract-full" style="display: none;"> This paper introduces Grid Long Short-Term Memory, a network of LSTM cells arranged in a multidimensional grid that can be applied to vectors, sequences or higher dimensional data such as images. The network differs from existing deep LSTM architectures in that the cells are connected between network layers as well as along the spatiotemporal dimensions of the data. The network provides a unified way of using LSTM for both deep and sequential computation. We apply the model to algorithmic tasks such as 15-digit integer addition and sequence memorization, where it is able to significantly outperform the standard LSTM. We then give results for two empirical tasks. We find that 2D Grid LSTM achieves 1.47 bits per character on the Wikipedia character prediction benchmark, which is state-of-the-art among neural approaches. In addition, we use the Grid LSTM to define a novel two-dimensional translation model, the Reencoder, and show that it outperforms a phrase-based reference system on a Chinese-to-English translation task. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1507.01526v3-abstract-full').style.display = 'none'; document.getElementById('1507.01526v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 January, 2016; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 July, 2015; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2015. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1408.6181">arXiv:1408.6181</a> <span> [<a href="https://arxiv.org/pdf/1408.6181">pdf</a>, <a href="https://arxiv.org/ps/1408.6181">ps</a>, <a href="https://arxiv.org/format/1408.6181">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Resolving Lexical Ambiguity in Tensor Regression Models of Meaning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kartsaklis%2C+D">Dimitri Kartsaklis</a>, <a href="/search/cs?searchtype=author&query=Kalchbrenner%2C+N">Nal Kalchbrenner</a>, <a href="/search/cs?searchtype=author&query=Sadrzadeh%2C+M">Mehrnoosh Sadrzadeh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1408.6181v1-abstract-short" style="display: inline;"> This paper provides a method for improving tensor-based compositional distributional models of meaning by the addition of an explicit disambiguation step prior to composition. In contrast with previous research where this hypothesis has been successfully tested against relatively simple compositional models, in our work we use a robust model trained with linear regression. The results we get in tw… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1408.6181v1-abstract-full').style.display = 'inline'; document.getElementById('1408.6181v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1408.6181v1-abstract-full" style="display: none;"> This paper provides a method for improving tensor-based compositional distributional models of meaning by the addition of an explicit disambiguation step prior to composition. In contrast with previous research where this hypothesis has been successfully tested against relatively simple compositional models, in our work we use a robust model trained with linear regression. The results we get in two experiments show the superiority of the prior disambiguation method and suggest that the effectiveness of this approach is model-independent. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1408.6181v1-abstract-full').style.display = 'none'; document.getElementById('1408.6181v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 August, 2014; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2014. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Proceedings of ACL 2014, Vol. 2:Short Papers, pp:212-217 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1406.3830">arXiv:1406.3830</a> <span> [<a href="https://arxiv.org/pdf/1406.3830">pdf</a>, <a href="https://arxiv.org/format/1406.3830">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Modelling, Visualising and Summarising Documents with a Single Convolutional Neural Network </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Denil%2C+M">Misha Denil</a>, <a href="/search/cs?searchtype=author&query=Demiraj%2C+A">Alban Demiraj</a>, <a href="/search/cs?searchtype=author&query=Kalchbrenner%2C+N">Nal Kalchbrenner</a>, <a href="/search/cs?searchtype=author&query=Blunsom%2C+P">Phil Blunsom</a>, <a href="/search/cs?searchtype=author&query=de+Freitas%2C+N">Nando de Freitas</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1406.3830v1-abstract-short" style="display: inline;"> Capturing the compositional process which maps the meaning of words to that of documents is a central challenge for researchers in Natural Language Processing and Information Retrieval. We introduce a model that is able to represent the meaning of documents by embedding them in a low dimensional vector space, while preserving distinctions of word and sentence order crucial for capturing nuanced se… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1406.3830v1-abstract-full').style.display = 'inline'; document.getElementById('1406.3830v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1406.3830v1-abstract-full" style="display: none;"> Capturing the compositional process which maps the meaning of words to that of documents is a central challenge for researchers in Natural Language Processing and Information Retrieval. We introduce a model that is able to represent the meaning of documents by embedding them in a low dimensional vector space, while preserving distinctions of word and sentence order crucial for capturing nuanced semantics. Our model is based on an extended Dynamic Convolution Neural Network, which learns convolution filters at both the sentence and document level, hierarchically learning to capture and compose low level lexical features into high level semantic concepts. We demonstrate the effectiveness of this model on a range of document modelling tasks, achieving strong results with no feature engineering and with a more compact model. Inspired by recent advances in visualising deep convolution networks for computer vision, we present a novel visualisation technique for our document networks which not only provides insight into their learning process, but also can be interpreted to produce a compelling automatic summarisation system for texts. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1406.3830v1-abstract-full').style.display = 'none'; document.getElementById('1406.3830v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 June, 2014; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2014. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1404.2188">arXiv:1404.2188</a> <span> [<a href="https://arxiv.org/pdf/1404.2188">pdf</a>, <a href="https://arxiv.org/format/1404.2188">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> A Convolutional Neural Network for Modelling Sentences </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kalchbrenner%2C+N">Nal Kalchbrenner</a>, <a href="/search/cs?searchtype=author&query=Grefenstette%2C+E">Edward Grefenstette</a>, <a href="/search/cs?searchtype=author&query=Blunsom%2C+P">Phil Blunsom</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1404.2188v1-abstract-short" style="display: inline;"> The ability to accurately represent sentences is central to language understanding. We describe a convolutional architecture dubbed the Dynamic Convolutional Neural Network (DCNN) that we adopt for the semantic modelling of sentences. The network uses Dynamic k-Max Pooling, a global pooling operation over linear sequences. The network handles input sentences of varying length and induces a feature… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1404.2188v1-abstract-full').style.display = 'inline'; document.getElementById('1404.2188v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1404.2188v1-abstract-full" style="display: none;"> The ability to accurately represent sentences is central to language understanding. We describe a convolutional architecture dubbed the Dynamic Convolutional Neural Network (DCNN) that we adopt for the semantic modelling of sentences. The network uses Dynamic k-Max Pooling, a global pooling operation over linear sequences. The network handles input sentences of varying length and induces a feature graph over the sentence that is capable of explicitly capturing short and long-range relations. The network does not rely on a parse tree and is easily applicable to any language. We test the DCNN in four experiments: small scale binary and multi-class sentiment prediction, six-way question classification and Twitter sentiment prediction by distant supervision. The network achieves excellent performance in the first three tasks and a greater than 25% error reduction in the last task with respect to the strongest baseline. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1404.2188v1-abstract-full').style.display = 'none'; document.getElementById('1404.2188v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 April, 2014; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2014. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1306.3584">arXiv:1306.3584</a> <span> [<a href="https://arxiv.org/pdf/1306.3584">pdf</a>, <a href="https://arxiv.org/format/1306.3584">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Recurrent Convolutional Neural Networks for Discourse Compositionality </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kalchbrenner%2C+N">Nal Kalchbrenner</a>, <a href="/search/cs?searchtype=author&query=Blunsom%2C+P">Phil Blunsom</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1306.3584v1-abstract-short" style="display: inline;"> The compositionality of meaning extends beyond the single sentence. Just as words combine to form the meaning of sentences, so do sentences combine to form the meaning of paragraphs, dialogues and general discourse. We introduce both a sentence model and a discourse model corresponding to the two levels of compositionality. The sentence model adopts convolution as the central operation for composi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1306.3584v1-abstract-full').style.display = 'inline'; document.getElementById('1306.3584v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1306.3584v1-abstract-full" style="display: none;"> The compositionality of meaning extends beyond the single sentence. Just as words combine to form the meaning of sentences, so do sentences combine to form the meaning of paragraphs, dialogues and general discourse. We introduce both a sentence model and a discourse model corresponding to the two levels of compositionality. The sentence model adopts convolution as the central operation for composing semantic vectors and is based on a novel hierarchical convolutional neural network. The discourse model extends the sentence model and is based on a recurrent neural network that is conditioned in a novel way both on the current sentence and on the current speaker. The discourse model is able to capture both the sequentiality of sentences and the interaction between different speakers. Without feature engineering or pretraining and with simple greedy decoding, the discourse model coupled to the sentence model obtains state of the art performance on a dialogue act classification experiment. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1306.3584v1-abstract-full').style.display = 'none'; document.getElementById('1306.3584v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 June, 2013; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2013. </p> </li> </ol> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository