Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 76 results for author: <span class="mathjax">Courville, A</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/stat" aria-role="search"> Searching in archive <strong>stat</strong>. <a href="/search/?searchtype=author&amp;query=Courville%2C+A">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Courville, A"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Courville%2C+A&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Courville, A"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Courville%2C+A&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Courville%2C+A&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Courville%2C+A&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.02679">arXiv:2310.02679</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2310.02679">pdf</a>, <a href="https://arxiv.org/format/2310.02679">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation">stat.CO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Methodology">stat.ME</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Diffusion Generative Flow Samplers: Improving learning signals through partial trajectory optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/stat?searchtype=author&amp;query=Zhang%2C+D">Dinghuai Zhang</a>, <a href="/search/stat?searchtype=author&amp;query=Chen%2C+R+T+Q">Ricky T. Q. Chen</a>, <a href="/search/stat?searchtype=author&amp;query=Liu%2C+C">Cheng-Hao Liu</a>, <a href="/search/stat?searchtype=author&amp;query=Courville%2C+A">Aaron Courville</a>, <a href="/search/stat?searchtype=author&amp;query=Bengio%2C+Y">Yoshua Bengio</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.02679v3-abstract-short" style="display: inline;"> We tackle the problem of sampling from intractable high-dimensional density functions, a fundamental task that often appears in machine learning and statistics. We extend recent sampling-based approaches that leverage controlled stochastic processes to model approximate samples from these target densities. 
The main drawback of these approaches is that the training objective requires full trajector&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.02679v3-abstract-full').style.display = 'inline'; document.getElementById('2310.02679v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.02679v3-abstract-full" style="display: none;"> We tackle the problem of sampling from intractable high-dimensional density functions, a fundamental task that often appears in machine learning and statistics. We extend recent sampling-based approaches that leverage controlled stochastic processes to model approximate samples from these target densities. The main drawback of these approaches is that the training objective requires full trajectories to compute, resulting in sluggish credit assignment issues due to use of entire trajectories and a learning signal present only at the terminal time. In this work, we present Diffusion Generative Flow Samplers (DGFS), a sampling-based framework where the learning process can be tractably broken down into short partial trajectory segments, via parameterizing an additional &#34;flow function&#34;. Our method takes inspiration from the theory developed for generative flow networks (GFlowNets), allowing us to make use of intermediate learning signals. Through various challenging experiments, we demonstrate that DGFS achieves more accurate estimates of the normalization constant than closely-related prior methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.02679v3-abstract-full').style.display = 'none'; document.getElementById('2310.02679v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. 
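As background for the entry above: once a state flow function is learned, GFlowNet-style training can attach a squared log-ratio loss to any partial trajectory segment, which is where intermediate learning signals come from. The sketch below only illustrates that general sub-trajectory idea; it is not the DGFS objective from the paper, and the module names, shapes, and interfaces are assumptions.

```python
# Hedged sketch of a sub-trajectory-style loss using a learned log-flow log F(s, t).
# All parameterizations and shapes below are illustrative assumptions.
import torch
import torch.nn as nn

class FlowFunction(nn.Module):
    """Scalar log-flow log F(s, t) for state s at time t (assumed parameterization)."""
    def __init__(self, state_dim, hidden=128):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim + 1, hidden), nn.SiLU(),
            nn.Linear(hidden, 1),
        )

    def forward(self, s, t):
        # s: (B, D), t: (B, 1) -> (B,)
        return self.net(torch.cat([s, t], dim=-1)).squeeze(-1)

def partial_trajectory_loss(log_flow, states, times, log_pf, log_pb):
    """Squared log-ratio loss over a partial segment s_m -> ... -> s_n.

    states: (T+1, B, D) segment states; times: (T+1, B, 1);
    log_pf, log_pb: (T, B) per-step forward / backward log-probabilities.
    """
    lhs = log_flow(states[0], times[0]) + log_pf.sum(dim=0)    # log F(s_m) + sum log P_F
    rhs = log_flow(states[-1], times[-1]) + log_pb.sum(dim=0)  # log F(s_n) + sum log P_B
    return ((lhs - rhs) ** 2).mean()
```
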
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICLR 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.17010">arXiv:2305.17010</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.17010">pdf</a>, <a href="https://arxiv.org/format/2305.17010">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Discrete Mathematics">cs.DM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Let the Flows Tell: Solving Graph Combinatorial Optimization Problems with GFlowNets </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/stat?searchtype=author&amp;query=Zhang%2C+D">Dinghuai Zhang</a>, <a href="/search/stat?searchtype=author&amp;query=Dai%2C+H">Hanjun Dai</a>, <a href="/search/stat?searchtype=author&amp;query=Malkin%2C+N">Nikolay Malkin</a>, <a href="/search/stat?searchtype=author&amp;query=Courville%2C+A">Aaron Courville</a>, <a href="/search/stat?searchtype=author&amp;query=Bengio%2C+Y">Yoshua Bengio</a>, <a href="/search/stat?searchtype=author&amp;query=Pan%2C+L">Ling Pan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.17010v3-abstract-short" style="display: inline;"> Combinatorial optimization (CO) problems are often NP-hard and thus out of reach for exact algorithms, making them a tempting domain to apply machine learning methods. The highly structured constraints in these problems can hinder either optimization or sampling directly in the solution space. On the other hand, GFlowNets have recently emerged as a powerful machinery to efficiently sample from com&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.17010v3-abstract-full').style.display = 'inline'; document.getElementById('2305.17010v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.17010v3-abstract-full" style="display: none;"> Combinatorial optimization (CO) problems are often NP-hard and thus out of reach for exact algorithms, making them a tempting domain to apply machine learning methods. The highly structured constraints in these problems can hinder either optimization or sampling directly in the solution space. On the other hand, GFlowNets have recently emerged as a powerful machinery to efficiently sample from composite unnormalized densities sequentially and have the potential to amortize such solution-searching processes in CO, as well as generate diverse solution candidates. In this paper, we design Markov decision processes (MDPs) for different combinatorial problems and propose to train conditional GFlowNets to sample from the solution space. Efficient training techniques are also developed to benefit long-range credit assignment. 
Through extensive experiments on a variety of different CO tasks with synthetic and realistic data, we demonstrate that GFlowNet policies can efficiently find high-quality solutions. Our implementation is open-sourced at https://github.com/zdhNarsil/GFlowNet-CombOpt. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.17010v3-abstract-full').style.display = 'none'; document.getElementById('2305.17010v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS 2023 as spotlight</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2302.05793">arXiv:2302.05793</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2302.05793">pdf</a>, <a href="https://arxiv.org/format/2302.05793">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation">stat.CO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Distributional GFlowNets with Quantile Flows </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/stat?searchtype=author&amp;query=Zhang%2C+D">Dinghuai Zhang</a>, <a href="/search/stat?searchtype=author&amp;query=Pan%2C+L">Ling Pan</a>, <a href="/search/stat?searchtype=author&amp;query=Chen%2C+R+T+Q">Ricky T. Q. Chen</a>, <a href="/search/stat?searchtype=author&amp;query=Courville%2C+A">Aaron Courville</a>, <a href="/search/stat?searchtype=author&amp;query=Bengio%2C+Y">Yoshua Bengio</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2302.05793v3-abstract-short" style="display: inline;"> Generative Flow Networks (GFlowNets) are a new family of probabilistic samplers where an agent learns a stochastic policy for generating complex combinatorial structure through a series of decision-making steps. Despite being inspired from reinforcement learning, the current GFlowNet framework is relatively limited in its applicability and cannot handle stochasticity in the reward function. In thi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.05793v3-abstract-full').style.display = 'inline'; document.getElementById('2302.05793v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2302.05793v3-abstract-full" style="display: none;"> Generative Flow Networks (GFlowNets) are a new family of probabilistic samplers where an agent learns a stochastic policy for generating complex combinatorial structure through a series of decision-making steps. 
Despite being inspired from reinforcement learning, the current GFlowNet framework is relatively limited in its applicability and cannot handle stochasticity in the reward function. In this work, we adopt a distributional paradigm for GFlowNets, turning each flow function into a distribution, thus providing more informative learning signals during training. By parameterizing each edge flow through their quantile functions, our proposed \textit{quantile matching} GFlowNet learning algorithm is able to learn a risk-sensitive policy, an essential component for handling scenarios with risk uncertainty. Moreover, we find that the distributional approach can achieve substantial improvement on existing benchmarks compared to prior methods due to our enhanced training algorithm, even in settings with deterministic rewards. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.05793v3-abstract-full').style.display = 'none'; document.getElementById('2302.05793v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by TMLR</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2302.00695">arXiv:2302.00695</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2302.00695">pdf</a>, <a href="https://arxiv.org/format/2302.00695">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="High Energy Physics - Experiment">hep-ex</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="High Energy Physics - Phenomenology">hep-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Versatile Energy-Based Probabilistic Models for High Energy Physics </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/stat?searchtype=author&amp;query=Cheng%2C+T">Taoli Cheng</a>, <a href="/search/stat?searchtype=author&amp;query=Courville%2C+A">Aaron Courville</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2302.00695v5-abstract-short" style="display: inline;"> As a classical generative modeling approach, energy-based models have the natural advantage of flexibility in the form of the energy function. Recently, energy-based models have achieved great success in modeling high-dimensional data in computer vision and natural language processing. 
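For the distributional idea in the entry above, the standard building block is quantile regression. The snippet below is a generic quantile Huber loss as used in distributional RL, shown only to make "learning a quantity through its quantile function" concrete; it is not the paper's quantile matching objective for edge flows, and all names and shapes are assumptions.

```python
# Hedged sketch: generic quantile (Huber) regression loss from distributional RL,
# used here purely as illustrative background for quantile-parameterized quantities.
import torch

def quantile_huber_loss(pred_quantiles, targets, taus, kappa=1.0):
    """pred_quantiles: (B, N) values at quantile levels taus (N,); targets: (B, M) samples."""
    diff = targets.unsqueeze(-1) - pred_quantiles.unsqueeze(1)       # (B, M, N) pairwise errors
    huber = torch.where(diff.abs() <= kappa,
                        0.5 * diff.pow(2),
                        kappa * (diff.abs() - 0.5 * kappa))
    weight = (taus - (diff.detach() < 0).float()).abs()              # |tau - 1{error < 0}|
    return (weight * huber).mean()

# Illustrative usage with made-up tensors:
# taus = (torch.arange(8) + 0.5) / 8
# loss = quantile_huber_loss(torch.randn(32, 8), torch.randn(32, 16), taus)
```
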
4. arXiv:2302.00695 [pdf, other] cs.LG hep-ex hep-ph stat.ML
   Versatile Energy-Based Probabilistic Models for High Energy Physics
   Authors: Taoli Cheng, Aaron Courville
   Abstract: As a classical generative modeling approach, energy-based models have the natural advantage of flexibility in the form of the energy function. Recently, energy-based models have achieved great success in modeling high-dimensional data in computer vision and natural language processing. In line with these advancements, we build a multi-purpose energy-based probabilistic model for High Energy Physics events at the Large Hadron Collider. This framework builds on a powerful generative model and describes higher-order inter-particle interactions. It suits different encoding architectures and builds on implicit generation. As for applicative aspects, it can serve as a powerful parameterized event generator for physics simulation, a generic anomalous signal detector free from spurious correlations, and an augmented event classifier for particle identification.
   Submitted 18 January, 2024; v1 submitted 1 February, 2023; originally announced February 2023.
   Comments: 17 pages, 9 figures. NeurIPS 2023 camera ready

5. arXiv:2210.00999 [pdf, other] cs.LG cs.AI stat.ML
   Latent State Marginalization as a Low-cost Approach for Improving Exploration
   Authors: Dinghuai Zhang, Aaron Courville, Yoshua Bengio, Qinqing Zheng, Amy Zhang, Ricky T. Q. Chen
   Abstract: While the maximum entropy (MaxEnt) reinforcement learning (RL) framework -- often touted for its exploration and robustness capabilities -- is usually motivated from a probabilistic perspective, the use of deep probabilistic models has not gained much traction in practice due to their inherent complexity. In this work, we propose the adoption of latent variable policies within the MaxEnt framework, which we show can provably approximate any policy distribution, and additionally, naturally emerges under the use of world models with a latent belief state. We discuss why latent variable policies are difficult to train, how naive approaches can fail, then subsequently introduce a series of improvements centered around low-cost marginalization of the latent state, allowing us to make full use of the latent state at minimal additional cost. We instantiate our method under the actor-critic framework, marginalizing both the actor and critic. The resulting algorithm, referred to as Stochastic Marginal Actor-Critic (SMAC), is simple yet effective. We experimentally validate our method on continuous control tasks, showing that effective marginalization can lead to better exploration and more robust training. Our implementation is open sourced at https://github.com/zdhNarsil/Stochastic-Marginal-Actor-Critic.
   Submitted 10 February, 2023; v1 submitted 3 October, 2022; originally announced October 2022.
   Comments: Accepted by ICLR 2023

6. arXiv:2206.03362 [pdf, other] cs.LG cs.AI cs.CR stat.ME stat.ML
   Building Robust Ensembles via Margin Boosting
   Authors: Dinghuai Zhang, Hongyang Zhang, Aaron Courville, Yoshua Bengio, Pradeep Ravikumar, Arun Sai Suggala
   Abstract: In the context of adversarial robustness, a single model does not usually have enough power to defend against all possible adversarial attacks, and as a result, has sub-optimal robustness. Consequently, an emerging line of work has focused on learning an ensemble of neural networks to defend against adversarial attacks. In this work, we take a principled approach towards building robust ensembles. We view this problem from the perspective of margin-boosting and develop an algorithm for learning an ensemble with maximum margin. Through extensive empirical evaluation on benchmark datasets, we show that our algorithm not only outperforms existing ensembling techniques, but also large models trained in an end-to-end fashion. An important byproduct of our work is a margin-maximizing cross-entropy (MCE) loss, which is a better alternative to the standard cross-entropy (CE) loss. Empirically, we show that replacing the CE loss in state-of-the-art adversarial training techniques with our MCE loss leads to significant performance improvement.
   Submitted 7 June, 2022; originally announced June 2022.
   Comments: Accepted by ICML 2022

7. arXiv:2206.01626 [pdf, other] cs.LG cs.AI stat.ML
   Reincarnating Reinforcement Learning: Reusing Prior Computation to Accelerate Progress
   Authors: Rishabh Agarwal, Max Schwarzer, Pablo Samuel Castro, Aaron Courville, Marc G. Bellemare
   Abstract: Learning tabula rasa, that is without any prior knowledge, is the prevalent workflow in reinforcement learning (RL) research. However, RL systems, when applied to large-scale settings, rarely operate tabula rasa. Such large-scale systems undergo multiple design or algorithmic changes during their development cycle and use ad hoc approaches for incorporating these changes without re-training from scratch, which would have been prohibitively expensive. Additionally, the inefficiency of deep RL typically excludes researchers without access to industrial-scale resources from tackling computationally-demanding problems. To address these issues, we present reincarnating RL as an alternative workflow or class of problem settings, where prior computational work (e.g., learned policies) is reused or transferred between design iterations of an RL agent, or from one RL agent to another. As a step towards enabling reincarnating RL from any agent to any other agent, we focus on the specific setting of efficiently transferring an existing sub-optimal policy to a standalone value-based RL agent. We find that existing approaches fail in this setting and propose a simple algorithm to address their limitations. Equipped with this algorithm, we demonstrate reincarnating RL's gains over tabula rasa RL on Atari 2600 games, a challenging locomotion task, and the real-world problem of navigating stratospheric balloons. Overall, this work argues for an alternative approach to RL research, which we believe could significantly improve real-world RL adoption and help democratize it further. Open-sourced code and trained agents at https://agarwl.github.io/reincarnating_rl.
   Submitted 4 October, 2022; v1 submitted 3 June, 2022; originally announced June 2022.
   Comments: NeurIPS 2022. Code and agents at https://agarwl.github.io/reincarnating_rl

8. arXiv:2205.07802 [pdf, other] cs.LG cs.AI stat.ML
   The Primacy Bias in Deep Reinforcement Learning
   Authors: Evgenii Nikishin, Max Schwarzer, Pierluca D'Oro, Pierre-Luc Bacon, Aaron Courville
   Abstract: This work identifies a common flaw of deep reinforcement learning (RL) algorithms: a tendency to rely on early interactions and ignore useful evidence encountered later. Because of training on progressively growing datasets, deep RL agents incur a risk of overfitting to earlier experiences, negatively affecting the rest of the learning process. Inspired by cognitive science, we refer to this effect as the primacy bias. Through a series of experiments, we dissect the algorithmic aspects of deep RL that exacerbate this bias. We then propose a simple yet generally-applicable mechanism that tackles the primacy bias by periodically resetting a part of the agent. We apply this mechanism to algorithms in both discrete (Atari 100k) and continuous action (DeepMind Control Suite) domains, consistently improving their performance.
   Submitted 16 May, 2022; originally announced May 2022.
   Comments: ICML 2022; code at https://github.com/evgenii-nikishin/rl_with_resets

9. arXiv:2202.01361 [pdf, other] cs.LG stat.ML
   Generative Flow Networks for Discrete Probabilistic Modeling
   Authors: Dinghuai Zhang, Nikolay Malkin, Zhen Liu, Alexandra Volokhova, Aaron Courville, Yoshua Bengio
   Abstract: We present energy-based generative flow networks (EB-GFN), a novel probabilistic modeling algorithm for high-dimensional discrete data. Building upon the theory of generative flow networks (GFlowNets), we model the generation process by a stochastic data construction policy and thus amortize expensive MCMC exploration into a fixed number of actions sampled from a GFlowNet. We show how GFlowNets can approximately perform large-block Gibbs sampling to mix between modes. We propose a framework to jointly train a GFlowNet with an energy function, so that the GFlowNet learns to sample from the energy distribution, while the energy learns with an approximate MLE objective with negative samples from the GFlowNet. We demonstrate EB-GFN's effectiveness on various probabilistic modeling tasks. Code is publicly available at https://github.com/zdhNarsil/EB_GFN.
   Submitted 8 June, 2022; v1 submitted 2 February, 2022; originally announced February 2022.
   Comments: Accepted by ICML 2022

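One way to picture the joint training described in the entry above: alternate an approximate-MLE update of the energy (data pushed down, GFlowNet samples pushed up) with a GFlowNet update against the current energy. The interfaces below (sample, training_loss) are assumptions for illustration, not the released implementation linked in the entry.

```python
# Hedged sketch of one alternating EB-GFN-style update; all interfaces are assumed.
import torch

def joint_step(energy_model, gflownet, data_batch, energy_opt, gfn_opt):
    # (1) Energy update: approximate MLE with negatives drawn from the GFlowNet.
    x_neg = gflownet.sample(data_batch.shape[0]).detach()               # assumed interface
    energy_loss = energy_model(data_batch).mean() - energy_model(x_neg).mean()
    energy_opt.zero_grad()
    energy_loss.backward()
    energy_opt.step()

    # (2) GFlowNet update: fit the sampler to the current unnormalized density exp(-E).
    gfn_loss = gflownet.training_loss(
        log_reward_fn=lambda x: -energy_model(x).detach())              # assumed interface
    gfn_opt.zero_grad()
    gfn_loss.backward()
    gfn_opt.step()
```
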
In tandem, we provide a recipe for constructing various sequence design methods&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.03372v2-abstract-full').style.display = 'inline'; document.getElementById('2110.03372v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2110.03372v2-abstract-full" style="display: none;"> Black-box optimization formulations for biological sequence design have drawn recent attention due to their promising potential impact on the pharmaceutical industry. In this work, we propose to unify two seemingly distinct worlds: likelihood-free inference and black-box optimization, under one probabilistic framework. In tandem, we provide a recipe for constructing various sequence design methods based on this framework. We show how previous optimization approaches can be &#34;reinvented&#34; in our framework, and further propose new probabilistic black-box optimization algorithms. Extensive experiments on sequence design application illustrate the benefits of the proposed methodology. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.03372v2-abstract-full').style.display = 'none'; document.getElementById('2110.03372v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 February, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICLR 2022 spotlight</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2108.13264">arXiv:2108.13264</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2108.13264">pdf</a>, <a href="https://arxiv.org/format/2108.13264">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Methodology">stat.ME</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Deep Reinforcement Learning at the Edge of the Statistical Precipice </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/stat?searchtype=author&amp;query=Agarwal%2C+R">Rishabh Agarwal</a>, <a href="/search/stat?searchtype=author&amp;query=Schwarzer%2C+M">Max Schwarzer</a>, <a href="/search/stat?searchtype=author&amp;query=Castro%2C+P+S">Pablo Samuel Castro</a>, <a href="/search/stat?searchtype=author&amp;query=Courville%2C+A">Aaron Courville</a>, <a href="/search/stat?searchtype=author&amp;query=Bellemare%2C+M+G">Marc G. 
Bellemare</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2108.13264v4-abstract-short" style="display: inline;"> Deep reinforcement learning (RL) algorithms are predominantly evaluated by comparing their relative performance on a large suite of tasks. Most published results on deep RL benchmarks compare point estimates of aggregate performance such as mean and median scores across tasks, ignoring the statistical uncertainty implied by the use of a finite number of training runs. Beginning with the Arcade Lea&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2108.13264v4-abstract-full').style.display = 'inline'; document.getElementById('2108.13264v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2108.13264v4-abstract-full" style="display: none;"> Deep reinforcement learning (RL) algorithms are predominantly evaluated by comparing their relative performance on a large suite of tasks. Most published results on deep RL benchmarks compare point estimates of aggregate performance such as mean and median scores across tasks, ignoring the statistical uncertainty implied by the use of a finite number of training runs. Beginning with the Arcade Learning Environment (ALE), the shift towards computationally-demanding benchmarks has led to the practice of evaluating only a small number of runs per task, exacerbating the statistical uncertainty in point estimates. In this paper, we argue that reliable evaluation in the few run deep RL regime cannot ignore the uncertainty in results without running the risk of slowing down progress in the field. We illustrate this point using a case study on the Atari 100k benchmark, where we find substantial discrepancies between conclusions drawn from point estimates alone versus a more thorough statistical analysis. With the aim of increasing the field&#39;s confidence in reported results with a handful of runs, we advocate for reporting interval estimates of aggregate performance and propose performance profiles to account for the variability in results, as well as present more robust and efficient aggregate metrics, such as interquartile mean scores, to achieve small uncertainty in results. Using such statistical tools, we scrutinize performance evaluations of existing algorithms on other widely used RL benchmarks including the ALE, Procgen, and the DeepMind Control Suite, again revealing discrepancies in prior comparisons. Our findings call for a change in how we evaluate performance in deep RL, for which we present a more rigorous evaluation methodology, accompanied with an open-source library rliable, to prevent unreliable results from stagnating the field. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2108.13264v4-abstract-full').style.display = 'none'; document.getElementById('2108.13264v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 January, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 August, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2021. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Outstanding Paper Award at NeurIPS 2021. Website: https://agarwl.github.io/rliable. 28 Pages, 33 Figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2106.02890">arXiv:2106.02890</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2106.02890">pdf</a>, <a href="https://arxiv.org/format/2106.02890">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Can Subnetwork Structure be the Key to Out-of-Distribution Generalization? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/stat?searchtype=author&amp;query=Zhang%2C+D">Dinghuai Zhang</a>, <a href="/search/stat?searchtype=author&amp;query=Ahuja%2C+K">Kartik Ahuja</a>, <a href="/search/stat?searchtype=author&amp;query=Xu%2C+Y">Yilun Xu</a>, <a href="/search/stat?searchtype=author&amp;query=Wang%2C+Y">Yisen Wang</a>, <a href="/search/stat?searchtype=author&amp;query=Courville%2C+A">Aaron Courville</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2106.02890v1-abstract-short" style="display: inline;"> Can models with particular structure avoid being biased towards spurious correlation in out-of-distribution (OOD) generalization? Peters et al. (2016) provides a positive answer for linear cases. In this paper, we use a functional modular probing method to analyze deep model structures under OOD setting. We demonstrate that even in biased models (which focus on spurious correlation) there still ex&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.02890v1-abstract-full').style.display = 'inline'; document.getElementById('2106.02890v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2106.02890v1-abstract-full" style="display: none;"> Can models with particular structure avoid being biased towards spurious correlation in out-of-distribution (OOD) generalization? Peters et al. (2016) provides a positive answer for linear cases. In this paper, we use a functional modular probing method to analyze deep model structures under OOD setting. We demonstrate that even in biased models (which focus on spurious correlation) there still exist unbiased functional subnetworks. Furthermore, we articulate and demonstrate the functional lottery ticket hypothesis: full network contains a subnetwork that can achieve better OOD performance. We then propose Modular Risk Minimization to solve the subnetwork selection problem. Our algorithm learns the subnetwork structure from a given dataset, and can be combined with any other OOD regularization methods. Experiments on various OOD generalization tasks corroborate the effectiveness of our method. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.02890v1-abstract-full').style.display = 'none'; document.getElementById('2106.02890v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ICML2021 as long talk</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2011.09468">arXiv:2011.09468</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2011.09468">pdf</a>, <a href="https://arxiv.org/format/2011.09468">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Dynamical Systems">math.DS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Gradient Starvation: A Learning Proclivity in Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/stat?searchtype=author&amp;query=Pezeshki%2C+M">Mohammad Pezeshki</a>, <a href="/search/stat?searchtype=author&amp;query=Kaba%2C+S">S茅kou-Oumar Kaba</a>, <a href="/search/stat?searchtype=author&amp;query=Bengio%2C+Y">Yoshua Bengio</a>, <a href="/search/stat?searchtype=author&amp;query=Courville%2C+A">Aaron Courville</a>, <a href="/search/stat?searchtype=author&amp;query=Precup%2C+D">Doina Precup</a>, <a href="/search/stat?searchtype=author&amp;query=Lajoie%2C+G">Guillaume Lajoie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2011.09468v4-abstract-short" style="display: inline;"> We identify and formalize a fundamental gradient descent phenomenon resulting in a learning proclivity in over-parameterized neural networks. Gradient Starvation arises when cross-entropy loss is minimized by capturing only a subset of features relevant for the task, despite the presence of other predictive features that fail to be discovered. This work provides a theoretical explanation for the e&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2011.09468v4-abstract-full').style.display = 'inline'; document.getElementById('2011.09468v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2011.09468v4-abstract-full" style="display: none;"> We identify and formalize a fundamental gradient descent phenomenon resulting in a learning proclivity in over-parameterized neural networks. Gradient Starvation arises when cross-entropy loss is minimized by capturing only a subset of features relevant for the task, despite the presence of other predictive features that fail to be discovered. This work provides a theoretical explanation for the emergence of such feature imbalance in neural networks. 

arXiv:2011.09468 (https://arxiv.org/abs/2011.09468) [pdf, other]
Subjects: cs.LG; math.DS; stat.ML
Title: Gradient Starvation: A Learning Proclivity in Neural Networks
Authors: Mohammad Pezeshki, Sékou-Oumar Kaba, Yoshua Bengio, Aaron Courville, Doina Precup, Guillaume Lajoie
Abstract: We identify and formalize a fundamental gradient descent phenomenon resulting in a learning proclivity in over-parameterized neural networks. Gradient Starvation arises when cross-entropy loss is minimized by capturing only a subset of features relevant for the task, despite the presence of other predictive features that fail to be discovered. This work provides a theoretical explanation for the emergence of such feature imbalance in neural networks. Using tools from Dynamical Systems theory, we identify simple properties of learning dynamics during gradient descent that lead to this imbalance, and prove that such a situation can be expected given certain statistical structure in training data. Based on our proposed formalism, we develop guarantees for a novel regularization method aimed at decoupling feature learning dynamics, improving accuracy and robustness in cases hindered by gradient starvation. We illustrate our findings with simple and real-world out-of-distribution (OOD) generalization experiments.
Submitted: 24 November, 2021; v1 submitted 18 November, 2020; originally announced November 2020.
Comments: Proceedings of NeurIPS 2021.
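The abstract does not spell out the regularizer, so the sketch below only shows where a decoupling-style penalty would enter a training loss, assumed here to be an L2 term on the logits added to cross-entropy; the penalty form and its coefficient are assumptions, not necessarily the method proposed in the paper.

```python
# Sketch: cross-entropy plus an illustrative "decoupling" penalty on the logits.
# The exact regularizer in the paper may differ; this only shows where such a
# term enters the training loss.
import torch
import torch.nn.functional as F

def regularized_loss(model, x, y, sd_coef=0.1):
    logits = model(x)                      # (batch, num_classes)
    ce = F.cross_entropy(logits, y)        # standard supervised objective
    penalty = (logits ** 2).mean()         # discourages any one logit/feature from dominating
    return ce + sd_coef * penalty
```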

arXiv:2010.10079 (https://arxiv.org/abs/2010.10079) [pdf, other]
Subjects: stat.ML; cs.AI; cs.LG; stat.AP
Title: Neural Approximate Sufficient Statistics for Implicit Models
Authors: Yanzhi Chen, Dinghuai Zhang, Michael Gutmann, Aaron Courville, Zhanxing Zhu
Abstract: We consider the fundamental problem of how to automatically construct summary statistics for implicit generative models where the evaluation of the likelihood function is intractable, but sampling data from the model is possible. The idea is to frame the task of constructing sufficient statistics as learning mutual information maximizing representations of the data with the help of deep neural networks. The infomax learning procedure does not need to estimate any density or density ratio. We apply our approach to both traditional approximate Bayesian computation and recent neural likelihood methods, boosting their performance on a range of tasks.
Submitted: 30 March, 2021; v1 submitted 20 October, 2020; originally announced October 2020.
Comments: ICLR 2021 spotlight.
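For context, learned summary statistics of this kind slot into standard simulation-based inference loops. The sketch below is a generic rejection-ABC loop; summary_net, prior_sample, and simulate are hypothetical placeholders, and the infomax training of the summary network itself is not shown.

```python
# Sketch: rejection ABC using a learned summary network. `summary_net`, `prior_sample`
# and `simulate` are hypothetical placeholders; only the accept/reject structure is standard.
import numpy as np

def abc_rejection(x_obs, summary_net, prior_sample, simulate, n_draws=10_000, eps=0.1):
    s_obs = summary_net(x_obs)
    accepted = []
    for _ in range(n_draws):
        theta = prior_sample()                 # draw parameters from the prior
        x_sim = simulate(theta)                # run the implicit model (simulator)
        if np.linalg.norm(summary_net(x_sim) - s_obs) < eps:
            accepted.append(theta)             # keep draws whose summaries match the data
    return np.asarray(accepted)                # samples from the ABC posterior
```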

arXiv:2010.01262 (https://arxiv.org/abs/2010.01262) [pdf, other]
Subjects: cs.LG; stat.ML
Title: Integrating Categorical Semantics into Unsupervised Domain Translation
Authors: Samuel Lavoie, Faruk Ahmed, Aaron Courville
Abstract: While unsupervised domain translation (UDT) has seen a lot of success recently, we argue that mediating its translation via categorical semantic features could broaden its applicability. In particular, we demonstrate that categorical semantics improves the translation between perceptually different domains sharing multiple object categories. We propose a method to learn, in an unsupervised manner, categorical semantic features (such as object labels) that are invariant to the source and target domains. We show that conditioning the style encoder of unsupervised domain translation methods on the learned categorical semantics leads to a translation preserving the digits on MNIST$\leftrightarrow$SVHN and to a more realistic stylization on Sketches$\to$Reals.
Submitted: 16 March, 2021; v1 submitted 2 October, 2020; originally announced October 2020.
Comments: 22 pages. In submission to the International Conference on Learning Representations (ICLR) 2021.

arXiv:2007.05929 (https://arxiv.org/abs/2007.05929) [pdf, other]
Subjects: cs.LG; stat.ML
Title: Data-Efficient Reinforcement Learning with Self-Predictive Representations
Authors: Max Schwarzer, Ankesh Anand, Rishab Goel, R Devon Hjelm, Aaron Courville, Philip Bachman
Abstract: While deep reinforcement learning excels at solving tasks where large amounts of data can be collected through virtually unlimited interaction with the environment, learning from limited interaction remains a key challenge. We posit that an agent can learn more efficiently if we augment reward maximization with self-supervised objectives based on structure in its visual input and sequential interaction with the environment. Our method, Self-Predictive Representations (SPR), trains an agent to predict its own latent state representations multiple steps into the future. We compute target representations for future states using an encoder which is an exponential moving average of the agent's parameters, and we make predictions using a learned transition model. On its own, this future prediction objective outperforms prior methods for sample-efficient deep RL from pixels. We further improve performance by adding data augmentation to the future prediction loss, which forces the agent's representations to be consistent across multiple views of an observation.
Our full self-supervised objective, which combines future prediction and data augmentation, achieves a median human-normalized score of 0.415 on Atari in a setting limited to 100k steps of environment interaction, which represents a 55% relative improvement over the previous state of the art. Notably, even in this limited data regime, SPR exceeds expert human scores on 7 out of 26 games. The code associated with this work is available at https://github.com/mila-iqia/spr
Submitted: 20 May, 2021; v1 submitted 12 July, 2020; originally announced July 2020.
Comments: The first two authors contributed equally to this work. v4 includes new ablations and reformatting for the ICLR camera-ready.
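A compressed sketch of the kind of objective described above: an online encoder and transition model predict future latents, targets come from an exponential-moving-average copy of the encoder, and a similarity loss compares the two. The module names, the cosine loss, and the EMA rate are illustrative assumptions rather than the released implementation.

```python
# Sketch of a self-predictive objective: predict EMA-target latents K steps ahead.
# `online_encoder`, `target_encoder`, `transition`, and `projector` are assumed
# torch.nn.Module instances; the EMA rate and cosine loss are illustrative choices.
import torch
import torch.nn.functional as F

@torch.no_grad()
def ema_update(target_encoder, online_encoder, tau=0.99):
    for p_t, p_o in zip(target_encoder.parameters(), online_encoder.parameters()):
        p_t.mul_(tau).add_(p_o, alpha=1.0 - tau)   # slow-moving copy of the online weights

def spr_like_loss(obs_seq, act_seq, online_encoder, target_encoder, transition, projector):
    """obs_seq: (K+1, batch, ...) observations; act_seq: (K, batch, ...) actions."""
    z = online_encoder(obs_seq[0])
    loss = 0.0
    for k in range(act_seq.shape[0]):
        z = transition(z, act_seq[k])                       # roll the latent forward
        with torch.no_grad():
            target = target_encoder(obs_seq[k + 1])         # EMA-encoded future observation
        loss = loss - F.cosine_similarity(projector(z), target, dim=-1).mean()
    return loss / act_seq.shape[0]
```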

arXiv:2007.05756 (https://arxiv.org/abs/2007.05756) [pdf, other]
Subjects: cs.CV; cs.LG; stat.ML
Title: Generative Compositional Augmentations for Scene Graph Prediction
Authors: Boris Knyazev, Harm de Vries, Cătălina Cangea, Graham W. Taylor, Aaron Courville, Eugene Belilovsky
Abstract: Inferring objects and their relationships from an image in the form of a scene graph is useful in many applications at the intersection of vision and language. We consider a challenging problem of compositional generalization that emerges in this task due to a long tail data distribution. Current scene graph generation models are trained on a tiny fraction of the distribution corresponding to the most frequent compositions, e.g. <cup, on, table>. However, test images might contain zero- and few-shot compositions of objects and relationships, e.g. <cup, on, surfboard>. Despite each of the object categories and the predicate (e.g. 'on') being frequent in the training data, the models often fail to properly understand such unseen or rare compositions. To improve generalization, it is natural to attempt increasing the diversity of the training distribution. However, in the graph domain this is non-trivial. To that end, we propose a method to synthesize rare yet plausible scene graphs by perturbing real ones. We then propose and empirically study a model based on conditional generative adversarial networks (GANs) that allows us to generate visual features of perturbed scene graphs and learn from them in a joint fashion. When evaluated on the Visual Genome dataset, our approach yields marginal but consistent improvements in zero- and few-shot metrics. We analyze the limitations of our approach, indicating promising directions for future research.
Submitted: 1 October, 2021; v1 submitted 11 July, 2020; originally announced July 2020.
Comments: ICCV 2021 camera-ready. Added more baselines, combining GANs with Neural Motifs, and t-SNE visualizations. Code is available at https://github.com/bknyaz/sgg

arXiv:2006.05164 (https://arxiv.org/abs/2006.05164) [pdf, other]
Subjects: cs.LG; stat.ML
Title: AR-DAE: Towards Unbiased Neural Entropy Gradient Estimation
Authors: Jae Hyun Lim, Aaron Courville, Christopher Pal, Chin-Wei Huang
Abstract: Entropy is ubiquitous in machine learning, but it is in general intractable to compute the entropy of the distribution of an arbitrary continuous random variable. In this paper, we propose the amortized residual denoising autoencoder (AR-DAE) to approximate the gradient of the log density function, which can be used to estimate the gradient of entropy. Amortization allows us to significantly reduce the error of the gradient approximator by approaching the asymptotic optimality of a regular DAE, in which case the estimation is in theory unbiased. We conduct theoretical and experimental analyses on the approximation error of the proposed method, as well as extensive studies on heuristics to ensure its robustness. Finally, using the proposed gradient approximator to estimate the gradient of entropy, we demonstrate state-of-the-art performance on density estimation with variational autoencoders and continuous control with soft actor-critic.
Submitted: 9 June, 2020; originally announced June 2020.
Comments: Accepted at ICML 2020.
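The core construction, a residual denoising objective whose minimizer approaches the gradient of the log-density as the noise level shrinks, can be sketched as follows; the residual parameterization and the way the noise scale is sampled are simplified assumptions relative to the paper.

```python
# Sketch: train f(x, sigma) so that it approaches grad_x log p(x) as sigma -> 0.
# Minimizing E || sigma * f(x + sigma*eps, sigma) + eps ||^2 is the residual form of the
# standard DAE objective; amortization over sigma and other details follow the paper only loosely.
import torch

def residual_dae_loss(f, x, sigma_min=0.05, sigma_max=0.5):
    sigma = torch.empty(x.shape[0], 1).uniform_(sigma_min, sigma_max)  # amortize over noise scales
    eps = torch.randn_like(x)
    x_noisy = x + sigma * eps
    pred = f(x_noisy, sigma)                 # should approximate grad log p at x_noisy
    return ((sigma * pred + eps) ** 2).sum(dim=1).mean()
```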
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted in ICML 2020</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2003.14166">arXiv:2003.14166</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2003.14166">pdf</a>, <a href="https://arxiv.org/format/2003.14166">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1007/s11263-020-01322-1">10.1007/s11263-020-01322-1 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Pix2Shape: Towards Unsupervised Learning of 3D Scenes from Images using a View-based Representation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/stat?searchtype=author&amp;query=Rajeswar%2C+S">Sai Rajeswar</a>, <a href="/search/stat?searchtype=author&amp;query=Mannan%2C+F">Fahim Mannan</a>, <a href="/search/stat?searchtype=author&amp;query=Golemo%2C+F">Florian Golemo</a>, <a href="/search/stat?searchtype=author&amp;query=Parent-L%C3%A9vesque%2C+J">J茅r么me Parent-L茅vesque</a>, <a href="/search/stat?searchtype=author&amp;query=Vazquez%2C+D">David Vazquez</a>, <a href="/search/stat?searchtype=author&amp;query=Nowrouzezahrai%2C+D">Derek Nowrouzezahrai</a>, <a href="/search/stat?searchtype=author&amp;query=Courville%2C+A">Aaron Courville</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2003.14166v2-abstract-short" style="display: inline;"> We infer and generate three-dimensional (3D) scene information from a single input image and without supervision. This problem is under-explored, with most prior work relying on supervision from, e.g., 3D ground-truth, multiple images of a scene, image silhouettes or key-points. We propose Pix2Shape, an approach to solve this problem with four components: (i) an encoder that infers the latent 3D r&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2003.14166v2-abstract-full').style.display = 'inline'; document.getElementById('2003.14166v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2003.14166v2-abstract-full" style="display: none;"> We infer and generate three-dimensional (3D) scene information from a single input image and without supervision. This problem is under-explored, with most prior work relying on supervision from, e.g., 3D ground-truth, multiple images of a scene, image silhouettes or key-points. 
We propose Pix2Shape, an approach to solve this problem with four components: (i) an encoder that infers the latent 3D representation from an image, (ii) a decoder that generates an explicit 2.5D surfel-based reconstruction of a scene from the latent code, (iii) a differentiable renderer that synthesizes a 2D image from the surfel representation, and (iv) a critic network trained to discriminate between images generated by the decoder-renderer and those from a training distribution. Pix2Shape can generate complex 3D scenes that scale with the view-dependent on-screen resolution, unlike representations that capture world-space resolution, i.e., voxels or meshes. We show that Pix2Shape learns a consistent scene representation in its encoded latent space and that the decoder can then be applied to this latent representation in order to synthesize the scene from a novel viewpoint. We evaluate Pix2Shape with experiments on the ShapeNet dataset as well as on a novel benchmark we developed, called 3D-IQTT, to evaluate models based on their ability to enable 3D spatial reasoning. Qualitative and quantitative evaluations demonstrate Pix2Shape's ability to solve scene reconstruction, generation, and understanding tasks.
Submitted: 17 April, 2020; v1 submitted 22 March, 2020; originally announced March 2020.
Comments: This is a pre-print of an article published in the International Journal of Computer Vision. The final authenticated version is available online at https://doi.org/10.1007/s11263-020-01322-1
Journal reference: International Journal of Computer Vision (2020), 1-16.

arXiv:2003.00688 (https://arxiv.org/abs/2003.00688) [pdf, other]
Subjects: cs.LG; cs.AI; cs.NE; stat.ML
Title: Out-of-Distribution Generalization via Risk Extrapolation (REx)
Authors: David Krueger, Ethan Caballero, Joern-Henrik Jacobsen, Amy Zhang, Jonathan Binas, Dinghuai Zhang, Remi Le Priol, Aaron Courville
Abstract: Distributional shift is one of the major obstacles when transferring machine learning prediction systems from the lab to the real world. To tackle this problem, we assume that variation across training domains is representative of the variation we might encounter at test time, but also that shifts at test time may be more extreme in magnitude. In particular, we show that reducing differences in risk across training domains can reduce a model's sensitivity to a wide range of extreme distributional shifts, including the challenging setting where the input contains both causal and anti-causal elements.
We motivate this approach, Risk Extrapolation (REx), as a form of robust optimization over a perturbation set of extrapolated domains (MM-REx), and propose a penalty on the variance of training risks (V-REx) as a simpler variant. We prove that variants of REx can recover the causal mechanisms of the targets, while also providing some robustness to changes in the input distribution ("covariate shift"). By appropriately trading off robustness to causally induced distributional shifts and covariate shift, REx is able to outperform alternative methods such as Invariant Risk Minimization in situations where these types of shift co-occur.
Submitted: 25 February, 2021; v1 submitted 2 March, 2020; originally announced March 2020.
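The V-REx variant mentioned in the abstract has a particularly simple form: average the per-domain risks and penalize their variance. A minimal sketch, with the penalty weight beta treated as an assumed hyperparameter:

```python
# Sketch: V-REx objective = mean of per-domain risks + beta * variance of those risks.
import torch
import torch.nn.functional as F

def vrex_loss(model, domain_batches, beta=10.0):
    """domain_batches: list of (x, y) pairs, one per training domain."""
    risks = torch.stack([F.cross_entropy(model(x), y) for x, y in domain_batches])
    return risks.mean() + beta * risks.var()  # equalizing risks across domains
```

Setting beta to zero recovers simple averaging of the per-domain risks.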

arXiv:2002.07101 (https://arxiv.org/abs/2002.07101) [pdf, other]
Subjects: cs.LG; stat.ML
Title: Augmented Normalizing Flows: Bridging the Gap Between Generative Flows and Latent Variable Models
Authors: Chin-Wei Huang, Laurent Dinh, Aaron Courville
Abstract: In this work, we propose a new family of generative flows on an augmented data space, with an aim to improve expressivity without drastically increasing the computational cost of sampling and evaluation of a lower bound on the likelihood. Theoretically, we prove the proposed flow can approximate a Hamiltonian ODE as a universal transport map. Empirically, we demonstrate state-of-the-art performance on standard benchmarks of flow-based generative modeling.
Submitted: 17 February, 2020; originally announced February 2020.
Comments: 27 pages, 12 figures.
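Structurally, a flow on an augmented space alternates invertible updates of the data and of an auxiliary variable, each conditioned on the other. The sketch below shows one such coupling-style block only; the networks, the affine form, and the likelihood bound used in the paper are assumptions and are not reproduced here.

```python
# Sketch: one augmented coupling block. The data x and an auxiliary variable e are
# transformed alternately, each conditioned on the other; `net_e` and `net_x` are
# assumed networks returning (shift, log_scale) for 2-D inputs.
import torch

def augmented_coupling(x, e, net_e, net_x):
    # update the auxiliary variable conditioned on the data
    shift_e, log_scale_e = net_e(x)
    e = e * torch.exp(log_scale_e) + shift_e
    # update the data conditioned on the (new) auxiliary variable
    shift_x, log_scale_x = net_x(e)
    x = x * torch.exp(log_scale_x) + shift_x
    # total log|det Jacobian| of the block, needed for the likelihood bound
    log_det = log_scale_e.sum(dim=1) + log_scale_x.sum(dim=1)
    return x, e, log_det
```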

arXiv:1911.05248 (https://arxiv.org/abs/1911.05248) [pdf, other]
Subjects: cs.LG; cs.AI; cs.CV; cs.HC; stat.ML
Title: What Do Compressed Deep Neural Networks Forget?
Authors: Sara Hooker, Aaron Courville, Gregory Clark, Yann Dauphin, Andrea Frome
Abstract: Deep neural network pruning and quantization techniques have demonstrated that it is possible to achieve high levels of compression with surprisingly little degradation to test set accuracy. However, this measure of performance conceals significant differences in how different classes and images are impacted by model compression techniques. We find that models with radically different numbers of weights have comparable top-line performance metrics but diverge considerably in behavior on a narrow subset of the dataset. This small subset of data points, which we term Pruning Identified Exemplars (PIEs), are systematically more impacted by the introduction of sparsity. Compression disproportionately impacts model performance on the underrepresented long tail of the data distribution. PIEs over-index on atypical or noisy images that are far more challenging for both humans and algorithms to classify. Our work provides intuition into the role of capacity in deep neural networks and the trade-offs incurred by compression. An understanding of this disparate impact is critical given the widespread deployment of compressed models in the wild.
Submitted: 5 September, 2021; v1 submitted 12 November, 2019; originally announced November 2019.
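One plausible way to operationalize the narrow subset described above is to compare modal predictions across a population of uncompressed models and a population of compressed models and flag the examples where the two disagree; the helper below and its disagreement rule are assumptions, not necessarily the paper's exact PIE criterion.

```python
# Sketch: flag examples whose modal prediction differs between a population of
# uncompressed models and a population of compressed models. `preds_*` are integer
# arrays of shape (n_models, n_examples) holding predicted class ids.
import numpy as np

def _modal(preds):
    # modal predicted class per example (column-wise mode)
    return np.apply_along_axis(lambda col: np.bincount(col).argmax(), 0, preds)

def find_pies(preds_uncompressed, preds_compressed):
    return np.flatnonzero(_modal(preds_uncompressed) != _modal(preds_compressed))
```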

arXiv:1910.09570 (https://arxiv.org/abs/1910.09570) [pdf, other]
Subjects: q-bio.QM; cs.CV; eess.SP; stat.AP; stat.ML
Title: Icentia11K: An Unsupervised Representation Learning Dataset for Arrhythmia Subtype Discovery
Authors: Shawn Tan, Guillaume Androz, Ahmad Chamseddine, Pierre Fecteau, Aaron Courville, Yoshua Bengio, Joseph Paul Cohen
Abstract: We release the largest public ECG dataset of continuous raw signals for representation learning, containing 11 thousand patients and 2 billion labelled beats. Our goal is to enable semi-supervised ECG models to be made, as well as to discover unknown subtypes of arrhythmia and anomalous ECG signal events. To this end, we propose an unsupervised representation learning task, evaluated in a semi-supervised fashion. We provide a set of baselines for different feature extractors that can be built upon. Additionally, we perform qualitative evaluations on results from PCA embeddings, where we identify some clustering of known subtypes, indicating the potential for representation learning in arrhythmia subtype discovery.
Submitted: 21 October, 2019; originally announced October 2019.
Comments: Under review.

arXiv:1908.02388 (https://arxiv.org/abs/1908.02388) [pdf, other]
Subjects: cs.LG; stat.ML
Title: Benchmarking Bonus-Based Exploration Methods on the Arcade Learning Environment
Authors: Adrien Ali Taïga, William Fedus, Marlos C. Machado, Aaron Courville, Marc G. Bellemare
Abstract: This paper provides an empirical evaluation of recently developed exploration algorithms within the Arcade Learning Environment (ALE). We study the use of different reward bonuses that incentivize exploration in reinforcement learning. We do so by fixing the learning algorithm used and focusing only on the impact of the different exploration bonuses on the agent's performance. We use Rainbow, the state-of-the-art algorithm for value-based agents, and focus on some of the bonuses proposed in the last few years. We consider the impact these algorithms have on performance within the popular game Montezuma's Revenge, which has gathered a lot of interest from the exploration community, across the set of seven games identified by Bellemare et al. (2016) as challenging for exploration, and on easier games where exploration is not an issue. We find that, in our setting, recently developed bonuses do not provide significantly improved performance on Montezuma's Revenge or hard exploration games. We also find that existing bonus-based methods may negatively impact performance on games in which exploration is not an issue and may even perform worse than $\epsilon$-greedy exploration.
Submitted: 24 September, 2021; v1 submitted 6 August, 2019; originally announced August 2019.
Comments: Accepted at the second Exploration in Reinforcement Learning Workshop at the 36th International Conference on Machine Learning, Long Beach, California. The full version, arxiv.org/abs/2109.11052, was published as a conference paper at ICLR 2020.
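For reference, the simplest member of the bonus family being compared adds a count-based term to the environment reward. The sketch below is that generic scheme only, with the hashing of states and the coefficient beta as illustrative assumptions; none of the specific bonuses evaluated in the paper are reproduced here.

```python
# Sketch: a generic count-based exploration bonus r' = r + beta / sqrt(N(s)).
from collections import defaultdict
import math

class CountBonus:
    def __init__(self, beta=0.01):
        self.beta = beta
        self.counts = defaultdict(int)      # visit counts keyed by a hashable state id

    def shaped_reward(self, state_key, reward):
        self.counts[state_key] += 1
        return reward + self.beta / math.sqrt(self.counts[state_key])
```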

arXiv:1906.09691 (https://arxiv.org/abs/1906.09691) [pdf, other]
Subjects: cs.LG; stat.ML
Title: Adversarial Computation of Optimal Transport Maps
Authors: Jacob Leygonie, Jennifer She, Amjad Almahairi, Sai Rajeswar, Aaron Courville
Abstract: Computing optimal transport maps between high-dimensional and continuous distributions is a challenging problem in optimal transport (OT). Generative adversarial networks (GANs) are powerful generative models which have been successfully applied to learn maps across high-dimensional domains. However, little is known about the nature of the map learned with a GAN objective. To address this problem, we propose a generative adversarial model in which the discriminator's objective is the $2$-Wasserstein metric. We show that during training, our generator follows the $W_2$-geodesic between the initial and the target distributions. As a consequence, it reproduces an optimal map at the end of training. We validate our approach empirically in both low-dimensional and high-dimensional continuous settings, and show that it outperforms prior methods on image data.
Submitted: 23 June, 2019; originally announced June 2019.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1906.04282">arXiv:1906.04282</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1906.04282">pdf</a>, <a href="https://arxiv.org/format/1906.04282">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Stochastic Neural Network with Kronecker Flow </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/stat?searchtype=author&amp;query=Huang%2C+C">Chin-Wei Huang</a>, <a href="/search/stat?searchtype=author&amp;query=Touati%2C+A">Ahmed Touati</a>, <a href="/search/stat?searchtype=author&amp;query=Vincent%2C+P">Pascal Vincent</a>, <a href="/search/stat?searchtype=author&amp;query=Dziugaite%2C+G+K">Gintare Karolina Dziugaite</a>, <a href="/search/stat?searchtype=author&amp;query=Lacoste%2C+A">Alexandre Lacoste</a>, <a href="/search/stat?searchtype=author&amp;query=Courville%2C+A">Aaron Courville</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1906.04282v2-abstract-short" style="display: inline;"> Recent advances in variational inference enable the modelling of highly structured joint distributions, but are limited in their capacity to scale to the high-dimensional setting of stochastic neural networks. This limitation motivates a need for scalable parameterizations of the noise generation process, in a manner that adequately captures the dependencies among the various parameters. In this w&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1906.04282v2-abstract-full').style.display = 'inline'; document.getElementById('1906.04282v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1906.04282v2-abstract-full" style="display: none;"> Recent advances in variational inference enable the modelling of highly structured joint distributions, but are limited in their capacity to scale to the high-dimensional setting of stochastic neural networks. This limitation motivates a need for scalable parameterizations of the noise generation process, in a manner that adequately captures the dependencies among the various parameters. In this work, we address this need and present the Kronecker Flow, a generalization of the Kronecker product to invertible mappings designed for stochastic neural networks. We apply our method to variational Bayesian neural networks on predictive tasks, PAC-Bayes generalization bound estimation, and approximate Thompson sampling in contextual bandits. In all setups, our methods prove to be competitive with existing methods and better than the baselines. 
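<p class="mathjax"> As background for the Kronecker Flow entry above, recall the standard Kronecker-product identity (generic notation, not taken from the paper): $$ (A \otimes B)\,\mathrm{vec}(X) \;=\; \mathrm{vec}\!\left(B X A^{\top}\right), $$ so a linear map on $d_1 d_2$ dimensions can be stored with only $d_1^2 + d_2^2$ parameters and applied through two small matrix products, which is the sort of scalability such factored parameterizations aim for. </p>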
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1906.04282v2-abstract-full').style.display = 'none'; document.getElementById('1906.04282v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 June, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Proceedings of the 23rdInternational Conference on ArtificialIntelligence and Statistics (AISTATS) 2020</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1906.03708">arXiv:1906.03708</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1906.03708">pdf</a>, <a href="https://arxiv.org/format/1906.03708">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Note on the bias and variance of variational inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/stat?searchtype=author&amp;query=Huang%2C+C">Chin-Wei Huang</a>, <a href="/search/stat?searchtype=author&amp;query=Courville%2C+A">Aaron Courville</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1906.03708v1-abstract-short" style="display: inline;"> In this note, we study the relationship between the variational gap and the variance of the (log) likelihood ratio. We show that the gap can be upper bounded by some form of dispersion measure of the likelihood ratio, which suggests the bias of variational inference can be reduced by making the distribution of the likelihood ratio more concentrated, such as via averaging and variance reduction. </span> <span class="abstract-full has-text-grey-dark mathjax" id="1906.03708v1-abstract-full" style="display: none;"> In this note, we study the relationship between the variational gap and the variance of the (log) likelihood ratio. We show that the gap can be upper bounded by some form of dispersion measure of the likelihood ratio, which suggests the bias of variational inference can be reduced by making the distribution of the likelihood ratio more concentrated, such as via averaging and variance reduction. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1906.03708v1-abstract-full').style.display = 'none'; document.getElementById('1906.03708v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 June, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2019. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1905.12760">arXiv:1905.12760</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1905.12760">pdf</a>, <a href="https://arxiv.org/format/1905.12760">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Batch weight for domain adaptation with mass shift </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/stat?searchtype=author&amp;query=Bi%C5%84kowski%2C+M">Miko艂aj Bi艅kowski</a>, <a href="/search/stat?searchtype=author&amp;query=Hjelm%2C+R+D">R Devon Hjelm</a>, <a href="/search/stat?searchtype=author&amp;query=Courville%2C+A">Aaron Courville</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1905.12760v1-abstract-short" style="display: inline;"> Unsupervised domain transfer is the task of transferring or translating samples from a source distribution to a different target distribution. Current solutions unsupervised domain transfer often operate on data on which the modes of the distribution are well-matched, for instance have the same frequencies of classes between source and target distributions. However, these models do not perform wel&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1905.12760v1-abstract-full').style.display = 'inline'; document.getElementById('1905.12760v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1905.12760v1-abstract-full" style="display: none;"> Unsupervised domain transfer is the task of transferring or translating samples from a source distribution to a different target distribution. Current solutions unsupervised domain transfer often operate on data on which the modes of the distribution are well-matched, for instance have the same frequencies of classes between source and target distributions. However, these models do not perform well when the modes are not well-matched, as would be the case when samples are drawn independently from two different, but related, domains. This mode imbalance is problematic as generative adversarial networks (GANs), a successful approach in this setting, are sensitive to mode frequency, which results in a mismatch of semantics between source samples and generated samples of the target distribution. We propose a principled method of re-weighting training samples to correct for such mass shift between the transferred distributions, which we call batch-weight. 
We also provide a rigorous probabilistic setting for domain transfer and a new simplified objective for training transfer networks, an alternative to complex, multi-component loss functions used in current state-of-the-art image-to-image translation models. The new objective stems from the discrimination of joint distributions and enforces cycle-consistency in an abstract, high-level, rather than pixel-wise, sense. Lastly, we experimentally show the effectiveness of the proposed methods in several image-to-image translation tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1905.12760v1-abstract-full').style.display = 'none'; document.getElementById('1905.12760v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 May, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2019. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1905.04866">arXiv:1905.04866</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1905.04866">pdf</a>, <a href="https://arxiv.org/format/1905.04866">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Hierarchical Importance Weighted Autoencoders </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/stat?searchtype=author&amp;query=Huang%2C+C">Chin-Wei Huang</a>, <a href="/search/stat?searchtype=author&amp;query=Sankaran%2C+K">Kris Sankaran</a>, <a href="/search/stat?searchtype=author&amp;query=Dhekane%2C+E">Eeshan Dhekane</a>, <a href="/search/stat?searchtype=author&amp;query=Lacoste%2C+A">Alexandre Lacoste</a>, <a href="/search/stat?searchtype=author&amp;query=Courville%2C+A">Aaron Courville</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1905.04866v1-abstract-short" style="display: inline;"> Importance weighted variational inference (Burda et al., 2015) uses multiple i.i.d. samples to obtain a tighter variational lower bound. We believe a joint proposal has the potential of reducing the number of redundant samples, and introduce a hierarchical structure to induce correlation. The hope is that the proposals would coordinate to make up for the error made by one another to reduce the varia&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1905.04866v1-abstract-full').style.display = 'inline'; document.getElementById('1905.04866v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1905.04866v1-abstract-full" style="display: none;"> Importance weighted variational inference (Burda et al., 2015) uses multiple i.i.d. samples to obtain a tighter variational lower bound. We believe a joint proposal has the potential of reducing the number of redundant samples, and introduce a hierarchical structure to induce correlation. The hope is that the proposals would coordinate to make up for the error made by one another to reduce the variance of the importance estimator. 
Theoretically, we analyze the condition under which convergence of the estimator variance can be connected to convergence of the lower bound. Empirically, we confirm that maximization of the lower bound does implicitly minimize variance. Further analysis shows that this is a result of negative correlation induced by the proposed hierarchical meta sampling scheme, and performance of inference also improves when the number of samples increases. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1905.04866v1-abstract-full').style.display = 'none'; document.getElementById('1905.04866v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 May, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICML 2019. 17 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1903.07227">arXiv:1903.07227</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1903.07227">pdf</a>, <a href="https://arxiv.org/format/1903.07227">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Counterpoint by Convolution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/stat?searchtype=author&amp;query=Huang%2C+C+A">Cheng-Zhi Anna Huang</a>, <a href="/search/stat?searchtype=author&amp;query=Cooijmans%2C+T">Tim Cooijmans</a>, <a href="/search/stat?searchtype=author&amp;query=Roberts%2C+A">Adam Roberts</a>, <a href="/search/stat?searchtype=author&amp;query=Courville%2C+A">Aaron Courville</a>, <a href="/search/stat?searchtype=author&amp;query=Eck%2C+D">Douglas Eck</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1903.07227v1-abstract-short" style="display: inline;"> Machine learning models of music typically break up the task of composition into a chronological process, composing a piece of music in a single pass from beginning to end. On the contrary, human composers write music in a nonlinear fashion, scribbling motifs here and there, often revisiting choices previously made. 
In order to better approximate this process, we train a convolutional neural netwo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1903.07227v1-abstract-full').style.display = 'inline'; document.getElementById('1903.07227v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1903.07227v1-abstract-full" style="display: none;"> Machine learning models of music typically break up the task of composition into a chronological process, composing a piece of music in a single pass from beginning to end. On the contrary, human composers write music in a nonlinear fashion, scribbling motifs here and there, often revisiting choices previously made. In order to better approximate this process, we train a convolutional neural network to complete partial musical scores, and explore the use of blocked Gibbs sampling as an analogue to rewriting. Neither the model nor the generative procedure are tied to a particular causal direction of composition. Our model is an instance of orderless NADE (Uria et al., 2014), which allows more direct ancestral sampling. However, we find that Gibbs sampling greatly improves sample quality, which we demonstrate to be due to some conditional distributions being poorly modeled. Moreover, we show that even the cheap approximate blocked Gibbs procedure from Yao et al. (2014) yields better samples than ancestral sampling, based on both log-likelihood and human evaluation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1903.07227v1-abstract-full').style.display = 'none'; document.getElementById('1903.07227v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 March, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2019. 
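<p> To make the blocked Gibbs procedure described in the Counterpoint by Convolution abstract above concrete, here is a minimal, hypothetical sketch; the callable predict_probs stands in for a trained completion model and is an assumption of this sketch, not code from the paper. </p> <pre>
import numpy as np

def blocked_gibbs_completion(score, predict_probs, n_steps=200, block_frac=0.25, seed=0):
    """Iteratively rewrite random blocks of a piano-roll-like score.

    score:         integer array of pitch indices, shape (timesteps, voices)
    predict_probs: assumed callable (score, mask) -> probabilities of shape
                   (timesteps, voices, n_pitches) for the masked cells
    """
    rng = np.random.default_rng(seed)
    score = score.copy()
    for _ in range(n_steps):
        # pick a random block of cells to erase and rewrite
        mask = rng.random(score.shape) < block_frac
        probs = predict_probs(score, mask)
        # resample only the masked cells from the model's conditional distributions
        for idx in np.argwhere(mask):
            p = probs[tuple(idx)]
            score[tuple(idx)] = rng.choice(len(p), p=p)
    return score
</pre>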
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Proceedings of the 18th International Society for Music Information Retrieval Conference, ISMIR 2017</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> H.5.5; I.2 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1901.08508">arXiv:1901.08508</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1901.08508">pdf</a>, <a href="https://arxiv.org/format/1901.08508">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Maximum Entropy Generators for Energy-Based Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/stat?searchtype=author&amp;query=Kumar%2C+R">Rithesh Kumar</a>, <a href="/search/stat?searchtype=author&amp;query=Ozair%2C+S">Sherjil Ozair</a>, <a href="/search/stat?searchtype=author&amp;query=Goyal%2C+A">Anirudh Goyal</a>, <a href="/search/stat?searchtype=author&amp;query=Courville%2C+A">Aaron Courville</a>, <a href="/search/stat?searchtype=author&amp;query=Bengio%2C+Y">Yoshua Bengio</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1901.08508v2-abstract-short" style="display: inline;"> Maximum likelihood estimation of energy-based models is a challenging problem due to the intractability of the log-likelihood gradient. In this work, we propose learning both the energy function and an amortized approximate sampling mechanism using a neural generator network, which provides an efficient approximation of the log-likelihood gradient. The resulting objective requires maximizing entro&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1901.08508v2-abstract-full').style.display = 'inline'; document.getElementById('1901.08508v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1901.08508v2-abstract-full" style="display: none;"> Maximum likelihood estimation of energy-based models is a challenging problem due to the intractability of the log-likelihood gradient. In this work, we propose learning both the energy function and an amortized approximate sampling mechanism using a neural generator network, which provides an efficient approximation of the log-likelihood gradient. The resulting objective requires maximizing entropy of the generated samples, which we perform using recently proposed nonparametric mutual information estimators. Finally, to stabilize the resulting adversarial game, we use a zero-centered gradient penalty derived as a necessary condition from the score matching literature. The proposed technique can generate sharp images with Inception and FID scores competitive with recent GAN techniques, does not suffer from mode collapse, and is competitive with state-of-the-art anomaly detection techniques. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1901.08508v2-abstract-full').style.display = 'none'; document.getElementById('1901.08508v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 May, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 January, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2019. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1811.10097">arXiv:1811.10097</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1811.10097">pdf</a>, <a href="https://arxiv.org/format/1811.10097">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Planning in Dynamic Environments with Conditional Autoregressive Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/stat?searchtype=author&amp;query=Hansen%2C+J">Johanna Hansen</a>, <a href="/search/stat?searchtype=author&amp;query=Kastner%2C+K">Kyle Kastner</a>, <a href="/search/stat?searchtype=author&amp;query=Courville%2C+A">Aaron Courville</a>, <a href="/search/stat?searchtype=author&amp;query=Dudek%2C+G">Gregory Dudek</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1811.10097v1-abstract-short" style="display: inline;"> We demonstrate the use of conditional autoregressive generative models (van den Oord et al., 2016a) over a discrete latent space (van den Oord et al., 2017b) for forward planning with MCTS. In order to test this method, we introduce a new environment featuring varying difficulty levels, along with moving goals and obstacles. The combination of high-quality frame generation and classical planning a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1811.10097v1-abstract-full').style.display = 'inline'; document.getElementById('1811.10097v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1811.10097v1-abstract-full" style="display: none;"> We demonstrate the use of conditional autoregressive generative models (van den Oord et al., 2016a) over a discrete latent space (van den Oord et al., 2017b) for forward planning with MCTS. In order to test this method, we introduce a new environment featuring varying difficulty levels, along with moving goals and obstacles. The combination of high-quality frame generation and classical planning approaches nearly matches true environment performance for our task, demonstrating the usefulness of this method for model-based planning in dynamic environments. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1811.10097v1-abstract-full').style.display = 'none'; document.getElementById('1811.10097v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2018. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, 1 figure, in Proceedings of the Prediction and Generative Modeling in Reinforcement Learning Workshop at the International Conference on Machine Learning (ICML) in 2018</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1811.07426">arXiv:1811.07426</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1811.07426">pdf</a>, <a href="https://arxiv.org/format/1811.07426">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Harmonic Recomposition using Conditional Autoregressive Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/stat?searchtype=author&amp;query=Kastner%2C+K">Kyle Kastner</a>, <a href="/search/stat?searchtype=author&amp;query=Kumar%2C+R">Rithesh Kumar</a>, <a href="/search/stat?searchtype=author&amp;query=Cooijmans%2C+T">Tim Cooijmans</a>, <a href="/search/stat?searchtype=author&amp;query=Courville%2C+A">Aaron Courville</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1811.07426v1-abstract-short" style="display: inline;"> We demonstrate a conditional autoregressive pipeline for efficient music recomposition, based on methods presented in van den Oord et al.(2017). Recomposition (Casal &amp; Casey, 2010) focuses on reworking existing musical pieces, adhering to structure at a high level while also re-imagining other aspects of the work. This can involve reuse of pre-existing themes or parts of the original piece, while&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1811.07426v1-abstract-full').style.display = 'inline'; document.getElementById('1811.07426v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1811.07426v1-abstract-full" style="display: none;"> We demonstrate a conditional autoregressive pipeline for efficient music recomposition, based on methods presented in van den Oord et al.(2017). Recomposition (Casal &amp; Casey, 2010) focuses on reworking existing musical pieces, adhering to structure at a high level while also re-imagining other aspects of the work. 
This can involve reuse of pre-existing themes or parts of the original piece, while also requiring the flexibility to generate new content at different levels of granularity. Applying the aforementioned modeling pipeline to recomposition, we show diverse and structured generation conditioned on chord sequence annotations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1811.07426v1-abstract-full').style.display = 'none'; document.getElementById('1811.07426v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2018. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">3 pages, 2 figures. In Proceedings of The Joint Workshop on Machine Learning for Music, ICML 2018</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1811.07240">arXiv:1811.07240</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1811.07240">pdf</a>, <a href="https://arxiv.org/format/1811.07240">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Representation Mixing for TTS Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/stat?searchtype=author&amp;query=Kastner%2C+K">Kyle Kastner</a>, <a href="/search/stat?searchtype=author&amp;query=Santos%2C+J+F">João Felipe Santos</a>, <a href="/search/stat?searchtype=author&amp;query=Bengio%2C+Y">Yoshua Bengio</a>, <a href="/search/stat?searchtype=author&amp;query=Courville%2C+A">Aaron Courville</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1811.07240v2-abstract-short" style="display: inline;"> Recent character and phoneme-based parametric TTS systems using deep learning have shown strong performance in natural speech generation. However, the choice between character or phoneme input can create serious limitations for practical deployment, as direct control of pronunciation is crucial in certain cases. We demonstrate a simple method for combining multiple types of linguistic information&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1811.07240v2-abstract-full').style.display = 'inline'; document.getElementById('1811.07240v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1811.07240v2-abstract-full" style="display: none;"> Recent character and phoneme-based parametric TTS systems using deep learning have shown strong performance in natural speech generation. 
However, the choice between character or phoneme input can create serious limitations for practical deployment, as direct control of pronunciation is crucial in certain cases. We demonstrate a simple method for combining multiple types of linguistic information in a single encoder, named representation mixing, enabling flexible choice between character, phoneme, or mixed representations during inference. Experiments and user studies on a public audiobook corpus show the efficacy of our approach. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1811.07240v2-abstract-full').style.display = 'none'; document.getElementById('1811.07240v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 November, 2018; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 November, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2018. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 3 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1809.06848">arXiv:1809.06848</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1809.06848">pdf</a>, <a href="https://arxiv.org/format/1809.06848">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> On the Learning Dynamics of Deep Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/stat?searchtype=author&amp;query=Tachet%2C+R">Remi Tachet</a>, <a href="/search/stat?searchtype=author&amp;query=Pezeshki%2C+M">Mohammad Pezeshki</a>, <a href="/search/stat?searchtype=author&amp;query=Shabanian%2C+S">Samira Shabanian</a>, <a href="/search/stat?searchtype=author&amp;query=Courville%2C+A">Aaron Courville</a>, <a href="/search/stat?searchtype=author&amp;query=Bengio%2C+Y">Yoshua Bengio</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1809.06848v3-abstract-short" style="display: inline;"> While a lot of progress has been made in recent years, the dynamics of learning in deep nonlinear neural networks remain to this day largely misunderstood. In this work, we study the case of binary classification and prove various properties of learning in such networks under strong assumptions such as linear separability of the data. 
Extending existing results from the linear case, we confirm emp&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1809.06848v3-abstract-full').style.display = 'inline'; document.getElementById('1809.06848v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1809.06848v3-abstract-full" style="display: none;"> While a lot of progress has been made in recent years, the dynamics of learning in deep nonlinear neural networks remain to this day largely misunderstood. In this work, we study the case of binary classification and prove various properties of learning in such networks under strong assumptions such as linear separability of the data. Extending existing results from the linear case, we confirm empirical observations by proving that the classification error also follows a sigmoidal shape in nonlinear architectures. We show that given proper initialization, learning expounds parallel independent modes and that certain regions of parameter space might lead to failed training. We also demonstrate that input norm and features&#39; frequency in the dataset lead to distinct convergence speeds which might shed some light on the generalization capabilities of deep neural networks. We provide a comparison between the dynamics of learning with cross-entropy and hinge losses, which could prove useful to understand recent progress in the training of generative adversarial networks. Finally, we identify a phenomenon that we baptize gradient starvation where the most frequent features in a dataset prevent the learning of other less frequent but equally informative features. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1809.06848v3-abstract-full').style.display = 'none'; document.getElementById('1809.06848v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 December, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 September, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2018. 
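<p class="mathjax"> For reference on the loss comparison mentioned in the abstract above, with labels $y \in \{-1, +1\}$ and margin $m = y f(x)$ the two losses are $$ \ell_{\mathrm{CE}}(m) = \log\!\left(1 + e^{-m}\right), \qquad \ell_{\mathrm{hinge}}(m) = \max(0,\, 1 - m); $$ the logistic loss keeps a nonzero gradient at every finite margin, whereas the hinge loss gives exactly zero gradient once $m \ge 1$, one reason their training dynamics can differ. </p>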
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">19 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1809.01818">arXiv:1809.01818</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1809.01818">pdf</a>, <a href="https://arxiv.org/format/1809.01818">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Improving Explorability in Variational Inference with Annealed Variational Objectives </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/stat?searchtype=author&amp;query=Huang%2C+C">Chin-Wei Huang</a>, <a href="/search/stat?searchtype=author&amp;query=Tan%2C+S">Shawn Tan</a>, <a href="/search/stat?searchtype=author&amp;query=Lacoste%2C+A">Alexandre Lacoste</a>, <a href="/search/stat?searchtype=author&amp;query=Courville%2C+A">Aaron Courville</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1809.01818v3-abstract-short" style="display: inline;"> Despite the advances in the representational capacity of approximate distributions for variational inference, the optimization process can still limit the density that is ultimately learned. We demonstrate the drawbacks of biasing the true posterior to be unimodal, and introduce Annealed Variational Objectives (AVO) into the training of hierarchical variational methods. Inspired by Annealed Import&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1809.01818v3-abstract-full').style.display = 'inline'; document.getElementById('1809.01818v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1809.01818v3-abstract-full" style="display: none;"> Despite the advances in the representational capacity of approximate distributions for variational inference, the optimization process can still limit the density that is ultimately learned. We demonstrate the drawbacks of biasing the true posterior to be unimodal, and introduce Annealed Variational Objectives (AVO) into the training of hierarchical variational methods. Inspired by Annealed Importance Sampling, the proposed method facilitates learning by incorporating energy tempering into the optimization objective. In our experiments, we demonstrate our method&#39;s robustness to deterministic warm up, and the benefits of encouraging exploration in the latent space. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1809.01818v3-abstract-full').style.display = 'none'; document.getElementById('1809.01818v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 October, 2018; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 September, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2018. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear in NIPS 2018</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1808.09819">arXiv:1808.09819</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1808.09819">pdf</a>, <a href="https://arxiv.org/format/1808.09819">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Approximate Exploration through State Abstraction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/stat?searchtype=author&amp;query=Ta%C3%AFga%2C+A+A">Adrien Ali Ta茂ga</a>, <a href="/search/stat?searchtype=author&amp;query=Courville%2C+A">Aaron Courville</a>, <a href="/search/stat?searchtype=author&amp;query=Bellemare%2C+M+G">Marc G. Bellemare</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1808.09819v2-abstract-short" style="display: inline;"> Although exploration in reinforcement learning is well understood from a theoretical point of view, provably correct methods remain impractical. In this paper we study the interplay between exploration and approximation, what we call approximate exploration. Our main goal is to further our theoretical understanding of pseudo-count based exploration bonuses (Bellemare et al., 2016), a practical exp&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1808.09819v2-abstract-full').style.display = 'inline'; document.getElementById('1808.09819v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1808.09819v2-abstract-full" style="display: none;"> Although exploration in reinforcement learning is well understood from a theoretical point of view, provably correct methods remain impractical. In this paper we study the interplay between exploration and approximation, what we call approximate exploration. Our main goal is to further our theoretical understanding of pseudo-count based exploration bonuses (Bellemare et al., 2016), a practical exploration scheme based on density modelling. As a warm-up, we quantify the performance of an exploration algorithm, MBIE-EB (Strehl and Littman, 2008), when explicitly combined with state aggregation. This allows us to confirm that, as might be expected, approximation allows the agent to trade off between learning speed and quality of the learned policy. Next, we show how a given density model can be related to an abstraction and that the corresponding pseudo-count bonus can act as a substitute in MBIE-EB combined with this abstraction, but may lead to either under- or over-exploration. Then, we show that a given density model also defines an implicit abstraction, and find a surprising mismatch between pseudo-counts derived either implicitly or explicitly. Finally we derive a new pseudo-count bonus alleviating this issue. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1808.09819v2-abstract-full').style.display = 'none'; document.getElementById('1808.09819v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 January, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 August, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2018. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1808.04446">arXiv:1808.04446</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1808.04446">pdf</a>, <a href="https://arxiv.org/format/1808.04446">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Visual Reasoning with Multi-hop Feature Modulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/stat?searchtype=author&amp;query=Strub%2C+F">Florian Strub</a>, <a href="/search/stat?searchtype=author&amp;query=Seurin%2C+M">Mathieu Seurin</a>, <a href="/search/stat?searchtype=author&amp;query=Perez%2C+E">Ethan Perez</a>, <a href="/search/stat?searchtype=author&amp;query=de+Vries%2C+H">Harm de Vries</a>, <a href="/search/stat?searchtype=author&amp;query=Mary%2C+J">J茅r茅mie Mary</a>, <a href="/search/stat?searchtype=author&amp;query=Preux%2C+P">Philippe Preux</a>, <a href="/search/stat?searchtype=author&amp;query=Courville%2C+A">Aaron Courville</a>, <a href="/search/stat?searchtype=author&amp;query=Pietquin%2C+O">Olivier Pietquin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1808.04446v2-abstract-short" style="display: inline;"> Recent breakthroughs in computer vision and natural language processing have spurred interest in challenging multi-modal tasks such as visual question-answering and visual dialogue. For such tasks, one successful approach is to condition image-based convolutional network computation on language via Feature-wise Linear Modulation (FiLM) layers, i.e., per-channel scaling and shifting. We propose to&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1808.04446v2-abstract-full').style.display = 'inline'; document.getElementById('1808.04446v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1808.04446v2-abstract-full" style="display: none;"> Recent breakthroughs in computer vision and natural language processing have spurred interest in challenging multi-modal tasks such as visual question-answering and visual dialogue. For such tasks, one successful approach is to condition image-based convolutional network computation on language via Feature-wise Linear Modulation (FiLM) layers, i.e., per-channel scaling and shifting. 
We propose to generate the parameters of FiLM layers going up the hierarchy of a convolutional network in a multi-hop fashion rather than all at once, as in prior work. By alternating between attending to the language input and generating FiLM layer parameters, this approach is better able to scale to settings with longer input sequences such as dialogue. We demonstrate that multi-hop FiLM generation achieves state-of-the-art for the short input sequence task ReferIt --- on-par with single-hop FiLM generation --- while also significantly outperforming prior state-of-the-art and single-hop FiLM generation on the GuessWhat?! visual dialogue task. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1808.04446v2-abstract-full').style.display = 'none'; document.getElementById('1808.04446v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 October, 2018; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 August, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2018. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">In Proc of ECCV 2018</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1806.08734">arXiv:1806.08734</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1806.08734">pdf</a>, <a href="https://arxiv.org/format/1806.08734">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> On the Spectral Bias of Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/stat?searchtype=author&amp;query=Rahaman%2C+N">Nasim Rahaman</a>, <a href="/search/stat?searchtype=author&amp;query=Baratin%2C+A">Aristide Baratin</a>, <a href="/search/stat?searchtype=author&amp;query=Arpit%2C+D">Devansh Arpit</a>, <a href="/search/stat?searchtype=author&amp;query=Draxler%2C+F">Felix Draxler</a>, <a href="/search/stat?searchtype=author&amp;query=Lin%2C+M">Min Lin</a>, <a href="/search/stat?searchtype=author&amp;query=Hamprecht%2C+F+A">Fred A. Hamprecht</a>, <a href="/search/stat?searchtype=author&amp;query=Bengio%2C+Y">Yoshua Bengio</a>, <a href="/search/stat?searchtype=author&amp;query=Courville%2C+A">Aaron Courville</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1806.08734v3-abstract-short" style="display: inline;"> Neural networks are known to be a class of highly expressive functions able to fit even random input-output mappings with $100\%$ accuracy. In this work, we present properties of neural networks that complement this aspect of expressivity. 
By using tools from Fourier analysis, we show that deep ReLU networks are biased towards low frequency functions, meaning that they cannot have local fluctuatio&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1806.08734v3-abstract-full').style.display = 'inline'; document.getElementById('1806.08734v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1806.08734v3-abstract-full" style="display: none;"> Neural networks are known to be a class of highly expressive functions able to fit even random input-output mappings with $100\%$ accuracy. In this work, we present properties of neural networks that complement this aspect of expressivity. By using tools from Fourier analysis, we show that deep ReLU networks are biased towards low frequency functions, meaning that they cannot have local fluctuations without affecting their global behavior. Intuitively, this property is in line with the observation that over-parameterized networks find simple patterns that generalize across data samples. We also investigate how the shape of the data manifold affects expressivity by showing evidence that learning high frequencies gets \emph{easier} with increasing manifold complexity, and present a theoretical understanding of this behavior. Finally, we study the robustness of the frequency components with respect to parameter perturbation, to develop the intuition that the parameters must be finely tuned to express high frequency functions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1806.08734v3-abstract-full').style.display = 'none'; document.getElementById('1806.08734v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 May, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 June, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2018. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">23 pages</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> ICML 2019 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1806.06875">arXiv:1806.06875</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1806.06875">pdf</a>, <a href="https://arxiv.org/format/1806.06875">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/2792838.2800192">10.1145/2792838.2800192 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Learning Distributed Representations from Reviews for Collaborative Filtering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/stat?searchtype=author&amp;query=Almahairi%2C+A">Amjad Almahairi</a>, <a href="/search/stat?searchtype=author&amp;query=Kastner%2C+K">Kyle Kastner</a>, <a href="/search/stat?searchtype=author&amp;query=Cho%2C+K">Kyunghyun Cho</a>, <a href="/search/stat?searchtype=author&amp;query=Courville%2C+A">Aaron Courville</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1806.06875v1-abstract-short" style="display: inline;"> Recent work has shown that collaborative filter-based recommender systems can be improved by incorporating side information, such as natural language reviews, as a way of regularizing the derived product representations. Motivated by the success of this approach, we introduce two different models of reviews and study their effect on collaborative filtering performance. While the previous state-of-&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1806.06875v1-abstract-full').style.display = 'inline'; document.getElementById('1806.06875v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1806.06875v1-abstract-full" style="display: none;"> Recent work has shown that collaborative filter-based recommender systems can be improved by incorporating side information, such as natural language reviews, as a way of regularizing the derived product representations. Motivated by the success of this approach, we introduce two different models of reviews and study their effect on collaborative filtering performance. While the previous state-of-the-art approach is based on a latent Dirichlet allocation (LDA) model of reviews, the models we explore are neural network based: a bag-of-words product-of-experts model and a recurrent neural network. We demonstrate that the increased flexibility offered by the product-of-experts model allowed it to achieve state-of-the-art performance on the Amazon review dataset, outperforming the LDA-based approach. 
However, interestingly, the greater modeling power offered by the recurrent neural network appears to undermine the model&#39;s ability to act as a regularizer of the product representations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1806.06875v1-abstract-full').style.display = 'none'; document.getElementById('1806.06875v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 June, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2018. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published in RecSys 2015 conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1806.05236">arXiv:1806.05236</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1806.05236">pdf</a>, <a href="https://arxiv.org/format/1806.05236">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> Manifold Mixup: Better Representations by Interpolating Hidden States </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/stat?searchtype=author&amp;query=Verma%2C+V">Vikas Verma</a>, <a href="/search/stat?searchtype=author&amp;query=Lamb%2C+A">Alex Lamb</a>, <a href="/search/stat?searchtype=author&amp;query=Beckham%2C+C">Christopher Beckham</a>, <a href="/search/stat?searchtype=author&amp;query=Najafi%2C+A">Amir Najafi</a>, <a href="/search/stat?searchtype=author&amp;query=Mitliagkas%2C+I">Ioannis Mitliagkas</a>, <a href="/search/stat?searchtype=author&amp;query=Courville%2C+A">Aaron Courville</a>, <a href="/search/stat?searchtype=author&amp;query=Lopez-Paz%2C+D">David Lopez-Paz</a>, <a href="/search/stat?searchtype=author&amp;query=Bengio%2C+Y">Yoshua Bengio</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1806.05236v7-abstract-short" style="display: inline;"> Deep neural networks excel at learning the training data, but often provide incorrect and confident predictions when evaluated on slightly different test examples. This includes distribution shifts, outliers, and adversarial examples. 
Abstract: Deep neural networks excel at learning the training data, but often provide incorrect and confident predictions when evaluated on slightly different test examples. This includes distribution shifts, outliers, and adversarial examples. To address these issues, we propose Manifold Mixup, a simple regularizer that encourages neural networks to predict less confidently on interpolations of hidden representations. Manifold Mixup leverages semantic interpolations as additional training signal, obtaining neural networks with smoother decision boundaries at multiple levels of representation. As a result, neural networks trained with Manifold Mixup learn class-representations with fewer directions of variance. We prove theory on why this flattening happens under ideal conditions, validate it on practical situations, and connect it to previous works on information theory and generalization. In spite of incurring no significant computation and being implemented in a few lines of code, Manifold Mixup improves strong baselines in supervised learning, robustness to single-step adversarial attacks, and test log-likelihood.
Submitted 11 May, 2019; v1 submitted 13 June, 2018; originally announced June 2018.
Comments: To appear in ICML 2019
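
The abstract describes the mixing step only in words; the sketch below illustrates the interpolation it refers to. It is a rough reconstruction, not the authors' code: the `layer`/`weights` representation of the network, the Beta parameter `alpha`, and the uniform choice of mixing layer are assumptions made for the example.

```python
# Minimal NumPy sketch of the Manifold Mixup interpolation (illustrative only).
import numpy as np

rng = np.random.default_rng(0)

def layer(h, W, b):
    """One fully connected layer with a ReLU nonlinearity."""
    return np.maximum(h @ W + b, 0.0)

def manifold_mixup_forward(x1, y1, x2, y2, weights, alpha=2.0):
    """Forward pass that mixes two examples at a randomly chosen hidden layer."""
    lam = rng.beta(alpha, alpha)          # interpolation coefficient
    k = rng.integers(0, len(weights))     # layer at which to mix (k = 0 is input mixup)
    h1, h2 = x1, x2
    for W, b in weights[:k]:              # run both examples up to layer k
        h1, h2 = layer(h1, W, b), layer(h2, W, b)
    h = lam * h1 + (1 - lam) * h2         # interpolate the hidden representations
    for W, b in weights[k:]:              # continue with the mixed representation only
        h = layer(h, W, b)
    y_mix = lam * y1 + (1 - lam) * y2     # interpolate the (one-hot) labels identically
    return h, y_mix                       # the loss is computed between h and y_mix
```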

arXiv:1804.00779 [pdf, other] (cs.LG, stat.ML)
Neural Autoregressive Flows
Authors: Chin-Wei Huang, David Krueger, Alexandre Lacoste, Aaron Courville
Abstract: Normalizing flows and autoregressive models have been successfully combined to produce state-of-the-art results in density estimation, via Masked Autoregressive Flows (MAF), and to accelerate state-of-the-art WaveNet-based speech synthesis to 20x faster than real-time, via Inverse Autoregressive Flows (IAF). We unify and generalize these approaches, replacing the (conditionally) affine univariate transformations of MAF/IAF with a more general class of invertible univariate transformations expressed as monotonic neural networks. We demonstrate that the proposed neural autoregressive flows (NAF) are universal approximators for continuous probability distributions, and their greater expressivity allows them to better capture multimodal target distributions. Experimentally, NAF yields state-of-the-art performance on a suite of density estimation tasks and outperforms IAF in variational autoencoders trained on binarized MNIST.
Submitted 2 April, 2018; originally announced April 2018.
Comments: 16 pages, 10 figures, 3 tables
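
As a rough illustration of the building block the abstract mentions, here is one simple way to realize an invertible univariate transformation with a monotonic neural network. This is a sketch in the spirit of the paper, not its exact parameterization: the use of softplus and tanh, and the parameter names `w1`, `b1`, `w2`, `b2`, are assumptions; in a NAF these parameters would be produced per dimension by an autoregressive conditioner network.

```python
# Positive weights plus strictly increasing activations make the map strictly
# increasing in x, hence invertible (illustrative only).
import torch
import torch.nn.functional as F

def monotone_transform(x, w1, b1, w2, b2):
    """y(x) = sum_j softplus(w2)_j * tanh(softplus(w1)_j * x + b1_j) + b2, so dy/dx > 0."""
    a = F.softplus(w1)                         # positive first-layer weights
    u = F.softplus(w2)                         # positive output weights
    h = torch.tanh(a * x.unsqueeze(-1) + b1)   # shape (batch, hidden)
    return (u * h).sum(dim=-1) + b2            # strictly increasing in x
```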

arXiv:1802.01071 [pdf, other] (stat.ML, cs.LG)
Hierarchical Adversarially Learned Inference
Authors: Mohamed Ishmael Belghazi, Sai Rajeswar, Olivier Mastropietro, Negar Rostamzadeh, Jovana Mitrovic, Aaron Courville
Abstract: We propose a novel hierarchical generative model with a simple Markovian structure and a corresponding inference model. Both the generative and inference models are trained using the adversarial learning paradigm. We demonstrate that the hierarchical structure supports the learning of progressively more abstract representations as well as providing semantically meaningful reconstructions with different levels of fidelity. Furthermore, we show that minimizing the Jensen-Shannon divergence between the generative and inference networks is enough to minimize the reconstruction error. The resulting semantically meaningful hierarchical latent structure discovery is exemplified on the CelebA dataset. There, we show that the features learned by our model in an unsupervised way outperform the best handcrafted features. Furthermore, the extracted features remain competitive when compared to several recent deep supervised approaches on an attribute prediction task on CelebA. Finally, we leverage the model's inference network to achieve state-of-the-art performance on a semi-supervised variant of the MNIST digit classification task.
Submitted 3 February, 2018; originally announced February 2018.
Comments: 18 pages, 7 figures

arXiv:1801.04062 [pdf, other] (cs.LG, stat.ML)
MINE: Mutual Information Neural Estimation
Authors: Mohamed Ishmael Belghazi, Aristide Baratin, Sai Rajeswar, Sherjil Ozair, Yoshua Bengio, Aaron Courville, R Devon Hjelm
Abstract: We argue that the estimation of mutual information between high dimensional continuous random variables can be achieved by gradient descent over neural networks. We present a Mutual Information Neural Estimator (MINE) that is linearly scalable in dimensionality as well as in sample size, trainable through back-prop, and strongly consistent. We present a handful of applications on which MINE can be used to minimize or maximize mutual information. We apply MINE to improve adversarially trained generative models. We also use MINE to implement Information Bottleneck, applying it to supervised classification; our results demonstrate substantial improvement in flexibility and performance in these settings.
Submitted 14 August, 2021; v1 submitted 12 January, 2018; originally announced January 2018.
Comments: 19 pages, 6 figures
Journal ref: ICML 2018
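
The abstract only states that the estimator is trained by gradient descent; the sketch below shows the Donsker-Varadhan lower bound that such a neural estimator can ascend. It is illustrative, not the authors' code: the statistics-network architecture, the use of in-batch shuffling to sample the product of marginals, and the hyperparameters are assumptions, and the paper's bias-corrected gradient is omitted.

```python
# Minimal PyTorch sketch: train a statistics network T(x, z) by gradient ascent on
#     I(X; Z) >= E_p(x,z)[T(x, z)] - log E_p(x)p(z)[exp(T(x, z))].
import math
import torch
import torch.nn as nn

class StatisticsNetwork(nn.Module):
    def __init__(self, x_dim, z_dim, hidden=64):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(x_dim + z_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, 1),
        )

    def forward(self, x, z):
        return self.net(torch.cat([x, z], dim=1))

def mine_lower_bound(T, x, z):
    """Estimate the bound from one minibatch of paired samples (x_i, z_i)."""
    joint_term = T(x, z).mean()
    z_shuffled = z[torch.randperm(z.size(0))]   # break the pairing: samples from p(x)p(z)
    marginal_term = torch.logsumexp(T(x, z_shuffled), dim=0).squeeze() - math.log(z.size(0))
    return joint_term - marginal_term

# Usage sketch: maximize the bound with respect to T's parameters.
# T = StatisticsNetwork(x_dim, z_dim)
# optimizer = torch.optim.Adam(T.parameters(), lr=1e-4)
# loss = -mine_lower_bound(T, x_batch, z_batch); loss.backward(); optimizer.step()
```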

arXiv:1712.04120 [pdf, other] (stat.ML, cs.LG)
GibbsNet: Iterative Adversarial Inference for Deep Graphical Models
Authors: Alex Lamb, Devon Hjelm, Yaroslav Ganin, Joseph Paul Cohen, Aaron Courville, Yoshua Bengio
Abstract: Directed latent variable models that formulate the joint distribution as $p(x,z) = p(z) p(x \mid z)$ have the advantage of fast and exact sampling. However, these models have the weakness of needing to specify $p(z)$, often with a simple fixed prior that limits the expressiveness of the model. Undirected latent variable models discard the requirement that $p(z)$ be specified with a prior, yet sampling from them generally requires an iterative procedure such as blocked Gibbs sampling that may need many steps to draw samples from the joint distribution $p(x, z)$. We propose a novel approach to learning the joint distribution between the data and a latent code, which uses an adversarially learned iterative procedure to gradually refine the joint distribution, $p(x, z)$, to better match the data distribution at each step. GibbsNet is the best of both worlds in both theory and practice. Achieving the speed and simplicity of a directed latent variable model, it is guaranteed (assuming the adversarial game reaches the global minimum of the virtual training criterion) to produce samples from $p(x, z)$ with only a few sampling iterations. Achieving the expressiveness and flexibility of an undirected latent variable model, GibbsNet does away with the need for an explicit $p(z)$ and can do attribute prediction, class-conditional generation, and joint image-attribute modeling in a single model which is not trained for any of these specific tasks. We show empirically that GibbsNet is able to learn a more complex $p(z)$, and show that this leads to improved inpainting and iterative refinement of $p(x, z)$ for dozens of steps and stable generation without collapse for thousands of steps, despite being trained on only a few steps.
Submitted 11 December, 2017; originally announced December 2017.
Comments: NIPS 2017

arXiv:1710.04759 [pdf, other] (stat.ML, cs.AI, cs.LG)
Bayesian Hypernetworks
Authors: David Krueger, Chin-Wei Huang, Riashat Islam, Ryan Turner, Alexandre Lacoste, Aaron Courville
Abstract: We study Bayesian hypernetworks: a framework for approximate Bayesian inference in neural networks. A Bayesian hypernetwork $h$ is a neural network which learns to transform a simple noise distribution, $p(\epsilon) = \mathcal{N}(0, I)$, to a distribution $q(\theta) := q(h(\epsilon))$ over the parameters $\theta$ of another neural network (the "primary network"). We train $q$ with variational inference, using an invertible $h$ to enable efficient estimation of the variational lower bound on the posterior $p(\theta \mid \mathcal{D})$ via sampling. In contrast to most methods for Bayesian deep learning, Bayesian hypernets can represent a complex multimodal approximate posterior with correlations between parameters, while enabling cheap i.i.d. sampling of $q(\theta)$. In practice, Bayesian hypernets can provide a better defense against adversarial examples than dropout, and also exhibit competitive performance on a suite of tasks which evaluate model uncertainty, including regularization, active learning, and anomaly detection.
Submitted 24 April, 2018; v1 submitted 12 October, 2017; originally announced October 2017.
Comments: David Krueger and Chin-Wei Huang contributed equally
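
For readers unfamiliar with hypernetworks, the structural idea in the abstract can be sketched as follows. This shows only how a noise sample is turned into primary-network weights and used for a prediction; the architecture and layer sizes are assumptions, and the invertible construction of $h$ and the variational objective that the paper actually trains with are not shown.

```python
# Structural sketch of a hypernetwork h mapping noise eps ~ N(0, I) to a weight
# vector theta for a small primary network (illustrative only).
import torch
import torch.nn as nn

class Hypernetwork(nn.Module):
    def __init__(self, noise_dim, n_primary_params):
        super().__init__()
        self.h = nn.Sequential(
            nn.Linear(noise_dim, 128), nn.ReLU(),
            nn.Linear(128, n_primary_params),
        )

    def forward(self, eps):
        return self.h(eps)                 # one weight sample theta ~ q(theta)

def primary_forward(x, theta, in_dim=10, hidden=32, out_dim=2):
    """Run a one-hidden-layer primary network whose weights are unpacked from theta."""
    i = 0
    W1 = theta[i:i + in_dim * hidden].view(hidden, in_dim); i += in_dim * hidden
    b1 = theta[i:i + hidden]; i += hidden
    W2 = theta[i:i + hidden * out_dim].view(out_dim, hidden); i += hidden * out_dim
    b2 = theta[i:i + out_dim]
    return torch.relu(x @ W1.T + b1) @ W2.T + b2
```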

arXiv:1710.02248 [pdf, other] (cs.LG, cs.AI, stat.ML)
Learnable Explicit Density for Continuous Latent Space and Variational Inference
Authors: Chin-Wei Huang, Ahmed Touati, Laurent Dinh, Michal Drozdzal, Mohammad Havaei, Laurent Charlin, Aaron Courville
Abstract: In this paper, we study two aspects of the variational autoencoder (VAE): the prior distribution over the latent variables and its corresponding posterior. First, we decompose the learning of VAEs into layerwise density estimation, and argue that having a flexible prior is beneficial to both sample generation and inference. Second, we analyze the family of inverse autoregressive flows (inverse AF) and show that with further improvement, inverse AF could be used as a universal approximator to any complicated posterior. Our analysis results in a unified approach to parameterizing a VAE, without the need to restrict ourselves to using factorial Gaussians in the real-valued latent space.
Submitted 5 October, 2017; originally announced October 2017.
Comments: 2 figures, 5 pages, submitted to ICML Principled Approaches to Deep Learning workshop

arXiv:1709.07871 [pdf, other] (cs.CV, cs.AI, cs.CL, stat.ML)
FiLM: Visual Reasoning with a General Conditioning Layer
Authors: Ethan Perez, Florian Strub, Harm de Vries, Vincent Dumoulin, Aaron Courville
Abstract: We introduce a general-purpose conditioning method for neural networks called FiLM: Feature-wise Linear Modulation. FiLM layers influence neural network computation via a simple, feature-wise affine transformation based on conditioning information. We show that FiLM layers are highly effective for visual reasoning - answering image-related questions which require a multi-step, high-level process - a task which has proven difficult for standard deep learning methods that do not explicitly model reasoning. Specifically, we show on visual reasoning tasks that FiLM layers 1) halve state-of-the-art error for the CLEVR benchmark, 2) modulate features in a coherent manner, 3) are robust to ablations and architectural modifications, and 4) generalize well to challenging, new data from few examples or even zero-shot.
Submitted 18 December, 2017; v1 submitted 22 September, 2017; originally announced September 2017.
Comments: AAAI 2018. Code available at http://github.com/ethanjperez/film. Extends arXiv:1707.03017
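
The feature-wise affine transformation the abstract describes is simple enough to sketch directly. The sketch below is illustrative, not the authors' reference implementation (which is linked in the Comments above); the single linear conditioning layer, the tensor shapes, and the class name are assumptions for the example.

```python
# Minimal PyTorch sketch of a FiLM layer: y = gamma(c) * x + beta(c).
import torch
import torch.nn as nn

class FiLM(nn.Module):
    """Feature-wise Linear Modulation of a feature map by a conditioning vector."""
    def __init__(self, cond_dim, num_features):
        super().__init__()
        self.to_gamma_beta = nn.Linear(cond_dim, 2 * num_features)

    def forward(self, x, cond):
        # x: (batch, num_features, H, W) feature maps; cond: (batch, cond_dim)
        gamma, beta = self.to_gamma_beta(cond).chunk(2, dim=1)
        gamma = gamma.unsqueeze(-1).unsqueeze(-1)   # broadcast over spatial positions
        beta = beta.unsqueeze(-1).unsqueeze(-1)
        return gamma * x + beta
```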

arXiv:1707.03017 [pdf, other] (cs.CV, cs.AI, cs.CL, stat.ML)
Learning Visual Reasoning Without Strong Priors
Authors: Ethan Perez, Harm de Vries, Florian Strub, Vincent Dumoulin, Aaron Courville
Abstract: Achieving artificial visual reasoning - the ability to answer image-related questions which require a multi-step, high-level process - is an important step towards artificial general intelligence. This multi-modal task requires learning a question-dependent, structured reasoning process over images from language. Standard deep learning approaches tend to exploit biases in the data rather than learn this underlying structure, while leading methods learn to visually reason successfully but are hand-crafted for reasoning. We show that a general-purpose, Conditional Batch Normalization approach achieves state-of-the-art results on the CLEVR Visual Reasoning benchmark with a 2.4% error rate. We outperform the next best end-to-end method (4.5%) and even methods that use extra supervision (3.1%). We probe our model to shed light on how it reasons, showing it has learned a question-dependent, multi-step process. Previous work has operated under the assumption that visual reasoning calls for a specialized architecture, but we show that a general architecture with proper conditioning can learn to visually reason effectively.
Submitted 18 December, 2017; v1 submitted 10 July, 2017; originally announced July 2017.
Comments: Full AAAI 2018 paper is at arXiv:1709.07871. Presented at ICML 2017's Machine Learning in Speech and Language Processing Workshop. Code is at http://github.com/ethanjperez/film

arXiv:1706.05394 [pdf, other] (stat.ML, cs.LG)
A Closer Look at Memorization in Deep Networks
Authors: Devansh Arpit, Stanisław Jastrzębski, Nicolas Ballas, David Krueger, Emmanuel Bengio, Maxinder S. Kanwal, Tegan Maharaj, Asja Fischer, Aaron Courville, Yoshua Bengio, Simon Lacoste-Julien
Abstract: We examine the role of memorization in deep learning, drawing connections to capacity, generalization, and adversarial robustness. While deep networks are capable of memorizing noise data, our results suggest that they tend to prioritize learning simple patterns first. In our experiments, we expose qualitative differences in gradient-based optimization of deep neural networks (DNNs) on noise vs. real data. We also demonstrate that for appropriately tuned explicit regularization (e.g., dropout) we can degrade DNN training performance on noise datasets without compromising generalization on real data.
Our analysis suggests that notions of effective capacity which are dataset-independent are unlikely to explain the generalization performance of deep networks when trained with gradient-based methods, because the training data itself plays an important role in determining the degree of memorization.
Submitted 1 July, 2017; v1 submitted 16 June, 2017; originally announced June 2017.
Comments: Appears in Proceedings of the 34th International Conference on Machine Learning (ICML 2017). Devansh Arpit, Stanisław Jastrzębski, Nicolas Ballas, and David Krueger contributed equally to this work.