Search | arXiv e-print repository
Showing 1–30 of 30 results for author: Azar, M G

Searching in archive cs. Results sorted by announcement date (newest first), 50 results per page.
1. arXiv:2406.19188 [pdf, other] cs.LG
Averaging log-likelihoods in direct alignment
Authors: Nathan Grinsztajn, Yannis Flet-Berliac, Mohammad Gheshlaghi Azar, Florian Strub, Bill Wu, Eugene Choi, Chris Cremer, Arash Ahmadian, Yash Chandak, Olivier Pietquin, Matthieu Geist
Abstract: To better align Large Language Models (LLMs) with human judgment, Reinforcement Learning from Human Feedback (RLHF) learns a reward model and then optimizes it using regularized RL. Recently, direct alignment methods were introduced to learn such a fine-tuned model directly from a preference dataset without computing a proxy reward function. These methods are built upon contrastive losses involving the log-likelihood of (dis)preferred completions according to the trained model. However, completions have various lengths, and the log-likelihood is not length-invariant. On the other hand, the cross-entropy loss used in supervised training is length-invariant, as batches are typically averaged token-wise. To reconcile these approaches, we introduce a principled approach for making direct alignment length-invariant. Formally, we introduce a new averaging operator, to be composed with the optimality operator giving the best policy for the underlying RL problem. It translates into averaging the log-likelihood within the loss. We empirically study the effect of such averaging, observing a trade-off between the length of generations and their scores.
Submitted 27 June, 2024; originally announced June 2024.
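As a rough illustration of the length-invariance issue this abstract describes, the sketch below contrasts summed versus token-averaged sequence log-likelihoods inside a DPO-style contrastive loss. It is a minimal numpy sketch under stated assumptions: the function names, the sigmoid loss, and the beta scale are illustrative, not the paper's averaging operator.

```python
# Minimal sketch: length-averaged log-likelihoods in a DPO-style contrastive loss.
# All names and the specific loss form are illustrative, not taken from the paper.
import numpy as np

def sequence_logprob(token_logprobs, average=True):
    """Aggregate per-token log-probabilities of one completion.
    average=True is the length-invariant (token-averaged) variant discussed in the
    abstract; average=False is the usual summed log-likelihood."""
    token_logprobs = np.asarray(token_logprobs, dtype=float)
    return token_logprobs.mean() if average else token_logprobs.sum()

def contrastive_alignment_loss(lp_w, lp_l, lp_w_ref, lp_l_ref, beta=0.1):
    """-log sigmoid of the (policy minus reference) log-likelihood margin between
    preferred (w) and dispreferred (l) completions."""
    margin = beta * ((lp_w - lp_w_ref) - (lp_l - lp_l_ref))
    return -np.log(1.0 / (1.0 + np.exp(-margin)))
```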
2. arXiv:2406.19185 [pdf, other] cs.LG
Contrastive Policy Gradient: Aligning LLMs on sequence-level scores in a supervised-friendly fashion
Authors: Yannis Flet-Berliac, Nathan Grinsztajn, Florian Strub, Eugene Choi, Chris Cremer, Arash Ahmadian, Yash Chandak, Mohammad Gheshlaghi Azar, Olivier Pietquin, Matthieu Geist
Abstract: Reinforcement Learning (RL) has been used to finetune Large Language Models (LLMs) using a reward model trained from preference data, to better align with human judgment. The recently introduced direct alignment methods, which are often simpler, more stable, and computationally lighter, can more directly achieve this. However, these approaches cannot optimize arbitrary rewards, and the preference-based ones are not the only rewards of interest for LLMs (e.g., unit tests for code generation or textual entailment for summarization, among others). RL-finetuning is usually done with a variation of policy gradient, which calls for on-policy or near-on-policy samples, requiring costly generations. We introduce Contrastive Policy Gradient, or CoPG, a simple and mathematically principled new RL algorithm that can estimate the optimal policy even from off-policy data. It can be seen as an off-policy policy gradient approach that does not rely on importance sampling techniques and highlights the importance of using (the right) state baseline. We show this approach to generalize the direct alignment method IPO (identity preference optimization) and classic policy gradient. We experiment with the proposed CoPG on a toy bandit problem to illustrate its properties, as well as for finetuning LLMs on a summarization task, using a learned reward function considered as ground truth for the purpose of the experiments.
Submitted 27 June, 2024; originally announced June 2024.
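For context on the "state baseline" this abstract emphasises, the classic on-policy policy gradient with a prompt-level baseline $b(x)$ reads as follows; this is the textbook form the paper contrasts with, not CoPG itself:

```latex
\nabla_\theta J(\theta) \;=\;
\mathbb{E}_{x,\; y \sim \pi_\theta(\cdot\mid x)}
\Big[ \big( r(x, y) - b(x) \big)\, \nabla_\theta \log \pi_\theta(y \mid x) \Big]
```

Subtracting $b(x)$ leaves the on-policy gradient unbiased while reducing its variance; the abstract's point is that the choice of baseline becomes even more consequential when moving off-policy.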
3. arXiv:2406.01660 [pdf, other] cs.LG, cs.AI, stat.ML
Self-Improving Robust Preference Optimization
Authors: Eugene Choi, Arash Ahmadian, Matthieu Geist, Olivier Pietquin, Mohammad Gheshlaghi Azar
Abstract: Both online and offline RLHF methods such as PPO and DPO have been extremely successful in aligning AI with human preferences. Despite their success, the existing methods suffer from a fundamental problem: their optimal solution is highly task-dependent (i.e., not robust to out-of-distribution (OOD) tasks). Here we address this challenge by proposing Self-Improving Robust Preference Optimization (SRPO), a practical and mathematically principled offline RLHF framework that is completely robust to changes in the task. The key idea of SRPO is to cast the problem of learning from human preferences as a self-improvement process, which can be mathematically expressed in terms of a min-max objective that aims at joint optimization of the self-improvement policy and the generative policy in an adversarial fashion. The solution of this optimization problem is independent of the training task and thus robust to changes in it. We then show that this objective can be re-expressed in the form of a non-adversarial offline loss which can be optimized using standard supervised optimization techniques at scale, without any need for a reward model or online inference. We show the effectiveness of SRPO in terms of AI Win-Rate (WR) against human (GOLD) completions. In particular, when SRPO is evaluated on the OOD XSUM dataset, it outperforms the celebrated DPO by a clear margin of 15% after 5 self-revisions, achieving a WR of 90%.
Submitted 7 June, 2024; v1 submitted 3 June, 2024; originally announced June 2024.
4. arXiv:2405.19107 [pdf, ps, other] cs.LG, cs.AI
Offline Regularised Reinforcement Learning for Large Language Models Alignment
Authors: Pierre Harvey Richemond, Yunhao Tang, Daniel Guo, Daniele Calandriello, Mohammad Gheshlaghi Azar, Rafael Rafailov, Bernardo Avila Pires, Eugene Tarassov, Lucas Spangher, Will Ellsworth, Aliaksei Severyn, Jonathan Mallinson, Lior Shani, Gil Shamir, Rishabh Joshi, Tianqi Liu, Remi Munos, Bilal Piot
Abstract: The dominant framework for alignment of large language models (LLMs), whether through reinforcement learning from human feedback or direct preference optimisation, is to learn from preference data. This involves building datasets where each element is a quadruplet composed of a prompt, two independent responses (completions of the prompt) and a human preference between the two independent responses, yielding a preferred and a dis-preferred response. Such data is typically scarce and expensive to collect. On the other hand, single-trajectory datasets, where each element is a triplet composed of a prompt, a response and human feedback, are naturally more abundant. The canonical element of such datasets is for instance an LLM's response to a user's prompt followed by a user's feedback such as a thumbs-up/down. Consequently, in this work, we propose DRO, or Direct Reward Optimisation, as a framework and associated algorithms that do not require pairwise preferences. DRO uses a simple mean-squared objective that can be implemented in various ways. We validate our findings empirically, using T5 encoder-decoder language models, and show DRO's performance over selected baselines such as Kahneman-Tversky Optimization (KTO). Thus, we confirm that DRO is a simple and empirically compelling method for single-trajectory policy optimisation.
Submitted 29 May, 2024; originally announced May 2024.
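The abstract only states that DRO "uses a simple mean-squared objective". As a hedged sketch of what such a single-trajectory objective could look like (the value baseline, the KL weight beta, and the overall parameterisation are assumptions, not the paper's exact loss):

```python
# Hedged sketch of a mean-squared single-trajectory objective.
# The parameterisation below (value + beta * log policy ratio) is an assumption,
# not necessarily DRO's exact formulation.
def single_trajectory_loss(reward, logp_policy, logp_ref, value, beta=0.1):
    """Squared error between the observed scalar feedback and a KL-regularised
    implied reward built from the log policy ratio and a prompt-level value."""
    implied_reward = value + beta * (logp_policy - logp_ref)
    residual = reward - implied_reward
    return 0.5 * residual ** 2
```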
5. arXiv:2312.00886 [pdf, other] stat.ML, cs.AI, cs.GT, cs.LG, cs.MA
Nash Learning from Human Feedback
Authors: Rémi Munos, Michal Valko, Daniele Calandriello, Mohammad Gheshlaghi Azar, Mark Rowland, Zhaohan Daniel Guo, Yunhao Tang, Matthieu Geist, Thomas Mesnard, Andrea Michi, Marco Selvi, Sertan Girgin, Nikola Momchev, Olivier Bachem, Daniel J. Mankowitz, Doina Precup, Bilal Piot
Abstract: Reinforcement learning from human feedback (RLHF) has emerged as the main paradigm for aligning large language models (LLMs) with human preferences. Typically, RLHF involves the initial step of learning a reward model from human feedback, often expressed as preferences between pairs of text generations produced by a pre-trained LLM. Subsequently, the LLM's policy is fine-tuned by optimizing it to maximize the reward model through a reinforcement learning algorithm. However, an inherent limitation of current reward models is their inability to fully represent the richness of human preferences and their dependency on the sampling distribution. In this study, we introduce an alternative pipeline for the fine-tuning of LLMs using pairwise human feedback. Our approach entails the initial learning of a preference model, which is conditioned on two inputs given a prompt, followed by the pursuit of a policy that consistently generates responses preferred over those generated by any competing policy, thus defining the Nash equilibrium of this preference model. We term this approach Nash learning from human feedback (NLHF). In the context of a tabular policy representation, we present a novel algorithmic solution, Nash-MD, founded on the principles of mirror descent. This algorithm produces a sequence of policies, with the last iteration converging to the regularized Nash equilibrium. Additionally, we explore parametric representations of policies and introduce gradient descent algorithms for deep-learning architectures. To demonstrate the effectiveness of our approach, we present experimental results involving the fine-tuning of an LLM for a text summarization task. We believe NLHF offers a compelling avenue for preference learning and policy optimization with the potential of advancing the field of aligning LLMs with human preferences.
Submitted 11 June, 2024; v1 submitted 1 December, 2023; originally announced December 2023.
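The policy sought in this abstract, one whose responses are preferred over those of any competing policy, can be written as the max-min (Nash) solution of the preference model; the notation below is illustrative, and the paper works with a regularized version of this equilibrium:

```latex
\pi^{\ast} \;\in\; \arg\max_{\pi}\; \min_{\pi'}\;
\mathbb{E}_{x,\; y \sim \pi(\cdot\mid x),\; y' \sim \pi'(\cdot\mid x)}
\big[ \mathcal{P}(y \succ y' \mid x) \big]
```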
6. arXiv:2310.12036 [pdf, other] cs.AI, cs.LG, stat.ML
A General Theoretical Paradigm to Understand Learning from Human Preferences
Authors: Mohammad Gheshlaghi Azar, Mark Rowland, Bilal Piot, Daniel Guo, Daniele Calandriello, Michal Valko, Rémi Munos
Abstract: The prevalent deployment of learning from human preferences through reinforcement learning (RLHF) relies on two important approximations: the first assumes that pairwise preferences can be substituted with pointwise rewards. The second assumes that a reward model trained on these pointwise rewards can generalize from collected data to out-of-distribution data sampled by the policy. Recently, Direct Preference Optimisation (DPO) has been proposed as an approach that bypasses the second approximation and learns a policy directly from collected data without the reward modelling stage. However, this method still heavily relies on the first approximation. In this paper we try to gain a deeper theoretical understanding of these practical algorithms. In particular we derive a new general objective called $\Psi$PO for learning from human preferences that is expressed in terms of pairwise preferences and therefore bypasses both approximations. This new general objective allows us to perform an in-depth analysis of the behavior of RLHF and DPO (as special cases of $\Psi$PO) and to identify their potential pitfalls. We then consider another special case for $\Psi$PO by setting $\Psi$ simply to Identity, for which we can derive an efficient optimisation procedure, prove performance guarantees and demonstrate its empirical superiority to DPO on some illustrative examples.
Submitted 21 November, 2023; v1 submitted 18 October, 2023; originally announced October 2023.
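The identity-$\Psi$ special case mentioned at the end of the abstract (IPO) is commonly written as a squared regression of the preference log-ratio onto a constant target, with $\tau$ the regularisation strength; the constants below follow the commonly cited form rather than a verbatim quote of the paper:

```latex
\mathcal{L}(\theta) \;=\;
\mathbb{E}_{(x,\,y_w,\,y_l)}
\Bigg[ \bigg( \log\frac{\pi_\theta(y_w\mid x)\,\pi_{\mathrm{ref}}(y_l\mid x)}
{\pi_\theta(y_l\mid x)\,\pi_{\mathrm{ref}}(y_w\mid x)} \;-\; \frac{1}{2\tau} \bigg)^{2} \Bigg]
```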
7. arXiv:2305.13185 [pdf, other] cs.LG
Regularization and Variance-Weighted Regression Achieves Minimax Optimality in Linear MDPs: Theory and Practice
Authors: Toshinori Kitamura, Tadashi Kozuno, Yunhao Tang, Nino Vieillard, Michal Valko, Wenhao Yang, Jincheng Mei, Pierre Ménard, Mohammad Gheshlaghi Azar, Rémi Munos, Olivier Pietquin, Matthieu Geist, Csaba Szepesvári, Wataru Kumagai, Yutaka Matsuo
Abstract: Mirror descent value iteration (MDVI), an abstraction of Kullback-Leibler (KL) and entropy-regularized reinforcement learning (RL), has served as the basis for recent high-performing practical RL algorithms. However, despite the use of function approximation in practice, the theoretical understanding of MDVI has been limited to tabular Markov decision processes (MDPs). We study MDVI with linear function approximation through its sample complexity required to identify an $\varepsilon$-optimal policy with probability $1-\delta$ under the settings of an infinite-horizon linear MDP, generative model, and G-optimal design. We demonstrate that least-squares regression weighted by the variance of an estimated optimal value function of the next state is crucial to achieving minimax optimality. Based on this observation, we present Variance-Weighted Least-Squares MDVI (VWLS-MDVI), the first theoretical algorithm that achieves nearly minimax optimal sample complexity for infinite-horizon linear MDPs. Furthermore, we propose a practical VWLS algorithm for value-based deep RL, Deep Variance Weighting (DVW). Our experiments demonstrate that DVW improves the performance of popular value-based deep RL algorithms on a set of MinAtar benchmarks.
Submitted 22 May, 2023; originally announced May 2023.
Comments: ICML 2023 accepted
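The regression step this abstract builds on is, generically, least squares with per-sample weights inversely proportional to an estimated target variance. A minimal sketch of that generic step (the feature matrix Phi, the ridge term, and the variance floor are illustrative; weighting by the variance of the estimated optimal next-state value is the paper's specific choice):

```python
import numpy as np

def variance_weighted_least_squares(Phi, targets, variances, ridge=1e-6):
    """Solve argmin_theta sum_i (targets_i - Phi_i @ theta)^2 / variances_i, with a small ridge."""
    w = 1.0 / np.maximum(variances, 1e-8)            # down-weight high-variance targets
    A = Phi.T @ (w[:, None] * Phi) + ridge * np.eye(Phi.shape[1])
    b = Phi.T @ (w * targets)
    return np.linalg.solve(A, b)
```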
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.04462v3-abstract-full').style.display = 'none'; document.getElementById('2301.04462v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 January, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to JMLR</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2212.03319">arXiv:2212.03319</a> <span> [<a href="https://arxiv.org/pdf/2212.03319">pdf</a>, <a href="https://arxiv.org/format/2212.03319">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Understanding Self-Predictive Learning for Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tang%2C+Y">Yunhao Tang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Z+D">Zhaohan Daniel Guo</a>, <a href="/search/cs?searchtype=author&query=Richemond%2C+P+H">Pierre Harvey Richemond</a>, <a href="/search/cs?searchtype=author&query=Pires%2C+B+%C3%81">Bernardo 脕vila Pires</a>, <a href="/search/cs?searchtype=author&query=Chandak%2C+Y">Yash Chandak</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">R茅mi Munos</a>, <a href="/search/cs?searchtype=author&query=Rowland%2C+M">Mark Rowland</a>, <a href="/search/cs?searchtype=author&query=Azar%2C+M+G">Mohammad Gheshlaghi Azar</a>, <a href="/search/cs?searchtype=author&query=Lan%2C+C+L">Charline Le Lan</a>, <a href="/search/cs?searchtype=author&query=Lyle%2C+C">Clare Lyle</a>, <a href="/search/cs?searchtype=author&query=Gy%C3%B6rgy%2C+A">Andr谩s Gy枚rgy</a>, <a href="/search/cs?searchtype=author&query=Thakoor%2C+S">Shantanu Thakoor</a>, <a href="/search/cs?searchtype=author&query=Dabney%2C+W">Will Dabney</a>, <a href="/search/cs?searchtype=author&query=Piot%2C+B">Bilal Piot</a>, <a href="/search/cs?searchtype=author&query=Calandriello%2C+D">Daniele Calandriello</a>, <a href="/search/cs?searchtype=author&query=Valko%2C+M">Michal Valko</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2212.03319v1-abstract-short" style="display: inline;"> We study the learning dynamics of self-predictive learning for reinforcement learning, a family of algorithms that learn representations by minimizing the prediction error of their own future latent representations. 
9. arXiv:2212.03319 [pdf, other] cs.LG, cs.AI
Understanding Self-Predictive Learning for Reinforcement Learning
Authors: Yunhao Tang, Zhaohan Daniel Guo, Pierre Harvey Richemond, Bernardo Ávila Pires, Yash Chandak, Rémi Munos, Mark Rowland, Mohammad Gheshlaghi Azar, Charline Le Lan, Clare Lyle, András György, Shantanu Thakoor, Will Dabney, Bilal Piot, Daniele Calandriello, Michal Valko
Abstract: We study the learning dynamics of self-predictive learning for reinforcement learning, a family of algorithms that learn representations by minimizing the prediction error of their own future latent representations. Despite its recent empirical success, such algorithms have an apparent defect: trivial representations (such as constants) minimize the prediction error, yet it is obviously undesirable to converge to such solutions. Our central insight is that careful designs of the optimization dynamics are critical to learning meaningful representations. We identify that a faster-paced optimization of the predictor and semi-gradient updates on the representation are crucial to preventing representation collapse. Then, in an idealized setup, we show that the self-predictive learning dynamics carry out a spectral decomposition of the state transition matrix, effectively capturing information of the transition dynamics. Building on the theoretical insights, we propose bidirectional self-predictive learning, a novel self-predictive algorithm that learns two representations simultaneously. We examine the robustness of our theoretical insights with a number of small-scale experiments and showcase the promise of the novel representation learning algorithm with large-scale experiments.
Submitted 6 December, 2022; originally announced December 2022.
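The latent prediction loss and "semi-gradient" update described in the abstract can be summarised as follows, where sg denotes a stop-gradient on the target representation and the predictor $P$ is trained at a faster timescale; the notation is illustrative:

```latex
\mathcal{L}(\phi, P) \;=\;
\mathbb{E}\Big[ \big\| P\, f_\phi(x_t) \;-\; \mathrm{sg}\!\big( f_\phi(x_{t+1}) \big) \big\|_2^{2} \Big]
```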
10. arXiv:2206.08332 [pdf, other] cs.LG, cs.AI, stat.ML
BYOL-Explore: Exploration by Bootstrapped Prediction
Authors: Zhaohan Daniel Guo, Shantanu Thakoor, Miruna Pîslar, Bernardo Avila Pires, Florent Altché, Corentin Tallec, Alaa Saade, Daniele Calandriello, Jean-Bastien Grill, Yunhao Tang, Michal Valko, Rémi Munos, Mohammad Gheshlaghi Azar, Bilal Piot
Abstract: We present BYOL-Explore, a conceptually simple yet general approach for curiosity-driven exploration in visually-complex environments. BYOL-Explore learns a world representation, the world dynamics, and an exploration policy all together by optimizing a single prediction loss in the latent space with no additional auxiliary objective. We show that BYOL-Explore is effective in DM-HARD-8, a challenging partially-observable continuous-action hard-exploration benchmark with visually-rich 3-D environments. On this benchmark, we solve the majority of the tasks purely through augmenting the extrinsic reward with BYOL-Explore's intrinsic reward, whereas prior work could only get off the ground with human demonstrations. As further evidence of the generality of BYOL-Explore, we show that it achieves superhuman performance on the ten hardest exploration games in Atari while having a much simpler design than other competitive agents.
Submitted 16 June, 2022; originally announced June 2022.
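Schematically, the reward augmentation mentioned in this abstract adds a latent world-model prediction error as an intrinsic bonus on top of the environment reward. The symbols below ($g_\theta$ for the latent dynamics model, $f_{\mathrm{target}}$ for the target encoder, $b_t$ for the agent's belief state, $\lambda$ for the scale) are shorthand assumptions, not the paper's notation:

```latex
r_t^{\mathrm{total}} \;=\; r_t^{\mathrm{ext}} \;+\;
\lambda\, \big\| g_\theta(b_t, a_t) \;-\; \mathrm{sg}\big( f_{\mathrm{target}}(o_{t+1}) \big) \big\|_2^{2}
```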
Our analysis shows that it is nearly minimax-optimal for fi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.14211v1-abstract-full').style.display = 'inline'; document.getElementById('2205.14211v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2205.14211v1-abstract-full" style="display: none;"> In this work, we consider and analyze the sample complexity of model-free reinforcement learning with a generative model. Particularly, we analyze mirror descent value iteration (MDVI) by Geist et al. (2019) and Vieillard et al. (2020a), which uses the Kullback-Leibler divergence and entropy regularization in its value and policy updates. Our analysis shows that it is nearly minimax-optimal for finding an $\varepsilon$-optimal policy when $\varepsilon$ is sufficiently small. This is the first theoretical result that demonstrates that a simple model-free algorithm without variance-reduction can be nearly minimax-optimal under the considered setting. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.14211v1-abstract-full').style.display = 'none'; document.getElementById('2205.14211v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">29 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2111.02338">arXiv:2111.02338</a> <span> [<a href="https://arxiv.org/pdf/2111.02338">pdf</a>, <a href="https://arxiv.org/format/2111.02338">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Drop, Swap, and Generate: A Self-Supervised Approach for Generating Neural Activity </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+R">Ran Liu</a>, <a href="/search/cs?searchtype=author&query=Azabou%2C+M">Mehdi Azabou</a>, <a href="/search/cs?searchtype=author&query=Dabagia%2C+M">Max Dabagia</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+C">Chi-Heng Lin</a>, <a href="/search/cs?searchtype=author&query=Azar%2C+M+G">Mohammad Gheshlaghi Azar</a>, <a href="/search/cs?searchtype=author&query=Hengen%2C+K+B">Keith B. Hengen</a>, <a href="/search/cs?searchtype=author&query=Valko%2C+M">Michal Valko</a>, <a href="/search/cs?searchtype=author&query=Dyer%2C+E+L">Eva L. Dyer</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2111.02338v1-abstract-short" style="display: inline;"> Meaningful and simplified representations of neural activity can yield insights into how and what information is being processed within a neural circuit. However, without labels, finding representations that reveal the link between the brain and behavior can be challenging. 
Here, we introduce a novel unsupervised approach for learning disentangled representations of neural activity called Swap-VAE… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.02338v1-abstract-full').style.display = 'inline'; document.getElementById('2111.02338v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2111.02338v1-abstract-full" style="display: none;"> Meaningful and simplified representations of neural activity can yield insights into how and what information is being processed within a neural circuit. However, without labels, finding representations that reveal the link between the brain and behavior can be challenging. Here, we introduce a novel unsupervised approach for learning disentangled representations of neural activity called Swap-VAE. Our approach combines a generative modeling framework with an instance-specific alignment loss that tries to maximize the representational similarity between transformed views of the input (brain state). These transformed (or augmented) views are created by dropping out neurons and jittering samples in time, which intuitively should lead the network to a representation that maintains both temporal consistency and invariance to the specific neurons used to represent the neural state. Through evaluations on both synthetic data and neural recordings from hundreds of neurons in different primate brains, we show that it is possible to build representations that disentangle neural datasets along relevant latent dimensions linked to behavior. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.02338v1-abstract-full').style.display = 'none'; document.getElementById('2111.02338v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 November, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To be published in Neurips 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2102.10106">arXiv:2102.10106</a> <span> [<a href="https://arxiv.org/pdf/2102.10106">pdf</a>, <a href="https://arxiv.org/format/2102.10106">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Mine Your Own vieW: Self-Supervised Learning Through Across-Sample Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Azabou%2C+M">Mehdi Azabou</a>, <a href="/search/cs?searchtype=author&query=Azar%2C+M+G">Mohammad Gheshlaghi Azar</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+R">Ran Liu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+C">Chi-Heng Lin</a>, <a href="/search/cs?searchtype=author&query=Johnson%2C+E+C">Erik C. 
Johnson</a>, <a href="/search/cs?searchtype=author&query=Bhaskaran-Nair%2C+K">Kiran Bhaskaran-Nair</a>, <a href="/search/cs?searchtype=author&query=Dabagia%2C+M">Max Dabagia</a>, <a href="/search/cs?searchtype=author&query=Avila-Pires%2C+B">Bernardo Avila-Pires</a>, <a href="/search/cs?searchtype=author&query=Kitchell%2C+L">Lindsey Kitchell</a>, <a href="/search/cs?searchtype=author&query=Hengen%2C+K+B">Keith B. Hengen</a>, <a href="/search/cs?searchtype=author&query=Gray-Roncal%2C+W">William Gray-Roncal</a>, <a href="/search/cs?searchtype=author&query=Valko%2C+M">Michal Valko</a>, <a href="/search/cs?searchtype=author&query=Dyer%2C+E+L">Eva L. Dyer</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2102.10106v3-abstract-short" style="display: inline;"> State-of-the-art methods for self-supervised learning (SSL) build representations by maximizing the similarity between different transformed "views" of a sample. Without sufficient diversity in the transformations used to create views, however, it can be difficult to overcome nuisance variables in the data and build rich representations. This motivates the use of the dataset itself to find similar… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.10106v3-abstract-full').style.display = 'inline'; document.getElementById('2102.10106v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2102.10106v3-abstract-full" style="display: none;"> State-of-the-art methods for self-supervised learning (SSL) build representations by maximizing the similarity between different transformed "views" of a sample. Without sufficient diversity in the transformations used to create views, however, it can be difficult to overcome nuisance variables in the data and build rich representations. This motivates the use of the dataset itself to find similar, yet distinct, samples to serve as views for one another. In this paper, we introduce Mine Your Own vieW (MYOW), a new approach for self-supervised learning that looks within the dataset to define diverse targets for prediction. The idea behind our approach is to actively mine views, finding samples that are neighbors in the representation space of the network, and then predict, from one sample's latent representation, the representation of a nearby sample. After showing the promise of MYOW on benchmarks used in computer vision, we highlight the power of this idea in a novel application in neuroscience where SSL has yet to be applied. When tested on multi-unit neural recordings, we find that MYOW outperforms other self-supervised approaches in all examples (in some cases by more than 10%), and often surpasses the supervised baseline. With MYOW, we show that it is possible to harness the diversity of the data to build rich views and leverage self-supervision in new domains where augmentations are limited or unknown. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.10106v3-abstract-full').style.display = 'none'; document.getElementById('2102.10106v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 December, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 February, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2102.06514">arXiv:2102.06514</a> <span> [<a href="https://arxiv.org/pdf/2102.06514">pdf</a>, <a href="https://arxiv.org/format/2102.06514">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Large-Scale Representation Learning on Graphs via Bootstrapping </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Thakoor%2C+S">Shantanu Thakoor</a>, <a href="/search/cs?searchtype=author&query=Tallec%2C+C">Corentin Tallec</a>, <a href="/search/cs?searchtype=author&query=Azar%2C+M+G">Mohammad Gheshlaghi Azar</a>, <a href="/search/cs?searchtype=author&query=Azabou%2C+M">Mehdi Azabou</a>, <a href="/search/cs?searchtype=author&query=Dyer%2C+E+L">Eva L. Dyer</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">Rémi Munos</a>, <a href="/search/cs?searchtype=author&query=Veli%C4%8Dkovi%C4%87%2C+P">Petar Veličković</a>, <a href="/search/cs?searchtype=author&query=Valko%2C+M">Michal Valko</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2102.06514v3-abstract-short" style="display: inline;"> Self-supervised learning provides a promising path towards eliminating the need for costly label information in representation learning on graphs. However, to achieve state-of-the-art performance, methods often need large numbers of negative examples and rely on complex augmentations. This can be prohibitively expensive, especially for large graphs. To address these challenges, we introduce Bootst… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.06514v3-abstract-full').style.display = 'inline'; document.getElementById('2102.06514v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2102.06514v3-abstract-full" style="display: none;"> Self-supervised learning provides a promising path towards eliminating the need for costly label information in representation learning on graphs. However, to achieve state-of-the-art performance, methods often need large numbers of negative examples and rely on complex augmentations. This can be prohibitively expensive, especially for large graphs. To address these challenges, we introduce Bootstrapped Graph Latents (BGRL) - a graph representation learning method that learns by predicting alternative augmentations of the input. 
BGRL uses only simple augmentations and alleviates the need for contrasting with negative examples, and is thus scalable by design. BGRL outperforms or matches prior methods on several established benchmarks, while achieving a 2-10x reduction in memory costs. Furthermore, we show that BGRL can be scaled up to extremely large graphs with hundreds of millions of nodes in the semi-supervised regime - achieving state-of-the-art performance and improving over supervised baselines where representations are shaped only through label information. In particular, our solution centered on BGRL constituted one of the winning entries to the Open Graph Benchmark - Large Scale Challenge at KDD Cup 2021, on a graph orders of magnitudes larger than all previously available benchmarks, thus demonstrating the scalability and effectiveness of our approach. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.06514v3-abstract-full').style.display = 'none'; document.getElementById('2102.06514v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 February, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published as a conference paper at ICLR 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2101.02055">arXiv:2101.02055</a> <span> [<a href="https://arxiv.org/pdf/2101.02055">pdf</a>, <a href="https://arxiv.org/format/2101.02055">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Geometric Entropic Exploration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+Z+D">Zhaohan Daniel Guo</a>, <a href="/search/cs?searchtype=author&query=Azar%2C+M+G">Mohammad Gheshlaghi Azar</a>, <a href="/search/cs?searchtype=author&query=Saade%2C+A">Alaa Saade</a>, <a href="/search/cs?searchtype=author&query=Thakoor%2C+S">Shantanu Thakoor</a>, <a href="/search/cs?searchtype=author&query=Piot%2C+B">Bilal Piot</a>, <a href="/search/cs?searchtype=author&query=Pires%2C+B+A">Bernardo Avila Pires</a>, <a href="/search/cs?searchtype=author&query=Valko%2C+M">Michal Valko</a>, <a href="/search/cs?searchtype=author&query=Mesnard%2C+T">Thomas Mesnard</a>, <a href="/search/cs?searchtype=author&query=Lattimore%2C+T">Tor Lattimore</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">R茅mi Munos</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2101.02055v2-abstract-short" style="display: inline;"> Exploration is essential for solving complex Reinforcement Learning (RL) tasks. Maximum State-Visitation Entropy (MSVE) formulates the exploration problem as a well-defined policy optimization problem whose solution aims at visiting all states as uniformly as possible. 
This is in contrast to standard uncertainty-based approaches where exploration is transient and eventually vanishes. However, exis… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2101.02055v2-abstract-full').style.display = 'inline'; document.getElementById('2101.02055v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2101.02055v2-abstract-full" style="display: none;"> Exploration is essential for solving complex Reinforcement Learning (RL) tasks. Maximum State-Visitation Entropy (MSVE) formulates the exploration problem as a well-defined policy optimization problem whose solution aims at visiting all states as uniformly as possible. This is in contrast to standard uncertainty-based approaches where exploration is transient and eventually vanishes. However, existing approaches to MSVE are theoretically justified only for discrete state-spaces as they are oblivious to the geometry of continuous domains. We address this challenge by introducing Geometric Entropy Maximisation (GEM), a new algorithm that maximises the geometry-aware Shannon entropy of state-visits in both discrete and continuous domains. Our key theoretical contribution is casting geometry-aware MSVE exploration as a tractable problem of optimising a simple and novel noise-contrastive objective function. In our experiments, we show the efficiency of GEM in solving several RL problems with sparse rewards, compared against other deep RL exploration approaches. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2101.02055v2-abstract-full').style.display = 'none'; document.getElementById('2101.02055v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 January, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 January, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2021. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2008.12234">arXiv:2008.12234</a> <span> [<a href="https://arxiv.org/pdf/2008.12234">pdf</a>, <a href="https://arxiv.org/format/2008.12234">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> The Advantage Regret-Matching Actor-Critic </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gruslys%2C+A">Audrūnas Gruslys</a>, <a href="/search/cs?searchtype=author&query=Lanctot%2C+M">Marc Lanctot</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">Rémi Munos</a>, <a href="/search/cs?searchtype=author&query=Timbers%2C+F">Finbarr Timbers</a>, <a href="/search/cs?searchtype=author&query=Schmid%2C+M">Martin Schmid</a>, <a href="/search/cs?searchtype=author&query=Perolat%2C+J">Julien Perolat</a>, <a href="/search/cs?searchtype=author&query=Morrill%2C+D">Dustin Morrill</a>, <a href="/search/cs?searchtype=author&query=Zambaldi%2C+V">Vinicius Zambaldi</a>, <a href="/search/cs?searchtype=author&query=Lespiau%2C+J">Jean-Baptiste Lespiau</a>, <a href="/search/cs?searchtype=author&query=Schultz%2C+J">John Schultz</a>, <a href="/search/cs?searchtype=author&query=Azar%2C+M+G">Mohammad Gheshlaghi Azar</a>, <a href="/search/cs?searchtype=author&query=Bowling%2C+M">Michael Bowling</a>, <a href="/search/cs?searchtype=author&query=Tuyls%2C+K">Karl Tuyls</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2008.12234v1-abstract-short" style="display: inline;"> Regret minimization has played a key role in online learning, equilibrium computation in games, and reinforcement learning (RL). In this paper, we describe a general model-free RL method for no-regret learning based on repeated reconsideration of past behavior. We propose a model-free RL algorithm, the Advantage Regret-Matching Actor-Critic (ARMAC): rather than saving past state-action data, ARMAC… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2008.12234v1-abstract-full').style.display = 'inline'; document.getElementById('2008.12234v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2008.12234v1-abstract-full" style="display: none;"> Regret minimization has played a key role in online learning, equilibrium computation in games, and reinforcement learning (RL). In this paper, we describe a general model-free RL method for no-regret learning based on repeated reconsideration of past behavior. We propose a model-free RL algorithm, the Advantage Regret-Matching Actor-Critic (ARMAC): rather than saving past state-action data, ARMAC saves a buffer of past policies, replaying through them to reconstruct hindsight assessments of past behavior. These retrospective value estimates are used to predict conditional advantages which, combined with regret matching, produces a new policy. 
In particular, ARMAC learns from sampled trajectories in a centralized training setting, without requiring the application of importance sampling commonly used in Monte Carlo counterfactual regret (CFR) minimization; hence, it does not suffer from excessive variance in large environments. In the single-agent setting, ARMAC shows an interesting form of exploration by keeping past policies intact. In the multiagent setting, ARMAC in self-play approaches Nash equilibria on some partially-observable zero-sum benchmarks. We provide exploitability estimates in the significantly larger game of betting-abstracted no-limit Texas Hold'em. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2008.12234v1-abstract-full').style.display = 'none'; document.getElementById('2008.12234v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 August, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2020. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2006.07733">arXiv:2006.07733</a> <span> [<a href="https://arxiv.org/pdf/2006.07733">pdf</a>, <a href="https://arxiv.org/format/2006.07733">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Bootstrap your own latent: A new approach to self-supervised Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Grill%2C+J">Jean-Bastien Grill</a>, <a href="/search/cs?searchtype=author&query=Strub%2C+F">Florian Strub</a>, <a href="/search/cs?searchtype=author&query=Altch%C3%A9%2C+F">Florent Altch茅</a>, <a href="/search/cs?searchtype=author&query=Tallec%2C+C">Corentin Tallec</a>, <a href="/search/cs?searchtype=author&query=Richemond%2C+P+H">Pierre H. Richemond</a>, <a href="/search/cs?searchtype=author&query=Buchatskaya%2C+E">Elena Buchatskaya</a>, <a href="/search/cs?searchtype=author&query=Doersch%2C+C">Carl Doersch</a>, <a href="/search/cs?searchtype=author&query=Pires%2C+B+A">Bernardo Avila Pires</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Z+D">Zhaohan Daniel Guo</a>, <a href="/search/cs?searchtype=author&query=Azar%2C+M+G">Mohammad Gheshlaghi Azar</a>, <a href="/search/cs?searchtype=author&query=Piot%2C+B">Bilal Piot</a>, <a href="/search/cs?searchtype=author&query=Kavukcuoglu%2C+K">Koray Kavukcuoglu</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">R茅mi Munos</a>, <a href="/search/cs?searchtype=author&query=Valko%2C+M">Michal Valko</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2006.07733v3-abstract-short" style="display: inline;"> We introduce Bootstrap Your Own Latent (BYOL), a new approach to self-supervised image representation learning. BYOL relies on two neural networks, referred to as online and target networks, that interact and learn from each other. 
From an augmented view of an image, we train the online network to predict the target network representation of the same image under a different augmented view. At the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2006.07733v3-abstract-full').style.display = 'inline'; document.getElementById('2006.07733v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2006.07733v3-abstract-full" style="display: none;"> We introduce Bootstrap Your Own Latent (BYOL), a new approach to self-supervised image representation learning. BYOL relies on two neural networks, referred to as online and target networks, that interact and learn from each other. From an augmented view of an image, we train the online network to predict the target network representation of the same image under a different augmented view. At the same time, we update the target network with a slow-moving average of the online network. While state-of-the-art methods rely on negative pairs, BYOL achieves a new state of the art without them. BYOL reaches $74.3\%$ top-1 classification accuracy on ImageNet using a linear evaluation with a ResNet-50 architecture and $79.6\%$ with a larger ResNet. We show that BYOL performs on par or better than the current state of the art on both transfer and semi-supervised benchmarks. Our implementation and pretrained models are given on GitHub. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2006.07733v3-abstract-full').style.display = 'none'; document.getElementById('2006.07733v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 September, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 June, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2020. 
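<p class="is-size-7">The two mechanisms this abstract describes (predicting the target network's projection of another augmented view, and the slow-moving-average target update) can be illustrated with a minimal NumPy sketch; the function names, the normalized-squared-error loss form, and the rate tau=0.996 are illustrative assumptions here, not code taken from the BYOL release.</p> <pre><code>
import numpy as np

def l2_normalize(x):
    # Normalize feature vectors so the loss reduces to 2 - 2 * cosine similarity.
    return x / (np.linalg.norm(x, axis=-1, keepdims=True) + 1e-8)

def byol_loss(online_prediction, target_projection):
    # Regress the online network's prediction onto the target network's
    # projection of the other augmented view (treated as a fixed label).
    p = l2_normalize(online_prediction)
    z = l2_normalize(target_projection)
    return float(np.mean(np.sum((p - z) ** 2, axis=-1)))

def ema_update(target_params, online_params, tau=0.996):
    # Slow-moving (exponential moving) average of the online network's weights.
    return {k: tau * target_params[k] + (1.0 - tau) * online_params[k]
            for k in target_params}

# Toy usage with random vectors standing in for network outputs.
rng = np.random.default_rng(0)
pred = rng.normal(size=(4, 8))   # online prediction computed from view A
proj = rng.normal(size=(4, 8))   # target projection computed from view B
print(byol_loss(pred, proj))

target = {"w": np.zeros((8, 8))}
online = {"w": rng.normal(size=(8, 8))}
target = ema_update(target, online)
</code></pre>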
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2004.14646">arXiv:2004.14646</a> <span> [<a href="https://arxiv.org/pdf/2004.14646">pdf</a>, <a href="https://arxiv.org/format/2004.14646">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Bootstrap Latent-Predictive Representations for Multitask Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+D">Daniel Guo</a>, <a href="/search/cs?searchtype=author&query=Pires%2C+B+A">Bernardo Avila Pires</a>, <a href="/search/cs?searchtype=author&query=Piot%2C+B">Bilal Piot</a>, <a href="/search/cs?searchtype=author&query=Grill%2C+J">Jean-bastien Grill</a>, <a href="/search/cs?searchtype=author&query=Altch%C3%A9%2C+F">Florent Altch茅</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">R茅mi Munos</a>, <a href="/search/cs?searchtype=author&query=Azar%2C+M+G">Mohammad Gheshlaghi Azar</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2004.14646v1-abstract-short" style="display: inline;"> Learning a good representation is an essential component for deep reinforcement learning (RL). Representation learning is especially important in multitask and partially observable settings where building a representation of the unknown environment is crucial to solve the tasks. Here we introduce Prediction of Bootstrap Latents (PBL), a simple and flexible self-supervised representation learning a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2004.14646v1-abstract-full').style.display = 'inline'; document.getElementById('2004.14646v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2004.14646v1-abstract-full" style="display: none;"> Learning a good representation is an essential component for deep reinforcement learning (RL). Representation learning is especially important in multitask and partially observable settings where building a representation of the unknown environment is crucial to solve the tasks. Here we introduce Prediction of Bootstrap Latents (PBL), a simple and flexible self-supervised representation learning algorithm for multitask deep RL. PBL builds on multistep predictive representations of future observations, and focuses on capturing structured information about environment dynamics. Specifically, PBL trains its representation by predicting latent embeddings of future observations. These latent embeddings are themselves trained to be predictive of the aforementioned representations. These predictions form a bootstrapping effect, allowing the agent to learn more about the key aspects of the environment dynamics. In addition, by defining prediction tasks completely in latent space, PBL provides the flexibility of using multimodal observations involving pixel images, language instructions, rewards and more. We show in our experiments that PBL delivers across-the-board improved performance over state of the art deep RL agents in the DMLab-30 and Atari-57 multitask setting. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2004.14646v1-abstract-full').style.display = 'none'; document.getElementById('2004.14646v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 April, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2020. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1902.07685">arXiv:1902.07685</a> <span> [<a href="https://arxiv.org/pdf/1902.07685">pdf</a>, <a href="https://arxiv.org/format/1902.07685">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Applications">stat.AP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> World Discovery Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Azar%2C+M+G">Mohammad Gheshlaghi Azar</a>, <a href="/search/cs?searchtype=author&query=Piot%2C+B">Bilal Piot</a>, <a href="/search/cs?searchtype=author&query=Pires%2C+B+A">Bernardo Avila Pires</a>, <a href="/search/cs?searchtype=author&query=Grill%2C+J">Jean-Bastien Grill</a>, <a href="/search/cs?searchtype=author&query=Altch%C3%A9%2C+F">Florent Altch茅</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">R茅mi Munos</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1902.07685v3-abstract-short" style="display: inline;"> As humans we are driven by a strong desire for seeking novelty in our world. Also upon observing a novel pattern we are capable of refining our understanding of the world based on the new information---humans can discover their world. The outstanding ability of the human mind for discovery has led to many breakthroughs in science, art and technology. Here we investigate the possibility of building… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1902.07685v3-abstract-full').style.display = 'inline'; document.getElementById('1902.07685v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1902.07685v3-abstract-full" style="display: none;"> As humans we are driven by a strong desire for seeking novelty in our world. Also upon observing a novel pattern we are capable of refining our understanding of the world based on the new information---humans can discover their world. The outstanding ability of the human mind for discovery has led to many breakthroughs in science, art and technology. Here we investigate the possibility of building an agent capable of discovering its world using the modern AI technology. In particular we introduce NDIGO, Neural Differential Information Gain Optimisation, a self-supervised discovery model that aims at seeking new information to construct a global view of its world from partial and noisy observations. 
Our experiments on some controlled 2-D navigation tasks show that NDIGO outperforms state-of-the-art information-seeking methods in terms of the quality of the learned representation. The improvement in performance is particularly significant in the presence of white or structured noise where other information-seeking methods follow the noise instead of discovering their world. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1902.07685v3-abstract-full').style.display = 'none'; document.getElementById('1902.07685v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 March, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 February, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2019. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1811.06407">arXiv:1811.06407</a> <span> [<a href="https://arxiv.org/pdf/1811.06407">pdf</a>, <a href="https://arxiv.org/format/1811.06407">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Neural Predictive Belief Representations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+Z+D">Zhaohan Daniel Guo</a>, <a href="/search/cs?searchtype=author&query=Azar%2C+M+G">Mohammad Gheshlaghi Azar</a>, <a href="/search/cs?searchtype=author&query=Piot%2C+B">Bilal Piot</a>, <a href="/search/cs?searchtype=author&query=Pires%2C+B+A">Bernardo A. Pires</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">R茅mi Munos</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1811.06407v2-abstract-short" style="display: inline;"> Unsupervised representation learning has succeeded with excellent results in many applications. It is an especially powerful tool to learn a good representation of environments with partial or noisy observations. In partially observable domains it is important for the representation to encode a belief state, a sufficient statistic of the observations seen so far. In this paper, we investigate whet… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1811.06407v2-abstract-full').style.display = 'inline'; document.getElementById('1811.06407v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1811.06407v2-abstract-full" style="display: none;"> Unsupervised representation learning has succeeded with excellent results in many applications. It is an especially powerful tool to learn a good representation of environments with partial or noisy observations. In partially observable domains it is important for the representation to encode a belief state, a sufficient statistic of the observations seen so far. In this paper, we investigate whether it is possible to learn such a belief representation using modern neural architectures. 
Specifically, we focus on one-step frame prediction and two variants of contrastive predictive coding (CPC) as the objective functions to learn the representations. To evaluate these learned representations, we test how well they can predict various pieces of information about the underlying state of the environment, e.g., position of the agent in a 3D maze. We show that all three methods are able to learn belief representations of the environment: they encode not only the state information, but also its uncertainty, a crucial aspect of belief states. We also find that for CPC multi-step predictions and action-conditioning are critical for accurate belief representations in visually complex environments. The ability of neural representations to capture the belief information has the potential to spur new advances for learning and planning in partially observable domains, where leveraging uncertainty is essential for optimal decision making. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1811.06407v2-abstract-full').style.display = 'none'; document.getElementById('1811.06407v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 August, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 November, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2018. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1805.11593">arXiv:1805.11593</a> <span> [<a href="https://arxiv.org/pdf/1805.11593">pdf</a>, <a href="https://arxiv.org/format/1805.11593">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Observe and Look Further: Achieving Consistent Performance on Atari </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Pohlen%2C+T">Tobias Pohlen</a>, <a href="/search/cs?searchtype=author&query=Piot%2C+B">Bilal Piot</a>, <a href="/search/cs?searchtype=author&query=Hester%2C+T">Todd Hester</a>, <a href="/search/cs?searchtype=author&query=Azar%2C+M+G">Mohammad Gheshlaghi Azar</a>, <a href="/search/cs?searchtype=author&query=Horgan%2C+D">Dan Horgan</a>, <a href="/search/cs?searchtype=author&query=Budden%2C+D">David Budden</a>, <a href="/search/cs?searchtype=author&query=Barth-Maron%2C+G">Gabriel Barth-Maron</a>, <a href="/search/cs?searchtype=author&query=van+Hasselt%2C+H">Hado van Hasselt</a>, <a href="/search/cs?searchtype=author&query=Quan%2C+J">John Quan</a>, <a href="/search/cs?searchtype=author&query=Ve%C4%8Der%C3%ADk%2C+M">Mel Večerík</a>, <a href="/search/cs?searchtype=author&query=Hessel%2C+M">Matteo Hessel</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">Rémi Munos</a>, <a href="/search/cs?searchtype=author&query=Pietquin%2C+O">Olivier Pietquin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1805.11593v1-abstract-short" 
style="display: inline;"> Despite significant advances in the field of deep Reinforcement Learning (RL), today's algorithms still fail to learn human-level policies consistently over a set of diverse tasks such as Atari 2600 games. We identify three key challenges that any algorithm needs to master in order to perform well on all games: processing diverse reward distributions, reasoning over long time horizons, and explori… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1805.11593v1-abstract-full').style.display = 'inline'; document.getElementById('1805.11593v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1805.11593v1-abstract-full" style="display: none;"> Despite significant advances in the field of deep Reinforcement Learning (RL), today's algorithms still fail to learn human-level policies consistently over a set of diverse tasks such as Atari 2600 games. We identify three key challenges that any algorithm needs to master in order to perform well on all games: processing diverse reward distributions, reasoning over long time horizons, and exploring efficiently. In this paper, we propose an algorithm that addresses each of these challenges and is able to learn human-level policies on nearly all Atari games. A new transformed Bellman operator allows our algorithm to process rewards of varying densities and scales; an auxiliary temporal consistency loss allows us to train stably using a discount factor of $\gamma = 0.999$ (instead of $\gamma = 0.99$) extending the effective planning horizon by an order of magnitude; and we ease the exploration problem by using human demonstrations that guide the agent towards rewarding states. When tested on a set of 42 Atari games, our algorithm exceeds the performance of an average human on 40 games using a common set of hyperparameters. Furthermore, it is the first deep RL algorithm to solve the first level of Montezuma's Revenge. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1805.11593v1-abstract-full').style.display = 'none'; document.getElementById('1805.11593v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 May, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2018. 
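<p class="is-size-7 mathjax">For a rough sense of scale on the discount change mentioned in this abstract, using the common rule of thumb that a discount factor $\gamma$ corresponds to an effective planning horizon of about $1/(1-\gamma)$ (an approximation used here for illustration, not a claim made by the paper): $1/(1-0.99) = 100$ steps versus $1/(1-0.999) = 1000$ steps, which is the order-of-magnitude extension referred to above.</p> 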
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1706.10295">arXiv:1706.10295</a> <span> [<a href="https://arxiv.org/pdf/1706.10295">pdf</a>, <a href="https://arxiv.org/format/1706.10295">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Noisy Networks for Exploration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fortunato%2C+M">Meire Fortunato</a>, <a href="/search/cs?searchtype=author&query=Azar%2C+M+G">Mohammad Gheshlaghi Azar</a>, <a href="/search/cs?searchtype=author&query=Piot%2C+B">Bilal Piot</a>, <a href="/search/cs?searchtype=author&query=Menick%2C+J">Jacob Menick</a>, <a href="/search/cs?searchtype=author&query=Osband%2C+I">Ian Osband</a>, <a href="/search/cs?searchtype=author&query=Graves%2C+A">Alex Graves</a>, <a href="/search/cs?searchtype=author&query=Mnih%2C+V">Vlad Mnih</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">Remi Munos</a>, <a href="/search/cs?searchtype=author&query=Hassabis%2C+D">Demis Hassabis</a>, <a href="/search/cs?searchtype=author&query=Pietquin%2C+O">Olivier Pietquin</a>, <a href="/search/cs?searchtype=author&query=Blundell%2C+C">Charles Blundell</a>, <a href="/search/cs?searchtype=author&query=Legg%2C+S">Shane Legg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1706.10295v3-abstract-short" style="display: inline;"> We introduce NoisyNet, a deep reinforcement learning agent with parametric noise added to its weights, and show that the induced stochasticity of the agent's policy can be used to aid efficient exploration. The parameters of the noise are learned with gradient descent along with the remaining network weights. NoisyNet is straightforward to implement and adds little computational overhead. We find… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1706.10295v3-abstract-full').style.display = 'inline'; document.getElementById('1706.10295v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1706.10295v3-abstract-full" style="display: none;"> We introduce NoisyNet, a deep reinforcement learning agent with parametric noise added to its weights, and show that the induced stochasticity of the agent's policy can be used to aid efficient exploration. The parameters of the noise are learned with gradient descent along with the remaining network weights. NoisyNet is straightforward to implement and adds little computational overhead. We find that replacing the conventional exploration heuristics for A3C, DQN and dueling agents (entropy reward and $\varepsilon$-greedy respectively) with NoisyNet yields substantially higher scores for a wide range of Atari games, in some cases advancing the agent from sub to super-human performance. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1706.10295v3-abstract-full').style.display = 'none'; document.getElementById('1706.10295v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 July, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 June, 2017; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2017. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICLR 2018</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1704.04651">arXiv:1704.04651</a> <span> [<a href="https://arxiv.org/pdf/1704.04651">pdf</a>, <a href="https://arxiv.org/format/1704.04651">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> The Reactor: A fast and sample-efficient Actor-Critic agent for Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gruslys%2C+A">Audrunas Gruslys</a>, <a href="/search/cs?searchtype=author&query=Dabney%2C+W">Will Dabney</a>, <a href="/search/cs?searchtype=author&query=Azar%2C+M+G">Mohammad Gheshlaghi Azar</a>, <a href="/search/cs?searchtype=author&query=Piot%2C+B">Bilal Piot</a>, <a href="/search/cs?searchtype=author&query=Bellemare%2C+M">Marc Bellemare</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">Remi Munos</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1704.04651v2-abstract-short" style="display: inline;"> In this work we present a new agent architecture, called Reactor, which combines multiple algorithmic and architectural contributions to produce an agent with higher sample-efficiency than Prioritized Dueling DQN (Wang et al., 2016) and Categorical DQN (Bellemare et al., 2017), while giving better run-time performance than A3C (Mnih et al., 2016). Our first contribution is a new policy evaluation… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1704.04651v2-abstract-full').style.display = 'inline'; document.getElementById('1704.04651v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1704.04651v2-abstract-full" style="display: none;"> In this work we present a new agent architecture, called Reactor, which combines multiple algorithmic and architectural contributions to produce an agent with higher sample-efficiency than Prioritized Dueling DQN (Wang et al., 2016) and Categorical DQN (Bellemare et al., 2017), while giving better run-time performance than A3C (Mnih et al., 2016). Our first contribution is a new policy evaluation algorithm called Distributional Retrace, which brings multi-step off-policy updates to the distributional reinforcement learning setting. The same approach can be used to convert several classes of multi-step policy evaluation algorithms designed for expected value evaluation into distributional ones. 
Next, we introduce the $\beta$-leave-one-out policy gradient algorithm which improves the trade-off between variance and bias by using action values as a baseline. Our final algorithmic contribution is a new prioritized replay algorithm for sequences, which exploits the temporal locality of neighboring observations for more efficient replay prioritization. Using the Atari 2600 benchmarks, we show that each of these innovations contributes to both the sample efficiency and final agent performance. Finally, we demonstrate that Reactor reaches state-of-the-art performance after 200 million frames and less than a day of training. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1704.04651v2-abstract-full').style.display = 'none'; document.getElementById('1704.04651v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 June, 2018; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 April, 2017; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2017. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1703.05449">arXiv:1703.05449</a> <span> [<a href="https://arxiv.org/pdf/1703.05449">pdf</a>, <a href="https://arxiv.org/ps/1703.05449">ps</a>, <a href="https://arxiv.org/format/1703.05449">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Minimax Regret Bounds for Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Azar%2C+M+G">Mohammad Gheshlaghi Azar</a>, <a href="/search/cs?searchtype=author&query=Osband%2C+I">Ian Osband</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">Rémi Munos</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1703.05449v2-abstract-short" style="display: inline;"> We consider the problem of provably optimal exploration in reinforcement learning for finite horizon MDPs. We show that an optimistic modification to value iteration achieves a regret bound of $\tilde{O}( \sqrt{HSAT} + H^2S^2A+H\sqrt{T})$ where $H$ is the time horizon, $S$ the number of states, $A$ the number of actions and $T$ the number of time-steps. This result improves over the best previous… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1703.05449v2-abstract-full').style.display = 'inline'; document.getElementById('1703.05449v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1703.05449v2-abstract-full" style="display: none;"> We consider the problem of provably optimal exploration in reinforcement learning for finite horizon MDPs. 
We show that an optimistic modification to value iteration achieves a regret bound of $\tilde{O}( \sqrt{HSAT} + H^2S^2A+H\sqrt{T})$ where $H$ is the time horizon, $S$ the number of states, $A$ the number of actions and $T$ the number of time-steps. This result improves over the best previous known bound $\tilde{O}(HS \sqrt{AT})$ achieved by the UCRL2 algorithm of Jaksch et al., 2010. The key significance of our new results is that when $T\geq H^3S^3A$ and $SA\geq H$, it leads to a regret of $\tilde{O}(\sqrt{HSAT})$ that matches the established lower bound of $\Omega(\sqrt{HSAT})$ up to a logarithmic factor. Our analysis contains two key insights. We use careful application of concentration inequalities to the optimal value function as a whole, rather than to the transition probabilities (to improve scaling in $S$), and we define Bernstein-based "exploration bonuses" that use the empirical variance of the estimated values at the next states (to improve scaling in $H$). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1703.05449v2-abstract-full').style.display = 'none'; document.getElementById('1703.05449v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 July, 2017; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 March, 2017; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2017. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1602.02191">arXiv:1602.02191</a> <span> [<a href="https://arxiv.org/pdf/1602.02191">pdf</a>, <a href="https://arxiv.org/format/1602.02191">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Convex Relaxation Regression: Black-Box Optimization of Smooth Functions by Learning Their Convex Envelopes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Azar%2C+M+G">Mohammad Gheshlaghi Azar</a>, <a href="/search/cs?searchtype=author&query=Dyer%2C+E">Eva Dyer</a>, <a href="/search/cs?searchtype=author&query=Kording%2C+K">Konrad Kording</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1602.02191v3-abstract-short" style="display: inline;"> Finding efficient and provable methods to solve non-convex optimization problems is an outstanding challenge in machine learning and optimization theory. A popular approach used to tackle non-convex problems is to use convex relaxation techniques to find a convex surrogate for the problem. Unfortunately, convex relaxations typically must be found on a problem-by-problem basis. 
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1602.02191">arXiv:1602.02191</a> <span> [<a href="https://arxiv.org/pdf/1602.02191">pdf</a>, <a href="https://arxiv.org/format/1602.02191">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Convex Relaxation Regression: Black-Box Optimization of Smooth Functions by Learning Their Convex Envelopes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Azar%2C+M+G">Mohammad Gheshlaghi Azar</a>, <a href="/search/cs?searchtype=author&query=Dyer%2C+E">Eva Dyer</a>, <a href="/search/cs?searchtype=author&query=Kording%2C+K">Konrad Kording</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="1602.02191v3-abstract-full" style="display: inline;"> Finding efficient and provable methods to solve non-convex optimization problems is an outstanding challenge in machine learning and optimization theory. A popular approach used to tackle non-convex problems is to use convex relaxation techniques to find a convex surrogate for the problem. Unfortunately, convex relaxations typically must be found on a problem-by-problem basis. Thus, providing a general-purpose strategy to estimate a convex relaxation would have a wide reaching impact. Here, we introduce Convex Relaxation Regression (CoRR), an approach for learning convex relaxations for a class of smooth functions. The main idea behind our approach is to estimate the convex envelope of a function $f$ by evaluating $f$ at a set of $T$ random points and then fitting a convex function to these function evaluations. We prove that with probability greater than $1-\delta$, the solution of our algorithm converges to the global optimizer of $f$ with error $\mathcal{O} \Big( \big(\frac{\log(1/\delta) }{T} \big)^\alpha \Big)$ for some $\alpha > 0$. Our approach enables the use of convex optimization tools to solve a class of non-convex optimization problems. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 March, 2016; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 February, 2016; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2016. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Proc. of the Conference on Uncertainty in Artificial Intelligence, pg. 22-31, 2016 </p>
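<p class="is-size-7"> The recipe described in the abstract (evaluate $f$ at $T$ random points, then fit a convex function to the evaluations and minimize it) can be conveyed with a deliberately simplified sketch. Here the convex fit is just a 1-D quadratic with a non-negative leading coefficient, which is a stand-in for illustration rather than the function class used in the paper. </p> <pre class="is-size-7"><code>
# Simplified sketch of the "sample, then fit a convex surrogate" idea (not the
# paper's CoRR algorithm): evaluate a black-box function at T random points and
# fit a convex 1-D quadratic, whose minimizer is returned.
import numpy as np

def fit_convex_quadratic_and_minimize(f, low, high, T=200, seed=0):
    rng = np.random.default_rng(seed)
    x = rng.uniform(low, high, size=T)               # T random evaluation points
    y = np.array([f(xi) for xi in x])                # black-box evaluations of f
    # Least-squares fit of y ~ a*x^2 + b*x + c; clip `a` so the surrogate is convex.
    design = np.column_stack([x ** 2, x, np.ones(T)])
    a, b, _ = np.linalg.lstsq(design, y, rcond=None)[0]
    a = max(a, 1e-8)
    return float(np.clip(-b / (2 * a), low, high))   # minimizer of the convex surrogate

if __name__ == "__main__":
    f = lambda x: x ** 2 + 0.3 * np.sin(8 * x)       # non-convex test function, minimum near 0
    print(fit_convex_quadratic_and_minimize(f, -2.0, 2.0))
</code></pre>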
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1402.0562">arXiv:1402.0562</a> <span> [<a href="https://arxiv.org/pdf/1402.0562">pdf</a>, <a href="https://arxiv.org/ps/1402.0562">ps</a>, <a href="https://arxiv.org/format/1402.0562">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Online Stochastic Optimization under Correlated Bandit Feedback </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Azar%2C+M+G">Mohammad Gheshlaghi Azar</a>, <a href="/search/cs?searchtype=author&query=Lazaric%2C+A">Alessandro Lazaric</a>, <a href="/search/cs?searchtype=author&query=Brunskill%2C+E">Emma Brunskill</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="1402.0562v3-abstract-full" style="display: inline;"> In this paper we consider the problem of online stochastic optimization of a locally smooth function under bandit feedback. We introduce the high-confidence tree (HCT) algorithm, a novel any-time $\mathcal{X}$-armed bandit algorithm, and derive regret bounds matching the performance of the existing state-of-the-art in terms of the dependency on the number of steps and the smoothness factor. The main advantage of HCT is that it handles the challenging case of correlated rewards, whereas existing methods require that the reward-generating process of each arm is an independent and identically distributed (i.i.d.) random process. HCT also improves on the state-of-the-art in terms of its memory requirement, and it requires a weaker smoothness assumption on the mean-reward function compared to previous anytime algorithms. Finally, we discuss how HCT can be applied to the problem of policy search in reinforcement learning and we report preliminary empirical results. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 May, 2014; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 February, 2014; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2014. </p>
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1402.0562v3-abstract-full').style.display = 'none'; document.getElementById('1402.0562v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 May, 2014; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 February, 2014; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2014. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1307.6887">arXiv:1307.6887</a> <span> [<a href="https://arxiv.org/pdf/1307.6887">pdf</a>, <a href="https://arxiv.org/ps/1307.6887">ps</a>, <a href="https://arxiv.org/format/1307.6887">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Sequential Transfer in Multi-armed Bandit with Finite Set of Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Azar%2C+M+G">Mohammad Gheshlaghi Azar</a>, <a href="/search/cs?searchtype=author&query=Lazaric%2C+A">Alessandro Lazaric</a>, <a href="/search/cs?searchtype=author&query=Brunskill%2C+E">Emma Brunskill</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1307.6887v1-abstract-short" style="display: inline;"> Learning from prior tasks and transferring that experience to improve future performance is critical for building lifelong learning agents. Although results in supervised and reinforcement learning show that transfer may significantly improve the learning performance, most of the literature on transfer is focused on batch learning tasks. In this paper we study the problem of \textit{sequential tra… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1307.6887v1-abstract-full').style.display = 'inline'; document.getElementById('1307.6887v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1307.6887v1-abstract-full" style="display: none;"> Learning from prior tasks and transferring that experience to improve future performance is critical for building lifelong learning agents. Although results in supervised and reinforcement learning show that transfer may significantly improve the learning performance, most of the literature on transfer is focused on batch learning tasks. In this paper we study the problem of \textit{sequential transfer in online learning}, notably in the multi-armed bandit framework, where the objective is to minimize the cumulative regret over a sequence of tasks by incrementally transferring knowledge from prior tasks. We introduce a novel bandit algorithm based on a method-of-moments approach for the estimation of the possible tasks and derive regret bounds for it. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1307.6887v1-abstract-full').style.display = 'none'; document.getElementById('1307.6887v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 July, 2013; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2013. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1305.1027">arXiv:1305.1027</a> <span> [<a href="https://arxiv.org/pdf/1305.1027">pdf</a>, <a href="https://arxiv.org/ps/1305.1027">ps</a>, <a href="https://arxiv.org/format/1305.1027">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Regret Bounds for Reinforcement Learning with Policy Advice </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Azar%2C+M+G">Mohammad Gheshlaghi Azar</a>, <a href="/search/cs?searchtype=author&query=Lazaric%2C+A">Alessandro Lazaric</a>, <a href="/search/cs?searchtype=author&query=Brunskill%2C+E">Emma Brunskill</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1305.1027v2-abstract-short" style="display: inline;"> In some reinforcement learning problems an agent may be provided with a set of input policies, perhaps learned from prior experience or provided by advisors. We present a reinforcement learning with policy advice (RLPA) algorithm which leverages this input set and learns to use the best policy in the set for the reinforcement learning task at hand. We prove that RLPA has a sub-linear regret of \ti… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1305.1027v2-abstract-full').style.display = 'inline'; document.getElementById('1305.1027v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1305.1027v2-abstract-full" style="display: none;"> In some reinforcement learning problems an agent may be provided with a set of input policies, perhaps learned from prior experience or provided by advisors. We present a reinforcement learning with policy advice (RLPA) algorithm which leverages this input set and learns to use the best policy in the set for the reinforcement learning task at hand. We prove that RLPA has a sub-linear regret of \tilde O(\sqrt{T}) relative to the best input policy, and that both this regret and its computational complexity are independent of the size of the state and action space. Our empirical simulations support our theoretical analysis. This suggests RLPA may offer significant advantages in large domains where some prior good policies are provided. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1305.1027v2-abstract-full').style.display = 'none'; document.getElementById('1305.1027v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 July, 2013; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 May, 2013; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2013. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1206.6461">arXiv:1206.6461</a> <span> [<a href="https://arxiv.org/pdf/1206.6461">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> On the Sample Complexity of Reinforcement Learning with a Generative Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Azar%2C+M+G">Mohammad Gheshlaghi Azar</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">Remi Munos</a>, <a href="/search/cs?searchtype=author&query=Kappen%2C+B">Bert Kappen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1206.6461v1-abstract-short" style="display: inline;"> We consider the problem of learning the optimal action-value function in the discounted-reward Markov decision processes (MDPs). We prove a new PAC bound on the sample-complexity of model-based value iteration algorithm in the presence of the generative model, which indicates that for an MDP with N state-action pairs and the discount factor 纬\in[0,1) only O(N\log(N/未)/((1-纬)^3蔚^2)) samples are req… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1206.6461v1-abstract-full').style.display = 'inline'; document.getElementById('1206.6461v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1206.6461v1-abstract-full" style="display: none;"> We consider the problem of learning the optimal action-value function in the discounted-reward Markov decision processes (MDPs). We prove a new PAC bound on the sample-complexity of model-based value iteration algorithm in the presence of the generative model, which indicates that for an MDP with N state-action pairs and the discount factor 纬\in[0,1) only O(N\log(N/未)/((1-纬)^3蔚^2)) samples are required to find an 蔚-optimal estimation of the action-value function with the probability 1-未. We also prove a matching lower bound of 螛(N\log(N/未)/((1-纬)^3蔚^2)) on the sample complexity of estimating the optimal action-value function by every RL algorithm. To the best of our knowledge, this is the first matching result on the sample complexity of estimating the optimal (action-) value function in which the upper bound matches the lower bound of RL in terms of N, 蔚, 未and 1/(1-纬). Also, both our lower bound and our upper bound significantly improve on the state-of-the-art in terms of 1/(1-纬). 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1206.6461v1-abstract-full').style.display = 'none'; document.getElementById('1206.6461v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 June, 2012; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2012. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Appears in Proceedings of the 29th International Conference on Machine Learning (ICML 2012)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1004.2027">arXiv:1004.2027</a> <span> [<a href="https://arxiv.org/pdf/1004.2027">pdf</a>, <a href="https://arxiv.org/ps/1004.2027">ps</a>, <a href="https://arxiv.org/format/1004.2027">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Dynamic Policy Programming </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Azar%2C+M+G">Mohammad Gheshlaghi Azar</a>, <a href="/search/cs?searchtype=author&query=Gomez%2C+V">Vicenc Gomez</a>, <a href="/search/cs?searchtype=author&query=Kappen%2C+H+J">Hilbert J. Kappen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1004.2027v2-abstract-short" style="display: inline;"> In this paper, we propose a novel policy iteration method, called dynamic policy programming (DPP), to estimate the optimal policy in the infinite-horizon Markov decision processes. We prove the finite-iteration and asymptotic l\infty-norm performance-loss bounds for DPP in the presence of approximation/estimation error. The bounds are expressed in terms of the l\infty-norm of the average accumula… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1004.2027v2-abstract-full').style.display = 'inline'; document.getElementById('1004.2027v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1004.2027v2-abstract-full" style="display: none;"> In this paper, we propose a novel policy iteration method, called dynamic policy programming (DPP), to estimate the optimal policy in the infinite-horizon Markov decision processes. We prove the finite-iteration and asymptotic l\infty-norm performance-loss bounds for DPP in the presence of approximation/estimation error. The bounds are expressed in terms of the l\infty-norm of the average accumulated error as opposed to the l\infty-norm of the error in the case of the standard approximate value iteration (AVI) and the approximate policy iteration (API). 
</li> </ol> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> <li><a href="https://info.arxiv.org/help/contact.html">Contact</a></li> <li><a href="https://info.arxiv.org/help/subscribe">Subscribe</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li><a href="https://status.arxiv.org">arXiv Operational Status</a></li> </ul> </div> </div> </footer> </body> </html>
href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>