Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 109 results for author: <span class="mathjax">Munos, R</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Munos%2C+R">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Munos, R"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Munos%2C+R&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Munos, R"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Munos%2C+R&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Munos%2C+R&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Munos%2C+R&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Munos%2C+R&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.19107">arXiv:2405.19107</a> <span> [<a href="https://arxiv.org/pdf/2405.19107">pdf</a>, <a href="https://arxiv.org/ps/2405.19107">ps</a>, <a href="https://arxiv.org/format/2405.19107">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Offline Regularised Reinforcement Learning for Large Language Models Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Richemond%2C+P+H">Pierre Harvey Richemond</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Y">Yunhao Tang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+D">Daniel Guo</a>, <a href="/search/cs?searchtype=author&query=Calandriello%2C+D">Daniele Calandriello</a>, <a href="/search/cs?searchtype=author&query=Azar%2C+M+G">Mohammad Gheshlaghi Azar</a>, <a href="/search/cs?searchtype=author&query=Rafailov%2C+R">Rafael Rafailov</a>, <a href="/search/cs?searchtype=author&query=Pires%2C+B+A">Bernardo Avila Pires</a>, <a href="/search/cs?searchtype=author&query=Tarassov%2C+E">Eugene Tarassov</a>, <a href="/search/cs?searchtype=author&query=Spangher%2C+L">Lucas Spangher</a>, <a href="/search/cs?searchtype=author&query=Ellsworth%2C+W">Will Ellsworth</a>, <a href="/search/cs?searchtype=author&query=Severyn%2C+A">Aliaksei Severyn</a>, <a href="/search/cs?searchtype=author&query=Mallinson%2C+J">Jonathan Mallinson</a>, <a href="/search/cs?searchtype=author&query=Shani%2C+L">Lior Shani</a>, <a href="/search/cs?searchtype=author&query=Shamir%2C+G">Gil Shamir</a>, <a href="/search/cs?searchtype=author&query=Joshi%2C+R">Rishabh Joshi</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+T">Tianqi Liu</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">Remi Munos</a>, <a 
href="/search/cs?searchtype=author&query=Piot%2C+B">Bilal Piot</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.19107v1-abstract-short" style="display: inline;"> The dominant framework for alignment of large language models (LLM), whether through reinforcement learning from human feedback or direct preference optimisation, is to learn from preference data. This involves building datasets where each element is a quadruplet composed of a prompt, two independent responses (completions of the prompt) and a human preference between the two independent responses… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.19107v1-abstract-full').style.display = 'inline'; document.getElementById('2405.19107v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.19107v1-abstract-full" style="display: none;"> The dominant framework for alignment of large language models (LLM), whether through reinforcement learning from human feedback or direct preference optimisation, is to learn from preference data. This involves building datasets where each element is a quadruplet composed of a prompt, two independent responses (completions of the prompt) and a human preference between the two independent responses, yielding a preferred and a dis-preferred response. Such data is typically scarce and expensive to collect. On the other hand, \emph{single-trajectory} datasets where each element is a triplet composed of a prompt, a response and a human feedback is naturally more abundant. The canonical element of such datasets is for instance an LLM's response to a user's prompt followed by a user's feedback such as a thumbs-up/down. Consequently, in this work, we propose DRO, or \emph{Direct Reward Optimisation}, as a framework and associated algorithms that do not require pairwise preferences. DRO uses a simple mean-squared objective that can be implemented in various ways. We validate our findings empirically, using T5 encoder-decoder language models, and show DRO's performance over selected baselines such as Kahneman-Tversky Optimization (KTO). Thus, we confirm that DRO is a simple and empirically compelling method for single-trajectory policy optimisation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.19107v1-abstract-full').style.display = 'none'; document.getElementById('2405.19107v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.14655">arXiv:2405.14655</a> <span> [<a href="https://arxiv.org/pdf/2405.14655">pdf</a>, <a href="https://arxiv.org/format/2405.14655">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Multi-turn Reinforcement Learning from Preference Human Feedback </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shani%2C+L">Lior Shani</a>, <a href="/search/cs?searchtype=author&query=Rosenberg%2C+A">Aviv Rosenberg</a>, <a href="/search/cs?searchtype=author&query=Cassel%2C+A">Asaf Cassel</a>, <a href="/search/cs?searchtype=author&query=Lang%2C+O">Oran Lang</a>, <a href="/search/cs?searchtype=author&query=Calandriello%2C+D">Daniele Calandriello</a>, <a href="/search/cs?searchtype=author&query=Zipori%2C+A">Avital Zipori</a>, <a href="/search/cs?searchtype=author&query=Noga%2C+H">Hila Noga</a>, <a href="/search/cs?searchtype=author&query=Keller%2C+O">Orgad Keller</a>, <a href="/search/cs?searchtype=author&query=Piot%2C+B">Bilal Piot</a>, <a href="/search/cs?searchtype=author&query=Szpektor%2C+I">Idan Szpektor</a>, <a href="/search/cs?searchtype=author&query=Hassidim%2C+A">Avinatan Hassidim</a>, <a href="/search/cs?searchtype=author&query=Matias%2C+Y">Yossi Matias</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">R茅mi Munos</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.14655v1-abstract-short" style="display: inline;"> Reinforcement Learning from Human Feedback (RLHF) has become the standard approach for aligning Large Language Models (LLMs) with human preferences, allowing LLMs to demonstrate remarkable abilities in various tasks. Existing methods work by emulating the preferences at the single decision (turn) level, limiting their capabilities in settings that require planning or multi-turn interactions to ach… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.14655v1-abstract-full').style.display = 'inline'; document.getElementById('2405.14655v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.14655v1-abstract-full" style="display: none;"> Reinforcement Learning from Human Feedback (RLHF) has become the standard approach for aligning Large Language Models (LLMs) with human preferences, allowing LLMs to demonstrate remarkable abilities in various tasks. Existing methods work by emulating the preferences at the single decision (turn) level, limiting their capabilities in settings that require planning or multi-turn interactions to achieve a long-term goal. In this paper, we address this issue by developing novel methods for Reinforcement Learning (RL) from preference feedback between two full multi-turn conversations. In the tabular setting, we present a novel mirror-descent-based policy optimization algorithm for the general multi-turn preference-based RL problem, and prove its convergence to Nash equilibrium. 
To evaluate performance, we create a new environment, Education Dialogue, where a teacher agent guides a student in learning a random topic, and show that a deep RL variant of our algorithm outperforms RLHF baselines. Finally, we show that in an environment with explicit rewards, our algorithm recovers the same performance as a reward-based RL baseline, despite relying solely on a weaker preference signal. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.14655v1-abstract-full').style.display = 'none'; document.getElementById('2405.14655v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.08448">arXiv:2405.08448</a> <span> [<a href="https://arxiv.org/pdf/2405.08448">pdf</a>, <a href="https://arxiv.org/format/2405.08448">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Understanding the performance gap between online and offline alignment algorithms </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tang%2C+Y">Yunhao Tang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+D+Z">Daniel Zhaohan Guo</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Z">Zeyu Zheng</a>, <a href="/search/cs?searchtype=author&query=Calandriello%2C+D">Daniele Calandriello</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Y">Yuan Cao</a>, <a href="/search/cs?searchtype=author&query=Tarassov%2C+E">Eugene Tarassov</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">R茅mi Munos</a>, <a href="/search/cs?searchtype=author&query=Pires%2C+B+%C3%81">Bernardo 脕vila Pires</a>, <a href="/search/cs?searchtype=author&query=Valko%2C+M">Michal Valko</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+Y">Yong Cheng</a>, <a href="/search/cs?searchtype=author&query=Dabney%2C+W">Will Dabney</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.08448v1-abstract-short" style="display: inline;"> Reinforcement learning from human feedback (RLHF) is the canonical framework for large language model alignment. However, rising popularity in offline alignment algorithms challenge the need for on-policy sampling in RLHF. Within the context of reward over-optimization, we start with an opening set of experiments that demonstrate the clear advantage of online methods over offline methods. This pro… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.08448v1-abstract-full').style.display = 'inline'; document.getElementById('2405.08448v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.08448v1-abstract-full" style="display: none;"> Reinforcement learning from human feedback (RLHF) is the canonical framework for large language model alignment. 
However, rising popularity in offline alignment algorithms challenge the need for on-policy sampling in RLHF. Within the context of reward over-optimization, we start with an opening set of experiments that demonstrate the clear advantage of online methods over offline methods. This prompts us to investigate the causes to the performance discrepancy through a series of carefully designed experimental ablations. We show empirically that hypotheses such as offline data coverage and data quality by itself cannot convincingly explain the performance difference. We also find that while offline algorithms train policy to become good at pairwise classification, it is worse at generations; in the meantime the policies trained by online algorithms are good at generations while worse at pairwise classification. This hints at a unique interplay between discriminative and generative capabilities, which is greatly impacted by the sampling process. Lastly, we observe that the performance discrepancy persists for both contrastive and non-contrastive loss functions, and appears not to be addressed by simply scaling up policy networks. Taken together, our study sheds light on the pivotal role of on-policy sampling in AI alignment, and hints at certain fundamental challenges of offline alignment algorithms. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.08448v1-abstract-full').style.display = 'none'; document.getElementById('2405.08448v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.04407">arXiv:2405.04407</a> <span> [<a href="https://arxiv.org/pdf/2405.04407">pdf</a>, <a href="https://arxiv.org/ps/2405.04407">ps</a>, <a href="https://arxiv.org/format/2405.04407">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Super-Exponential Regret for UCT, AlphaGo and Variants </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Orseau%2C+L">Laurent Orseau</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">Remi Munos</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.04407v2-abstract-short" style="display: inline;"> We improve the proofs of the lower bounds of Coquelin and Munos (2007) that demonstrate that UCT can have $\exp(\dots\exp(1)\dots)$ regret (with $惟(D)$ exp terms) on the $D$-chain environment, and that a `polynomial' UCT variant has $\exp_2(\exp_2(D - O(\log D)))$ regret on the same environment -- the original proofs contain an oversight for rewards bounded in $[0, 1]$, which we fix in the present… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.04407v2-abstract-full').style.display = 'inline'; document.getElementById('2405.04407v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span 
class="abstract-full has-text-grey-dark mathjax" id="2405.04407v2-abstract-full" style="display: none;"> We improve the proofs of the lower bounds of Coquelin and Munos (2007) that demonstrate that UCT can have $\exp(\dots\exp(1)\dots)$ regret (with $惟(D)$ exp terms) on the $D$-chain environment, and that a `polynomial' UCT variant has $\exp_2(\exp_2(D - O(\log D)))$ regret on the same environment -- the original proofs contain an oversight for rewards bounded in $[0, 1]$, which we fix in the present draft. We also adapt the proofs to AlphaGo's MCTS and its descendants (e.g., AlphaZero, Leela Zero) to also show $\exp_2(\exp_2(D - O(\log D)))$ regret. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.04407v2-abstract-full').style.display = 'none'; document.getElementById('2405.04407v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.08635">arXiv:2403.08635</a> <span> [<a href="https://arxiv.org/pdf/2403.08635">pdf</a>, <a href="https://arxiv.org/format/2403.08635">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Human Alignment of Large Language Models through Online Preference Optimisation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Calandriello%2C+D">Daniele Calandriello</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+D">Daniel Guo</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">Remi Munos</a>, <a href="/search/cs?searchtype=author&query=Rowland%2C+M">Mark Rowland</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Y">Yunhao Tang</a>, <a href="/search/cs?searchtype=author&query=Pires%2C+B+A">Bernardo Avila Pires</a>, <a href="/search/cs?searchtype=author&query=Richemond%2C+P+H">Pierre Harvey Richemond</a>, <a href="/search/cs?searchtype=author&query=Lan%2C+C+L">Charline Le Lan</a>, <a href="/search/cs?searchtype=author&query=Valko%2C+M">Michal Valko</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+T">Tianqi Liu</a>, <a href="/search/cs?searchtype=author&query=Joshi%2C+R">Rishabh Joshi</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Z">Zeyu Zheng</a>, <a href="/search/cs?searchtype=author&query=Piot%2C+B">Bilal Piot</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.08635v1-abstract-short" style="display: inline;"> Ensuring alignment of language models' outputs with human preferences is critical to guarantee a useful, safe, and pleasant user experience. 
Thus, human alignment has been extensively studied recently and several methods such as Reinforcement Learning from Human Feedback (RLHF), Direct Policy Optimisation (DPO) and Sequence Likelihood Calibration (SLiC) have emerged. In this paper, our contributio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.08635v1-abstract-full').style.display = 'inline'; document.getElementById('2403.08635v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.08635v1-abstract-full" style="display: none;"> Ensuring alignment of language models' outputs with human preferences is critical to guarantee a useful, safe, and pleasant user experience. Thus, human alignment has been extensively studied recently and several methods such as Reinforcement Learning from Human Feedback (RLHF), Direct Policy Optimisation (DPO) and Sequence Likelihood Calibration (SLiC) have emerged. In this paper, our contribution is two-fold. First, we show the equivalence between two recent alignment methods, namely Identity Policy Optimisation (IPO) and Nash Mirror Descent (Nash-MD). Second, we introduce a generalisation of IPO, named IPO-MD, that leverages the regularised sampling approach proposed by Nash-MD. This equivalence may seem surprising at first sight, since IPO is an offline method whereas Nash-MD is an online method using a preference model. However, this equivalence can be proven when we consider the online version of IPO, that is when both generations are sampled by the online policy and annotated by a trained preference model. Optimising the IPO loss with such a stream of data becomes then equivalent to finding the Nash equilibrium of the preference model through self-play. Building on this equivalence, we introduce the IPO-MD algorithm that generates data with a mixture policy (between the online and reference policy) similarly as the general Nash-MD algorithm. We compare online-IPO and IPO-MD to different online versions of existing losses on preference data such as DPO and SLiC on a summarisation task. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.08635v1-abstract-full').style.display = 'none'; document.getElementById('2403.08635v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. 
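One concrete way to picture the "mixture policy (between the online and reference policy)" mentioned above is a renormalised geometric mixture over candidate generations; the toy discrete response set and the mixing weight alpha below are assumptions for illustration, not the paper's exact construction over token sequences.

    import numpy as np

    def geometric_mixture(p_online, p_ref, alpha):
        """Renormalised geometric mixture p_online^(1-alpha) * p_ref^alpha."""
        w = (p_online ** (1.0 - alpha)) * (p_ref ** alpha)
        return w / w.sum()

    rng = np.random.default_rng(0)
    p_online = np.array([0.7, 0.2, 0.1])        # current policy over 3 candidate responses
    p_ref = np.array([0.3, 0.4, 0.3])           # frozen reference policy
    p_mix = geometric_mixture(p_online, p_ref, alpha=0.25)
    samples = rng.choice(3, size=5, p=p_mix)    # generations to be annotated by the preference model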
6. arXiv:2402.07598 [pdf, other] (cs.LG, stat.ML)
   Near-Minimax-Optimal Distributional Reinforcement Learning with a Generative Model
   Authors: Mark Rowland, Li Kevin Wenliang, Rémi Munos, Clare Lyle, Yunhao Tang, Will Dabney
   Abstract: We propose a new algorithm for model-based distributional reinforcement learning (RL), and prove that it is minimax-optimal for approximating return distributions with a generative model (up to logarithmic factors), resolving an open question of Zhang et al. (2023). Our analysis provides new theoretical results on categorical approaches to distributional RL, and also introduces a new distributional Bellman equation, the stochastic categorical CDF Bellman equation, which we expect to be of independent interest. We also provide an experimental study comparing several model-based distributional RL algorithms, with several takeaways for practitioners.
   Submitted 4 November, 2024; v1 submitted 12 February, 2024; originally announced February 2024.
   Comments: NeurIPS 2024

7. arXiv:2402.05766 [pdf, other] (cs.LG)
   Off-policy Distributional Q($\lambda$): Distributional RL without Importance Sampling
   Authors: Yunhao Tang, Mark Rowland, Rémi Munos, Bernardo Ávila Pires, Will Dabney
   Abstract: We introduce off-policy distributional Q($\lambda$), a new addition to the family of off-policy distributional evaluation algorithms. Off-policy distributional Q($\lambda$) does not apply importance sampling for off-policy learning, which introduces intriguing interactions with signed measures. Such unique properties distinguish distributional Q($\lambda$) from other existing alternatives such as distributional Retrace. We characterize the algorithmic properties of distributional Q($\lambda$) and validate theoretical insights with tabular experiments. We show how distributional Q($\lambda$)-C51, a combination of Q($\lambda$) with the C51 agent, exhibits promising results on deep RL benchmarks.
   Submitted 8 February, 2024; originally announced February 2024.
8. arXiv:2402.05749 [pdf, other] (cs.LG, cs.AI)
   Generalized Preference Optimization: A Unified Approach to Offline Alignment
   Authors: Yunhao Tang, Zhaohan Daniel Guo, Zeyu Zheng, Daniele Calandriello, Rémi Munos, Mark Rowland, Pierre Harvey Richemond, Michal Valko, Bernardo Ávila Pires, Bilal Piot
   Abstract: Offline preference optimization allows fine-tuning large models directly from offline data, and has proved effective in recent alignment practices. We propose generalized preference optimization (GPO), a family of offline losses parameterized by a general class of convex functions. GPO enables a unified view over preference optimization, encompassing existing algorithms such as DPO, IPO and SLiC as special cases, while naturally introducing new variants. The GPO framework also sheds light on how offline algorithms enforce regularization, through the design of the convex function that defines the loss. Our analysis and experiments reveal the connections and subtle differences between the offline regularization and the KL divergence regularization intended by the canonical RLHF formulation. In a controlled setting akin to Gao et al. (2023), we also show that different GPO variants achieve similar trade-offs between regularization and performance, though the optimal values of the hyper-parameters might differ as predicted by theory. In all, our results present new algorithmic toolkits and empirical insights to alignment practitioners.
   Submitted 28 May, 2024; v1 submitted 8 February, 2024; originally announced February 2024.
   Comments: Accepted at ICML 2023 main conference

9. arXiv:2312.00886 [pdf, other] (stat.ML, cs.AI, cs.GT, cs.LG, cs.MA)
   Nash Learning from Human Feedback
   Authors: Rémi Munos, Michal Valko, Daniele Calandriello, Mohammad Gheshlaghi Azar, Mark Rowland, Zhaohan Daniel Guo, Yunhao Tang, Matthieu Geist, Thomas Mesnard, Andrea Michi, Marco Selvi, Sertan Girgin, Nikola Momchev, Olivier Bachem, Daniel J. Mankowitz, Doina Precup, Bilal Piot
   Abstract: Reinforcement learning from human feedback (RLHF) has emerged as the main paradigm for aligning large language models (LLMs) with human preferences. Typically, RLHF involves the initial step of learning a reward model from human feedback, often expressed as preferences between pairs of text generations produced by a pre-trained LLM. Subsequently, the LLM's policy is fine-tuned by optimizing it to maximize the reward model through a reinforcement learning algorithm. However, an inherent limitation of current reward models is their inability to fully represent the richness of human preferences and their dependency on the sampling distribution. In this study, we introduce an alternative pipeline for the fine-tuning of LLMs using pairwise human feedback. Our approach entails the initial learning of a preference model, which is conditioned on two inputs given a prompt, followed by the pursuit of a policy that consistently generates responses preferred over those generated by any competing policy, thus defining the Nash equilibrium of this preference model. We term this approach Nash learning from human feedback (NLHF). In the context of a tabular policy representation, we present a novel algorithmic solution, Nash-MD, founded on the principles of mirror descent. This algorithm produces a sequence of policies, with the last iterate converging to the regularized Nash equilibrium. Additionally, we explore parametric representations of policies and introduce gradient descent algorithms for deep-learning architectures. To demonstrate the effectiveness of our approach, we present experimental results involving the fine-tuning of an LLM for a text summarization task. We believe NLHF offers a compelling avenue for preference learning and policy optimization with the potential of advancing the field of aligning LLMs with human preferences.
   Submitted 11 June, 2024; v1 submitted 1 December, 2023; originally announced December 2023.
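In the tabular case, the "Nash equilibrium of the preference model" idea can be pictured with plain self-play mirror descent (multiplicative weights) on a pairwise preference matrix. The sketch below is only that generic game-theoretic device, with an average-iterate readout assumed for simplicity; the regularisation towards a reference policy and the last-iterate guarantees of Nash-MD are the paper's and are not reproduced here.

    import numpy as np

    def self_play_mirror_descent(P, steps=500, eta=0.5):
        """P[i, j] = probability that response i is preferred to response j."""
        n = P.shape[0]
        pi = np.full(n, 1.0 / n)
        avg = np.zeros(n)
        for _ in range(steps):
            payoff = P @ pi                 # win rate of each response against the current policy
            pi = pi * np.exp(eta * payoff)  # exponentiated-gradient (mirror-descent) step
            pi /= pi.sum()
            avg += pi
        return avg / steps                  # average iterate approximates the Nash equilibrium

    P = np.array([[0.5, 0.8, 0.3],
                  [0.2, 0.5, 0.6],
                  [0.7, 0.4, 0.5]])
    print(self_play_mirror_descent(P))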
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.18186">arXiv:2310.18186</a> <span> [<a href="https://arxiv.org/pdf/2310.18186">pdf</a>, <a href="https://arxiv.org/format/2310.18186">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Model-free Posterior Sampling via Learning Rate Randomization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tiapkin%2C+D">Daniil Tiapkin</a>, <a href="/search/cs?searchtype=author&query=Belomestny%2C+D">Denis Belomestny</a>, <a href="/search/cs?searchtype=author&query=Calandriello%2C+D">Daniele Calandriello</a>, <a href="/search/cs?searchtype=author&query=Moulines%2C+E">Eric Moulines</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">Remi Munos</a>, <a href="/search/cs?searchtype=author&query=Naumov%2C+A">Alexey Naumov</a>, <a href="/search/cs?searchtype=author&query=Perrault%2C+P">Pierre Perrault</a>, <a href="/search/cs?searchtype=author&query=Valko%2C+M">Michal Valko</a>, <a href="/search/cs?searchtype=author&query=Menard%2C+P">Pierre Menard</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.18186v1-abstract-short" style="display: inline;"> In this paper, we introduce Randomized Q-learning (RandQL), a novel randomized model-free algorithm for regret minimization in episodic Markov Decision Processes (MDPs). To the best of our knowledge, RandQL is the first tractable model-free posterior sampling-based algorithm. We analyze the performance of RandQL in both tabular and non-tabular metric space settings. In tabular MDPs, RandQL achieve… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.18186v1-abstract-full').style.display = 'inline'; document.getElementById('2310.18186v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.18186v1-abstract-full" style="display: none;"> In this paper, we introduce Randomized Q-learning (RandQL), a novel randomized model-free algorithm for regret minimization in episodic Markov Decision Processes (MDPs). To the best of our knowledge, RandQL is the first tractable model-free posterior sampling-based algorithm. We analyze the performance of RandQL in both tabular and non-tabular metric space settings. In tabular MDPs, RandQL achieves a regret bound of order $\widetilde{\mathcal{O}}(\sqrt{H^{5}SAT})$, where $H$ is the planning horizon, $S$ is the number of states, $A$ is the number of actions, and $T$ is the number of episodes. For a metric state-action space, RandQL enjoys a regret bound of order $\widetilde{\mathcal{O}}(H^{5/2} T^{(d_z+1)/(d_z+2)})$, where $d_z$ denotes the zooming dimension. Notably, RandQL achieves optimistic exploration without using bonuses, relying instead on a novel idea of learning rate randomization. Our empirical study shows that RandQL outperforms existing approaches on baseline exploration environments. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.18186v1-abstract-full').style.display = 'none'; document.getElementById('2310.18186v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS-2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.12036">arXiv:2310.12036</a> <span> [<a href="https://arxiv.org/pdf/2310.12036">pdf</a>, <a href="https://arxiv.org/format/2310.12036">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> A General Theoretical Paradigm to Understand Learning from Human Preferences </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Azar%2C+M+G">Mohammad Gheshlaghi Azar</a>, <a href="/search/cs?searchtype=author&query=Rowland%2C+M">Mark Rowland</a>, <a href="/search/cs?searchtype=author&query=Piot%2C+B">Bilal Piot</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+D">Daniel Guo</a>, <a href="/search/cs?searchtype=author&query=Calandriello%2C+D">Daniele Calandriello</a>, <a href="/search/cs?searchtype=author&query=Valko%2C+M">Michal Valko</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">R茅mi Munos</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.12036v2-abstract-short" style="display: inline;"> The prevalent deployment of learning from human preferences through reinforcement learning (RLHF) relies on two important approximations: the first assumes that pairwise preferences can be substituted with pointwise rewards. The second assumes that a reward model trained on these pointwise rewards can generalize from collected data to out-of-distribution data sampled by the policy. Recently, Direc… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.12036v2-abstract-full').style.display = 'inline'; document.getElementById('2310.12036v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.12036v2-abstract-full" style="display: none;"> The prevalent deployment of learning from human preferences through reinforcement learning (RLHF) relies on two important approximations: the first assumes that pairwise preferences can be substituted with pointwise rewards. The second assumes that a reward model trained on these pointwise rewards can generalize from collected data to out-of-distribution data sampled by the policy. 
Recently, Direct Preference Optimisation (DPO) has been proposed as an approach that bypasses the second approximation and learn directly a policy from collected data without the reward modelling stage. However, this method still heavily relies on the first approximation. In this paper we try to gain a deeper theoretical understanding of these practical algorithms. In particular we derive a new general objective called $唯$PO for learning from human preferences that is expressed in terms of pairwise preferences and therefore bypasses both approximations. This new general objective allows us to perform an in-depth analysis of the behavior of RLHF and DPO (as special cases of $唯$PO) and to identify their potential pitfalls. We then consider another special case for $唯$PO by setting $唯$ simply to Identity, for which we can derive an efficient optimisation procedure, prove performance guarantees and demonstrate its empirical superiority to DPO on some illustrative examples. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.12036v2-abstract-full').style.display = 'none'; document.getElementById('2310.12036v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.00656">arXiv:2309.00656</a> <span> [<a href="https://arxiv.org/pdf/2309.00656">pdf</a>, <a href="https://arxiv.org/format/2309.00656">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Local and adaptive mirror descents in extensive-form games </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fiegel%2C+C">C么me Fiegel</a>, <a href="/search/cs?searchtype=author&query=M%C3%A9nard%2C+P">Pierre M茅nard</a>, <a href="/search/cs?searchtype=author&query=Kozuno%2C+T">Tadashi Kozuno</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">R茅mi Munos</a>, <a href="/search/cs?searchtype=author&query=Perchet%2C+V">Vianney Perchet</a>, <a href="/search/cs?searchtype=author&query=Valko%2C+M">Michal Valko</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.00656v1-abstract-short" style="display: inline;"> We study how to learn $蔚$-optimal strategies in zero-sum imperfect information games (IIG) with trajectory feedback. In this setting, players update their policies sequentially based on their observations over a fixed number of episodes, denoted by $T$. 
arXiv:2309.00656 [pdf, other] (cs.GT, cs.LG, stat.ML)
Local and adaptive mirror descents in extensive-form games
Authors: Côme Fiegel, Pierre Ménard, Tadashi Kozuno, Rémi Munos, Vianney Perchet, Michal Valko
Abstract: We study how to learn $ε$-optimal strategies in zero-sum imperfect information games (IIG) with trajectory feedback. In this setting, players update their policies sequentially based on their observations over a fixed number of episodes, denoted by $T$. Existing procedures suffer from high variance due to the use of importance sampling over sequences of actions (Steinberger et al., 2020; McAleer et al., 2022). To reduce this variance, we consider a fixed sampling approach, where players still update their policies over time, but with observations obtained through a given fixed sampling policy. Our approach is based on an adaptive Online Mirror Descent (OMD) algorithm that applies OMD locally to each information set, using individually decreasing learning rates and a regularized loss. We show that this approach guarantees a convergence rate of $\tilde{\mathcal{O}}(T^{-1/2})$ with high probability and has a near-optimal dependence on the game parameters when applied with the best theoretical choices of learning rates and sampling policies. To achieve these results, we generalize the notion of OMD stabilization, allowing for time-varying regularization with convex increments.
Submitted 1 September, 2023; originally announced September 2023.
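To make the local mirror-descent step concrete, here is a minimal sketch of an entropy-regularised OMD (exponentiated-gradient) update on a single information set with a decaying, per-information-set learning rate; the schedule and the loss estimates are placeholders rather than the paper's exact construction.

```python
import numpy as np

def omd_step(policy, loss_vec, t, c=1.0):
    """One exponentiated-gradient (entropy-regularised OMD) update on a
    single information set.

    policy   : current local strategy, a probability vector over actions.
    loss_vec : estimated losses for each action at this information set.
    t        : number of updates already performed locally (for the decay).
    c        : base learning-rate constant (placeholder value).
    """
    eta = c / np.sqrt(t + 1)          # individually decreasing learning rate
    logits = np.log(policy) - eta * loss_vec
    new_policy = np.exp(logits - logits.max())
    return new_policy / new_policy.sum()

p = np.full(3, 1 / 3)                 # uniform start over three actions
for t in range(5):
    p = omd_step(p, np.array([0.2, 0.5, 0.1]), t)
print(p)                              # mass shifts towards the low-loss action
```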
arXiv:2305.18501 [pdf, other] (cs.LG)
DoMo-AC: Doubly Multi-step Off-policy Actor-Critic Algorithm
Authors: Yunhao Tang, Tadashi Kozuno, Mark Rowland, Anna Harutyunyan, Rémi Munos, Bernardo Ávila Pires, Michal Valko
Abstract: Multi-step learning applies lookahead over multiple time steps and has proved valuable in policy evaluation settings. However, in the optimal control case, the impact of multi-step learning has been relatively limited despite a number of prior efforts. Fundamentally, this might be because multi-step policy improvements require operations that cannot be approximated by stochastic samples, hence hindering the widespread adoption of such methods in practice. To address such limitations, we introduce doubly multi-step off-policy VI (DoMo-VI), a novel oracle algorithm that combines multi-step policy improvements and policy evaluations. DoMo-VI enjoys guaranteed convergence speed-up to the optimal policy and is applicable in general off-policy learning settings. We then propose doubly multi-step off-policy actor-critic (DoMo-AC), a practical instantiation of the DoMo-VI algorithm. DoMo-AC introduces a bias-variance trade-off that ensures improved policy gradient estimates. When combined with the IMPALA architecture, DoMo-AC has shown improvements over the baseline algorithm on Atari-57 game benchmarks.
Submitted 29 May, 2023; originally announced May 2023.

arXiv:2305.18491 [pdf, other] (cs.LG)
Towards a Better Understanding of Representation Dynamics under TD-learning
Authors: Yunhao Tang, Rémi Munos
Abstract: TD-learning is a foundational reinforcement learning (RL) algorithm for value prediction. Critical to the accuracy of value predictions is the quality of state representations. In this work, we consider the question: how does end-to-end TD-learning impact the representation over time? Complementary to prior work, we provide a set of analyses that shed further light on the representation dynamics under TD-learning. We first show that when the environments are reversible, end-to-end TD-learning strictly decreases the value approximation error over time. Under further assumptions on the environments, we can connect the representation dynamics with spectral decomposition over the transition matrix. This latter finding establishes fitting multiple value functions from randomly generated rewards as a useful auxiliary task for representation learning, as we empirically validate on both tabular and Atari game suites.
Submitted 29 May, 2023; originally announced May 2023.

arXiv:2305.18388 [pdf, other] (cs.LG, stat.ML)
The Statistical Benefits of Quantile Temporal-Difference Learning for Value Estimation
Authors: Mark Rowland, Yunhao Tang, Clare Lyle, Rémi Munos, Marc G. Bellemare, Will Dabney
Abstract: We study the problem of temporal-difference-based policy evaluation in reinforcement learning. In particular, we analyse the use of a distributional reinforcement learning algorithm, quantile temporal-difference learning (QTD), for this task. We reach the surprising conclusion that even if a practitioner has no interest in the return distribution beyond the mean, QTD (which learns predictions about the full distribution of returns) may offer performance superior to approaches such as classical TD learning, which predict only the mean return, even in the tabular setting.
Submitted 28 May, 2023; originally announced May 2023.
Comments: ICML 2023
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.18161v2-abstract-full').style.display = 'none'; document.getElementById('2305.18161v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ICML 2023 as a conference paper</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.13185">arXiv:2305.13185</a> <span> [<a href="https://arxiv.org/pdf/2305.13185">pdf</a>, <a href="https://arxiv.org/format/2305.13185">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Regularization and Variance-Weighted Regression Achieves Minimax Optimality in Linear MDPs: Theory and Practice </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kitamura%2C+T">Toshinori Kitamura</a>, <a href="/search/cs?searchtype=author&query=Kozuno%2C+T">Tadashi Kozuno</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Y">Yunhao Tang</a>, <a href="/search/cs?searchtype=author&query=Vieillard%2C+N">Nino Vieillard</a>, <a href="/search/cs?searchtype=author&query=Valko%2C+M">Michal Valko</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+W">Wenhao Yang</a>, <a href="/search/cs?searchtype=author&query=Mei%2C+J">Jincheng Mei</a>, <a href="/search/cs?searchtype=author&query=M%C3%A9nard%2C+P">Pierre M茅nard</a>, <a href="/search/cs?searchtype=author&query=Azar%2C+M+G">Mohammad Gheshlaghi Azar</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">R茅mi Munos</a>, <a href="/search/cs?searchtype=author&query=Pietquin%2C+O">Olivier Pietquin</a>, <a href="/search/cs?searchtype=author&query=Geist%2C+M">Matthieu Geist</a>, <a href="/search/cs?searchtype=author&query=Szepesv%C3%A1ri%2C+C">Csaba Szepesv谩ri</a>, <a href="/search/cs?searchtype=author&query=Kumagai%2C+W">Wataru Kumagai</a>, <a href="/search/cs?searchtype=author&query=Matsuo%2C+Y">Yutaka Matsuo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.13185v1-abstract-short" style="display: inline;"> Mirror descent value iteration (MDVI), an abstraction of Kullback-Leibler (KL) and entropy-regularized reinforcement learning (RL), has served as the basis for recent high-performing practical RL algorithms. However, despite the use of function approximation in practice, the theoretical understanding of MDVI has been limited to tabular Markov decision processes (MDPs). 
arXiv:2305.13185 [pdf, other] (cs.LG)
Regularization and Variance-Weighted Regression Achieves Minimax Optimality in Linear MDPs: Theory and Practice
Authors: Toshinori Kitamura, Tadashi Kozuno, Yunhao Tang, Nino Vieillard, Michal Valko, Wenhao Yang, Jincheng Mei, Pierre Ménard, Mohammad Gheshlaghi Azar, Rémi Munos, Olivier Pietquin, Matthieu Geist, Csaba Szepesvári, Wataru Kumagai, Yutaka Matsuo
Abstract: Mirror descent value iteration (MDVI), an abstraction of Kullback-Leibler (KL) and entropy-regularized reinforcement learning (RL), has served as the basis for recent high-performing practical RL algorithms. However, despite the use of function approximation in practice, the theoretical understanding of MDVI has been limited to tabular Markov decision processes (MDPs). We study MDVI with linear function approximation through its sample complexity required to identify an $\varepsilon$-optimal policy with probability $1-δ$ under the settings of an infinite-horizon linear MDP, generative model, and G-optimal design. We demonstrate that least-squares regression weighted by the variance of an estimated optimal value function of the next state is crucial to achieving minimax optimality. Based on this observation, we present Variance-Weighted Least-Squares MDVI (VWLS-MDVI), the first theoretical algorithm that achieves nearly minimax optimal sample complexity for infinite-horizon linear MDPs. Furthermore, we propose a practical VWLS algorithm for value-based deep RL, Deep Variance Weighting (DVW). Our experiments demonstrate that DVW improves the performance of popular value-based deep RL algorithms on a set of MinAtar benchmarks.
Submitted 22 May, 2023; originally announced May 2023.
Comments: ICML 2023 accepted
arXiv:2305.00654 [pdf, other] (cs.LG, cs.AI)
Representations and Exploration for Deep Reinforcement Learning using Singular Value Decomposition
Authors: Yash Chandak, Shantanu Thakoor, Zhaohan Daniel Guo, Yunhao Tang, Remi Munos, Will Dabney, Diana L Borsa
Abstract: Representation learning and exploration are among the key challenges for any deep reinforcement learning agent. In this work, we provide a singular value decomposition based method that can be used to obtain representations that preserve the underlying transition structure in the domain. Perhaps interestingly, we show that these representations also capture the relative frequency of state visitations, thereby providing an estimate for pseudo-counts for free. To scale this decomposition method to large-scale domains, we provide an algorithm that never requires building the transition matrix, can make use of deep networks, and also permits mini-batch training. Further, we draw inspiration from predictive state representations and extend our decomposition method to partially observable environments. With experiments on multi-task settings with partially observable domains, we show that the proposed method can not only learn useful representations on DM-Lab-30 environments (which have inputs involving language instructions, pixel images, and rewards, among others) but can also be effective at hard exploration tasks in DM-Hard-8 environments.
Submitted 2 May, 2023; v1 submitted 1 May, 2023; originally announced May 2023.
Comments: Accepted at the 40th International Conference on Machine Learning (ICML 2023)
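A toy, tabular rendering of the decomposition described above: build an empirical transition matrix from logged transitions and take its SVD, using the leading singular vectors as state features. The paper's contribution is precisely about avoiding the explicit matrix and scaling with deep networks, so this snippet is only the conceptual starting point.

```python
import numpy as np

def svd_state_representations(transitions, num_states, dim):
    """Toy tabular illustration: SVD of an empirical transition matrix.

    transitions : iterable of (s, s_next) index pairs from logged experience.
    Returns a (num_states, dim) matrix whose rows serve as state features.
    """
    counts = np.zeros((num_states, num_states))
    for s, s_next in transitions:
        counts[s, s_next] += 1.0
    row_sums = counts.sum(axis=1, keepdims=True)
    P = np.divide(counts, row_sums, out=np.zeros_like(counts), where=row_sums > 0)
    U, S, Vt = np.linalg.svd(P)
    return U[:, :dim] * S[:dim]              # scale features by singular values

feats = svd_state_representations([(0, 1), (1, 2), (2, 0), (0, 1)], num_states=3, dim=2)
print(feats.shape)                           # (3, 2)
```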
arXiv:2303.08059 [pdf, other] (stat.ML, cs.LG)
Fast Rates for Maximum Entropy Exploration
Authors: Daniil Tiapkin, Denis Belomestny, Daniele Calandriello, Eric Moulines, Remi Munos, Alexey Naumov, Pierre Perrault, Yunhao Tang, Michal Valko, Pierre Menard
Abstract: We address the challenge of exploration in reinforcement learning (RL) when the agent operates in an unknown environment with sparse or no rewards. In this work, we study the maximum entropy exploration problem of two different types. The first type is visitation entropy maximization, previously considered by Hazan et al. (2019) in the discounted setting. For this type of exploration, we propose a game-theoretic algorithm that has $\widetilde{\mathcal{O}}(H^3S^2A/\varepsilon^2)$ sample complexity, thus improving the $\varepsilon$-dependence upon existing results, where $S$ is the number of states, $A$ is the number of actions, $H$ is the episode length, and $\varepsilon$ is the desired accuracy. The second type of entropy we study is the trajectory entropy. This objective function is closely related to entropy-regularized MDPs, and we propose a simple algorithm that has a sample complexity of order $\widetilde{\mathcal{O}}(\mathrm{poly}(S,A,H)/\varepsilon)$. Interestingly, it is the first theoretical result in the RL literature that establishes the potential statistical advantage of regularized MDPs for exploration. Finally, we apply the developed regularization techniques to reduce the sample complexity of visitation entropy maximization to $\widetilde{\mathcal{O}}(H^2SA/\varepsilon^2)$, yielding a statistical separation between maximum entropy exploration and reward-free exploration.
Submitted 6 June, 2023; v1 submitted 14 March, 2023; originally announced March 2023.
Comments: ICML-2023

arXiv:2301.04462 [pdf, other] (cs.LG, stat.ML)
An Analysis of Quantile Temporal-Difference Learning
Authors: Mark Rowland, Rémi Munos, Mohammad Gheshlaghi Azar, Yunhao Tang, Georg Ostrovski, Anna Harutyunyan, Karl Tuyls, Marc G. Bellemare, Will Dabney
Abstract: We analyse quantile temporal-difference learning (QTD), a distributional reinforcement learning algorithm that has proven to be a key component in several successful large-scale applications of reinforcement learning. Despite these empirical successes, a theoretical understanding of QTD has proven elusive until now. Unlike classical TD learning, which can be analysed with standard stochastic approximation tools, QTD updates do not approximate contraction mappings, are highly non-linear, and may have multiple fixed points. The core result of this paper is a proof of convergence to the fixed points of a related family of dynamic programming procedures with probability 1, putting QTD on firm theoretical footing. The proof establishes connections between QTD and non-linear differential inclusions through stochastic approximation theory and non-smooth analysis.
Submitted 20 May, 2024; v1 submitted 11 January, 2023; originally announced January 2023.
Comments: Accepted to JMLR

arXiv:2212.12567 [pdf, other] (stat.ML, cs.LG)
Adapting to game trees in zero-sum imperfect information games
Authors: Côme Fiegel, Pierre Ménard, Tadashi Kozuno, Rémi Munos, Vianney Perchet, Michal Valko
Abstract: Imperfect information games (IIG) are games in which each player only partially observes the current game state. We study how to learn $ε$-optimal strategies in a zero-sum IIG through self-play with trajectory feedback. We give a problem-independent lower bound $\widetilde{\mathcal{O}}(H(A_{\mathcal{X}}+B_{\mathcal{Y}})/ε^2)$ on the required number of realizations to learn these strategies with high probability, where $H$ is the length of the game, and $A_{\mathcal{X}}$ and $B_{\mathcal{Y}}$ are the total numbers of actions for the two players. We also propose two Follow the Regularized Leader (FTRL) algorithms for this setting: Balanced FTRL, which matches this lower bound but requires knowledge of the information set structure beforehand to define the regularization; and Adaptive FTRL, which needs $\widetilde{\mathcal{O}}(H^2(A_{\mathcal{X}}+B_{\mathcal{Y}})/ε^2)$ realizations without this requirement, by progressively adapting the regularization to the observations.
Submitted 15 February, 2023; v1 submitted 23 December, 2022; originally announced December 2022.

arXiv:2212.03319 [pdf, other] (cs.LG, cs.AI)
Understanding Self-Predictive Learning for Reinforcement Learning
Authors: Yunhao Tang, Zhaohan Daniel Guo, Pierre Harvey Richemond, Bernardo Ávila Pires, Yash Chandak, Rémi Munos, Mark Rowland, Mohammad Gheshlaghi Azar, Charline Le Lan, Clare Lyle, András György, Shantanu Thakoor, Will Dabney, Bilal Piot, Daniele Calandriello, Michal Valko
Abstract: We study the learning dynamics of self-predictive learning for reinforcement learning, a family of algorithms that learn representations by minimizing the prediction error of their own future latent representations. Despite its recent empirical success, such algorithms have an apparent defect: trivial representations (such as constants) minimize the prediction error, yet it is obviously undesirable to converge to such solutions. Our central insight is that careful designs of the optimization dynamics are critical to learning meaningful representations. We identify that a faster-paced optimization of the predictor and semi-gradient updates on the representation are crucial to preventing representation collapse. Then, in an idealized setup, we show that self-predictive learning dynamics carry out spectral decomposition on the state transition matrix, effectively capturing information about the transition dynamics. Building on these theoretical insights, we propose bidirectional self-predictive learning, a novel self-predictive algorithm that learns two representations simultaneously. We examine the robustness of our theoretical insights with a number of small-scale experiments and showcase the promise of the novel representation learning algorithm with large-scale experiments.
Submitted 6 December, 2022; originally announced December 2022.
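The two ingredients the abstract highlights, a faster-paced predictor and semi-gradient updates on the representation, can be seen in a linear toy version of self-predictive learning; the learning rates and inner-loop count below are arbitrary illustrative choices.

```python
import numpy as np

def self_predictive_step(Phi, P, s, s_next, lr_phi=0.01, lr_pred=0.1, pred_steps=5):
    """One linear self-predictive learning step on a transition (s, s_next).

    Phi : (num_states, d) representation table; Phi[s] is the latent for s.
    P   : (d, d) linear predictor of the next latent.
    The target Phi[s_next] is treated as a constant (semi-gradient), and the
    predictor is optimised faster (several inner steps) than the representation.
    """
    target = Phi[s_next].copy()                  # stop-gradient on the target
    for _ in range(pred_steps):                  # faster-paced predictor updates
        err = P @ Phi[s] - target
        P -= lr_pred * np.outer(err, Phi[s])
    err = P @ Phi[s] - target
    Phi[s] -= lr_phi * (P.T @ err)               # semi-gradient w.r.t. Phi[s] only
    return Phi, P

Phi = np.random.default_rng(0).normal(size=(4, 2))
P = np.eye(2)
Phi, P = self_predictive_step(Phi, P, s=0, s_next=1)
```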
arXiv:2211.10515 [pdf, other] (stat.ML, cs.LG)
Curiosity in Hindsight: Intrinsic Exploration in Stochastic Environments
Authors: Daniel Jarrett, Corentin Tallec, Florent Altché, Thomas Mesnard, Rémi Munos, Michal Valko
Abstract: Consider the problem of exploration in sparse-reward or reward-free environments, such as in Montezuma's Revenge. In the curiosity-driven paradigm, the agent is rewarded for how much each realized outcome differs from their predicted outcome. But using predictive error as intrinsic motivation is fragile in stochastic environments, as the agent may become trapped by high-entropy areas of the state-action space, such as a "noisy TV". In this work, we study a natural solution derived from structural causal models of the world: our key idea is to learn representations of the future that capture precisely the unpredictable aspects of each outcome, which we use as additional input for predictions, such that intrinsic rewards only reflect the predictable aspects of world dynamics. First, we propose incorporating such hindsight representations into models to disentangle "noise" from "novelty", yielding Curiosity in Hindsight: a simple and scalable generalization of curiosity that is robust to stochasticity. Second, we instantiate this framework for the recently introduced BYOL-Explore algorithm as our prime example, resulting in the noise-robust BYOL-Hindsight. Third, we illustrate its behavior under a variety of different stochasticities in a grid world, and find improvements over BYOL-Explore in hard-exploration Atari games with sticky actions. Notably, we show state-of-the-art results in exploring Montezuma's Revenge with sticky actions, while preserving performance in the non-sticky setting.
Submitted 14 July, 2023; v1 submitted 18 November, 2022; originally announced November 2022.
Journal ref: In Proc. 40th International Conference on Machine Learning (ICML 2023)

arXiv:2209.14414 [pdf, other] (stat.ML, cs.LG)
Optimistic Posterior Sampling for Reinforcement Learning with Few Samples and Tight Guarantees
Authors: Daniil Tiapkin, Denis Belomestny, Daniele Calandriello, Eric Moulines, Remi Munos, Alexey Naumov, Mark Rowland, Michal Valko, Pierre Menard
Abstract: We consider reinforcement learning in an environment modeled by an episodic, finite, stage-dependent Markov decision process of horizon $H$ with $S$ states and $A$ actions. The performance of an agent is measured by the regret after interacting with the environment for $T$ episodes. We propose an optimistic posterior sampling algorithm for reinforcement learning (OPSRL), a simple variant of posterior sampling that only needs a number of posterior samples logarithmic in $H$, $S$, $A$, and $T$ per state-action pair. For OPSRL we guarantee a high-probability regret bound of order at most $\widetilde{\mathcal{O}}(\sqrt{H^3SAT})$, ignoring $\text{poly}\log(HSAT)$ terms. The key novel technical ingredient is a new sharp anti-concentration inequality for linear forms, which may be of independent interest. Specifically, we extend the normal approximation-based lower bound for Beta distributions by Alfers and Dinges [1984] to Dirichlet distributions. Our bound matches the lower bound of order $Ω(\sqrt{H^3SAT})$, thereby answering the open problems raised by Agrawal and Jia [2017b] for the episodic setting.
Submitted 28 September, 2022; originally announced September 2022.
Comments: arXiv admin note: text overlap with arXiv:2205.07704
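For a concrete picture of the posterior-sampling template that OPSRL builds on, here is a generic tabular sketch in which Dirichlet posteriors over transitions are sampled and the agent plans in the sampled model; OPSRL's optimistic modification and its per-state-action sample counts are not reproduced here.

```python
import numpy as np

def sample_model_and_plan(counts, rewards, horizon, rng):
    """Generic posterior sampling for a finite, episodic MDP (sketch).

    counts  : (S, A, S) transition counts from experience.
    rewards : (S, A) known (or separately estimated) mean rewards.
    Returns a greedy policy, shape (horizon, S), for the sampled model.
    """
    S, A, _ = counts.shape
    # Dirichlet(1 + counts) posterior sample for each (s, a)
    P = np.array([[rng.dirichlet(1.0 + counts[s, a]) for a in range(A)] for s in range(S)])
    V = np.zeros(S)
    policy = np.zeros((horizon, S), dtype=int)
    for h in reversed(range(horizon)):           # finite-horizon value iteration
        Q = rewards + P @ V                      # (S, A) action values
        policy[h] = Q.argmax(axis=1)
        V = Q.max(axis=1)
    return policy

rng = np.random.default_rng(0)
pi = sample_model_and_plan(np.zeros((3, 2, 3)), np.ones((3, 2)), horizon=4, rng=rng)
print(pi.shape)                                  # (4, 3)
```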
The… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.07570v1-abstract-full').style.display = 'inline'; document.getElementById('2207.07570v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2207.07570v1-abstract-full" style="display: none;"> We study the multi-step off-policy learning approach to distributional RL. Despite the apparent similarity between value-based RL and distributional RL, our study reveals intriguing and fundamental differences between the two cases in the multi-step setting. We identify a novel notion of path-dependent distributional TD error, which is indispensable for principled multi-step distributional RL. The distinction from the value-based case bears important implications on concepts such as backward-view algorithms. Our work provides the first theoretical guarantees on multi-step off-policy distributional RL algorithms, including results that apply to the small number of existing approaches to multi-step distributional RL. In addition, we derive a novel algorithm, Quantile Regression-Retrace, which leads to a deep RL agent QR-DQN-Retrace that shows empirical improvements over QR-DQN on the Atari-57 benchmark. Collectively, we shed light on how unique challenges in multi-step distributional RL can be addressed both in theory and practice. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.07570v1-abstract-full').style.display = 'none'; document.getElementById('2207.07570v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 July, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2022. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2206.15378">arXiv:2206.15378</a> <span> [<a href="https://arxiv.org/pdf/2206.15378">pdf</a>, <a href="https://arxiv.org/format/2206.15378">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1126/science.add4679">10.1126/science.add4679 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Mastering the Game of Stratego with Model-Free Multiagent Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Perolat%2C+J">Julien Perolat</a>, <a href="/search/cs?searchtype=author&query=de+Vylder%2C+B">Bart de Vylder</a>, <a href="/search/cs?searchtype=author&query=Hennes%2C+D">Daniel Hennes</a>, <a href="/search/cs?searchtype=author&query=Tarassov%2C+E">Eugene Tarassov</a>, <a href="/search/cs?searchtype=author&query=Strub%2C+F">Florian Strub</a>, <a href="/search/cs?searchtype=author&query=de+Boer%2C+V">Vincent de Boer</a>, <a href="/search/cs?searchtype=author&query=Muller%2C+P">Paul Muller</a>, <a href="/search/cs?searchtype=author&query=Connor%2C+J+T">Jerome T. Connor</a>, <a href="/search/cs?searchtype=author&query=Burch%2C+N">Neil Burch</a>, <a href="/search/cs?searchtype=author&query=Anthony%2C+T">Thomas Anthony</a>, <a href="/search/cs?searchtype=author&query=McAleer%2C+S">Stephen McAleer</a>, <a href="/search/cs?searchtype=author&query=Elie%2C+R">Romuald Elie</a>, <a href="/search/cs?searchtype=author&query=Cen%2C+S+H">Sarah H. Cen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhe Wang</a>, <a href="/search/cs?searchtype=author&query=Gruslys%2C+A">Audrunas Gruslys</a>, <a href="/search/cs?searchtype=author&query=Malysheva%2C+A">Aleksandra Malysheva</a>, <a href="/search/cs?searchtype=author&query=Khan%2C+M">Mina Khan</a>, <a href="/search/cs?searchtype=author&query=Ozair%2C+S">Sherjil Ozair</a>, <a href="/search/cs?searchtype=author&query=Timbers%2C+F">Finbarr Timbers</a>, <a href="/search/cs?searchtype=author&query=Pohlen%2C+T">Toby Pohlen</a>, <a href="/search/cs?searchtype=author&query=Eccles%2C+T">Tom Eccles</a>, <a href="/search/cs?searchtype=author&query=Rowland%2C+M">Mark Rowland</a>, <a href="/search/cs?searchtype=author&query=Lanctot%2C+M">Marc Lanctot</a>, <a href="/search/cs?searchtype=author&query=Lespiau%2C+J">Jean-Baptiste Lespiau</a>, <a href="/search/cs?searchtype=author&query=Piot%2C+B">Bilal Piot</a> , et al. (9 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2206.15378v1-abstract-short" style="display: inline;"> We introduce DeepNash, an autonomous agent capable of learning to play the imperfect information game Stratego from scratch, up to a human expert level. 
<span class="abstract-full has-text-grey-dark mathjax" id="2206.15378v1-abstract-full" style="display: inline;"> We introduce DeepNash, an autonomous agent capable of learning to play the imperfect information game Stratego from scratch, up to a human expert level. Stratego is one of the few iconic board games that Artificial Intelligence (AI) has not yet mastered. This popular game has an enormous game tree on the order of $10^{535}$ nodes, i.e., $10^{175}$ times larger than that of Go. It has the additional complexity of requiring decision-making under imperfect information, similar to Texas hold'em poker, which has a significantly smaller game tree (on the order of $10^{164}$ nodes). Decisions in Stratego are made over a large number of discrete actions with no obvious link between action and outcome. Episodes are long, with often hundreds of moves before a player wins, and situations in Stratego can not easily be broken down into manageably-sized sub-problems as in poker. For these reasons, Stratego has been a grand challenge for the field of AI for decades, and existing AI methods barely reach an amateur level of play. DeepNash uses a game-theoretic, model-free deep reinforcement learning method, without search, that learns to master Stratego via self-play. The Regularised Nash Dynamics (R-NaD) algorithm, a key component of DeepNash, converges to an approximate Nash equilibrium, instead of 'cycling' around it, by directly modifying the underlying multi-agent learning dynamics. DeepNash beats existing state-of-the-art AI methods in Stratego and achieved a yearly (2022) and all-time top-3 rank on the Gravon games platform, competing with human expert players. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 June, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2022.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2206.08736">arXiv:2206.08736</a> <span> [<a href="https://arxiv.org/pdf/2206.08736">pdf</a>, <a href="https://arxiv.org/format/2206.08736">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Generalised Policy Improvement with Geometric Policy Composition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Thakoor%2C+S">Shantanu Thakoor</a>, <a href="/search/cs?searchtype=author&query=Rowland%2C+M">Mark Rowland</a>, <a href="/search/cs?searchtype=author&query=Borsa%2C+D">Diana Borsa</a>, <a href="/search/cs?searchtype=author&query=Dabney%2C+W">Will Dabney</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">Rémi Munos</a>, <a href="/search/cs?searchtype=author&query=Barreto%2C+A">André Barreto</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2206.08736v1-abstract-full" style="display: inline;"> We introduce a method for policy improvement that interpolates between the greedy approach of value-based reinforcement learning (RL) and the full planning approach typical of model-based RL. The new method builds on the concept of a geometric horizon model (GHM, also known as a gamma-model), which models the discounted state-visitation distribution of a given policy. We show that we can evaluate any non-Markov policy that switches between a set of base Markov policies with fixed probability by a careful composition of the base policy GHMs, without any additional learning. We can then apply generalised policy improvement (GPI) to collections of such non-Markov policies to obtain a new Markov policy that will in general outperform its precursors. We provide a thorough theoretical analysis of this approach, develop applications to transfer and standard RL, and empirically demonstrate its effectiveness over standard GPI on a challenging deep RL continuous control task. We also provide an analysis of GHM training methods, proving a novel convergence result regarding previously proposed methods and showing how to train these models stably in deep RL settings.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.08736v1-abstract-full').style.display = 'none'; document.getElementById('2206.08736v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 June, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICML 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2206.08332">arXiv:2206.08332</a> <span> [<a href="https://arxiv.org/pdf/2206.08332">pdf</a>, <a href="https://arxiv.org/format/2206.08332">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> BYOL-Explore: Exploration by Bootstrapped Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+Z+D">Zhaohan Daniel Guo</a>, <a href="/search/cs?searchtype=author&query=Thakoor%2C+S">Shantanu Thakoor</a>, <a href="/search/cs?searchtype=author&query=P%C3%AEslar%2C+M">Miruna P卯slar</a>, <a href="/search/cs?searchtype=author&query=Pires%2C+B+A">Bernardo Avila Pires</a>, <a href="/search/cs?searchtype=author&query=Altch%C3%A9%2C+F">Florent Altch茅</a>, <a href="/search/cs?searchtype=author&query=Tallec%2C+C">Corentin Tallec</a>, <a href="/search/cs?searchtype=author&query=Saade%2C+A">Alaa Saade</a>, <a href="/search/cs?searchtype=author&query=Calandriello%2C+D">Daniele Calandriello</a>, <a href="/search/cs?searchtype=author&query=Grill%2C+J">Jean-Bastien Grill</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Y">Yunhao Tang</a>, <a href="/search/cs?searchtype=author&query=Valko%2C+M">Michal Valko</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">R茅mi Munos</a>, <a href="/search/cs?searchtype=author&query=Azar%2C+M+G">Mohammad Gheshlaghi Azar</a>, <a href="/search/cs?searchtype=author&query=Piot%2C+B">Bilal Piot</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2206.08332v1-abstract-short" style="display: inline;"> We present BYOL-Explore, a conceptually simple yet general approach for curiosity-driven exploration in visually-complex environments. BYOL-Explore learns a world representation, the world dynamics, and an exploration policy all-together by optimizing a single prediction loss in the latent space with no additional auxiliary objective. 
<span class="abstract-full has-text-grey-dark mathjax" id="2206.08332v1-abstract-full" style="display: inline;"> We present BYOL-Explore, a conceptually simple yet general approach for curiosity-driven exploration in visually-complex environments. BYOL-Explore learns a world representation, the world dynamics, and an exploration policy all-together by optimizing a single prediction loss in the latent space with no additional auxiliary objective. We show that BYOL-Explore is effective in DM-HARD-8, a challenging partially-observable continuous-action hard-exploration benchmark with visually-rich 3-D environments. On this benchmark, we solve the majority of the tasks purely through augmenting the extrinsic reward with BYOL-Explore's intrinsic reward, whereas prior work could only get off the ground with human demonstrations. As further evidence of the generality of BYOL-Explore, we show that it achieves superhuman performance on the ten hardest exploration games in Atari while having a much simpler design than other competitive agents. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 June, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2022.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2205.14211">arXiv:2205.14211</a> <span> [<a href="https://arxiv.org/pdf/2205.14211">pdf</a>, <a href="https://arxiv.org/format/2205.14211">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> KL-Entropy-Regularized RL with a Generative Model is Minimax Optimal </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kozuno%2C+T">Tadashi Kozuno</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+W">Wenhao Yang</a>, <a href="/search/cs?searchtype=author&query=Vieillard%2C+N">Nino Vieillard</a>, <a href="/search/cs?searchtype=author&query=Kitamura%2C+T">Toshinori Kitamura</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Y">Yunhao Tang</a>, <a href="/search/cs?searchtype=author&query=Mei%2C+J">Jincheng Mei</a>, <a href="/search/cs?searchtype=author&query=M%C3%A9nard%2C+P">Pierre Ménard</a>, <a href="/search/cs?searchtype=author&query=Azar%2C+M+G">Mohammad Gheshlaghi Azar</a>, <a href="/search/cs?searchtype=author&query=Valko%2C+M">Michal Valko</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">Rémi Munos</a>, <a href="/search/cs?searchtype=author&query=Pietquin%2C+O">Olivier Pietquin</a>, <a href="/search/cs?searchtype=author&query=Geist%2C+M">Matthieu Geist</a>, <a href="/search/cs?searchtype=author&query=Szepesv%C3%A1ri%2C+C">Csaba Szepesvári</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2205.14211v1-abstract-full" style="display: inline;"> In this work, we consider and analyze the sample complexity of model-free reinforcement learning with a generative model. Particularly, we analyze mirror descent value iteration (MDVI) by Geist et al. (2019) and Vieillard et al. (2020a), which uses the Kullback-Leibler divergence and entropy regularization in its value and policy updates. Our analysis shows that it is nearly minimax-optimal for finding an $\varepsilon$-optimal policy when $\varepsilon$ is sufficiently small. This is the first theoretical result that demonstrates that a simple model-free algorithm without variance-reduction can be nearly minimax-optimal under the considered setting.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.14211v1-abstract-full').style.display = 'none'; document.getElementById('2205.14211v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">29 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2203.16177">arXiv:2203.16177</a> <span> [<a href="https://arxiv.org/pdf/2203.16177">pdf</a>, <a href="https://arxiv.org/format/2203.16177">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Marginalized Operators for Off-policy Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tang%2C+Y">Yunhao Tang</a>, <a href="/search/cs?searchtype=author&query=Rowland%2C+M">Mark Rowland</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">R茅mi Munos</a>, <a href="/search/cs?searchtype=author&query=Valko%2C+M">Michal Valko</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2203.16177v1-abstract-short" style="display: inline;"> In this work, we propose marginalized operators, a new class of off-policy evaluation operators for reinforcement learning. Marginalized operators strictly generalize generic multi-step operators, such as Retrace, as special cases. Marginalized operators also suggest a form of sample-based estimates with potential variance reduction, compared to sample-based estimates of the original multi-step op… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.16177v1-abstract-full').style.display = 'inline'; document.getElementById('2203.16177v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2203.16177v1-abstract-full" style="display: none;"> In this work, we propose marginalized operators, a new class of off-policy evaluation operators for reinforcement learning. Marginalized operators strictly generalize generic multi-step operators, such as Retrace, as special cases. Marginalized operators also suggest a form of sample-based estimates with potential variance reduction, compared to sample-based estimates of the original multi-step operators. We show that the estimates for marginalized operators can be computed in a scalable way, which also generalizes prior results on marginalized importance sampling as special cases. Finally, we empirically demonstrate that marginalized operators provide performance gains to off-policy evaluation and downstream policy optimization algorithms. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.16177v1-abstract-full').style.display = 'none'; document.getElementById('2203.16177v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at AISTATS 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2106.13125">arXiv:2106.13125</a> <span> [<a href="https://arxiv.org/pdf/2106.13125">pdf</a>, <a href="https://arxiv.org/format/2106.13125">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Unifying Gradient Estimators for Meta-Reinforcement Learning via Off-Policy Evaluation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tang%2C+Y">Yunhao Tang</a>, <a href="/search/cs?searchtype=author&query=Kozuno%2C+T">Tadashi Kozuno</a>, <a href="/search/cs?searchtype=author&query=Rowland%2C+M">Mark Rowland</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">R茅mi Munos</a>, <a href="/search/cs?searchtype=author&query=Valko%2C+M">Michal Valko</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2106.13125v2-abstract-short" style="display: inline;"> Model-agnostic meta-reinforcement learning requires estimating the Hessian matrix of value functions. This is challenging from an implementation perspective, as repeatedly differentiating policy gradient estimates may lead to biased Hessian estimates. In this work, we provide a unifying framework for estimating higher-order derivatives of value functions, based on off-policy evaluation. Our framew… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.13125v2-abstract-full').style.display = 'inline'; document.getElementById('2106.13125v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2106.13125v2-abstract-full" style="display: none;"> Model-agnostic meta-reinforcement learning requires estimating the Hessian matrix of value functions. This is challenging from an implementation perspective, as repeatedly differentiating policy gradient estimates may lead to biased Hessian estimates. In this work, we provide a unifying framework for estimating higher-order derivatives of value functions, based on off-policy evaluation. Our framework interprets a number of prior approaches as special cases and elucidates the bias and variance trade-off of Hessian estimates. This framework also opens the door to a new family of estimates, which can be easily implemented with auto-differentiation libraries, and lead to performance gains in practice. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.13125v2-abstract-full').style.display = 'none'; document.getElementById('2106.13125v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 November, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at Neural Information Processing Systems (NeurIPS), 2021. Code is available at https://github.com/robintyh1/neurips2021-meta-gradient-offpolicy-evaluation</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2106.06279">arXiv:2106.06279</a> <span> [<a href="https://arxiv.org/pdf/2106.06279">pdf</a>, <a href="https://arxiv.org/ps/2106.06279">ps</a>, <a href="https://arxiv.org/format/2106.06279">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Model-Free Learning for Two-Player Zero-Sum Partially Observable Markov Games with Perfect Recall </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kozuno%2C+T">Tadashi Kozuno</a>, <a href="/search/cs?searchtype=author&query=M%C3%A9nard%2C+P">Pierre M茅nard</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">R茅mi Munos</a>, <a href="/search/cs?searchtype=author&query=Valko%2C+M">Michal Valko</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2106.06279v1-abstract-short" style="display: inline;"> We study the problem of learning a Nash equilibrium (NE) in an imperfect information game (IIG) through self-play. Precisely, we focus on two-player, zero-sum, episodic, tabular IIG under the perfect-recall assumption where the only feedback is realizations of the game (bandit feedback). In particular, the dynamic of the IIG is not known -- we can only access it by sampling or interacting with a g… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.06279v1-abstract-full').style.display = 'inline'; document.getElementById('2106.06279v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2106.06279v1-abstract-full" style="display: none;"> We study the problem of learning a Nash equilibrium (NE) in an imperfect information game (IIG) through self-play. Precisely, we focus on two-player, zero-sum, episodic, tabular IIG under the perfect-recall assumption where the only feedback is realizations of the game (bandit feedback). In particular, the dynamic of the IIG is not known -- we can only access it by sampling or interacting with a game simulator. For this learning setting, we provide the Implicit Exploration Online Mirror Descent (IXOMD) algorithm. 
It is a model-free algorithm with a high-probability bound on the convergence rate to the NE of order $1/\sqrt{T}$ where $T$ is the number of played games. Moreover, IXOMD is computationally efficient as it needs to perform the updates only along the sampled trajectory. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2106.06170">arXiv:2106.06170</a> <span> [<a href="https://arxiv.org/pdf/2106.06170">pdf</a>, <a href="https://arxiv.org/format/2106.06170">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Taylor Expansion of Discount Factors </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tang%2C+Y">Yunhao Tang</a>, <a href="/search/cs?searchtype=author&query=Rowland%2C+M">Mark Rowland</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">Rémi Munos</a>, <a href="/search/cs?searchtype=author&query=Valko%2C+M">Michal Valko</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2106.06170v2-abstract-full" style="display: inline;"> In practical reinforcement learning (RL), the discount factor used for estimating value functions often differs from that used for defining the evaluation objective. In this work, we study the effect that this discrepancy of discount factors has during learning, and discover a family of objectives that interpolate value functions of two distinct discount factors. Our analysis suggests new ways for estimating value functions and performing policy optimization updates, which demonstrate empirical performance gains. This framework also leads to new insights on commonly-used deep RL heuristic modifications to policy optimization algorithms.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.06170v2-abstract-full').style.display = 'none'; document.getElementById('2106.06170v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at International Conference of Machine Learning (ICML), 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2106.03787">arXiv:2106.03787</a> <span> [<a href="https://arxiv.org/pdf/2106.03787">pdf</a>, <a href="https://arxiv.org/format/2106.03787">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> Concave Utility Reinforcement Learning: the Mean-Field Game Viewpoint </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Geist%2C+M">Matthieu Geist</a>, <a href="/search/cs?searchtype=author&query=P%C3%A9rolat%2C+J">Julien P茅rolat</a>, <a href="/search/cs?searchtype=author&query=Lauri%C3%A8re%2C+M">Mathieu Lauri猫re</a>, <a href="/search/cs?searchtype=author&query=Elie%2C+R">Romuald Elie</a>, <a href="/search/cs?searchtype=author&query=Perrin%2C+S">Sarah Perrin</a>, <a href="/search/cs?searchtype=author&query=Bachem%2C+O">Olivier Bachem</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">R茅mi Munos</a>, <a href="/search/cs?searchtype=author&query=Pietquin%2C+O">Olivier Pietquin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2106.03787v4-abstract-short" style="display: inline;"> Concave Utility Reinforcement Learning (CURL) extends RL from linear to concave utilities in the occupancy measure induced by the agent's policy. This encompasses not only RL but also imitation learning and exploration, among others. Yet, this more general paradigm invalidates the classical Bellman equations, and calls for new algorithms. Mean-field Games (MFGs) are a continuous approximation of m… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.03787v4-abstract-full').style.display = 'inline'; document.getElementById('2106.03787v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2106.03787v4-abstract-full" style="display: none;"> Concave Utility Reinforcement Learning (CURL) extends RL from linear to concave utilities in the occupancy measure induced by the agent's policy. This encompasses not only RL but also imitation learning and exploration, among others. Yet, this more general paradigm invalidates the classical Bellman equations, and calls for new algorithms. Mean-field Games (MFGs) are a continuous approximation of many-agent RL. 
They consider the limit case of a continuous distribution of identical agents, anonymous with symmetric interests, and reduce the problem to the study of a single representative agent in interaction with the full population. Our core contribution consists in showing that CURL is a subclass of MFGs. We think this important to bridge together both communities. It also allows to shed light on aspects of both fields: we show the equivalence between concavity in CURL and monotonicity in the associated MFG, between optimality conditions in CURL and Nash equilibrium in MFG, or that Fictitious Play (FP) for this class of MFGs is simply Frank-Wolfe, bringing the first convergence rate for discrete-time FP for MFGs. We also experimentally demonstrate that, using algorithms recently introduced for solving MFGs, we can address the CURL problem more efficiently. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 February, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">AAMAS 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2103.00107">arXiv:2103.00107</a> <span> [<a href="https://arxiv.org/pdf/2103.00107">pdf</a>, <a href="https://arxiv.org/format/2103.00107">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Revisiting Peng's Q($λ$) for Modern Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kozuno%2C+T">Tadashi Kozuno</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Y">Yunhao Tang</a>, <a href="/search/cs?searchtype=author&query=Rowland%2C+M">Mark Rowland</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">Rémi Munos</a>, <a href="/search/cs?searchtype=author&query=Kapturowski%2C+S">Steven Kapturowski</a>, <a href="/search/cs?searchtype=author&query=Dabney%2C+W">Will Dabney</a>, <a href="/search/cs?searchtype=author&query=Valko%2C+M">Michal Valko</a>, <a href="/search/cs?searchtype=author&query=Abel%2C+D">David Abel</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>:
<span class="abstract-full has-text-grey-dark mathjax" id="2103.00107v1-abstract-full" style="display: inline;"> Off-policy multi-step reinforcement learning algorithms consist of conservative and non-conservative algorithms: the former actively cut traces, whereas the latter do not. Recently, Munos et al. (2016) proved the convergence of conservative algorithms to an optimal Q-function. In contrast, non-conservative algorithms are thought to be unsafe and have a limited or no theoretical guarantee. Nonetheless, recent studies have shown that non-conservative algorithms empirically outperform conservative ones. Motivated by the empirical results and the lack of theory, we carry out theoretical analyses of Peng's Q($λ$), a representative example of non-conservative algorithms. We prove that it also converges to an optimal policy provided that the behavior policy slowly tracks a greedy policy in a way similar to conservative policy iteration. Such a result has been conjectured to be true but has not been proven. We also experiment with Peng's Q($λ$) in complex continuous control tasks, confirming that Peng's Q($λ$) often outperforms conservative algorithms despite its simplicity. These results indicate that Peng's Q($λ$), which was thought to be unsafe, is a theoretically-sound and practically effective algorithm. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 February, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2021.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">26 pages, 7 figures, 2 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2102.06514">arXiv:2102.06514</a> <span> [<a href="https://arxiv.org/pdf/2102.06514">pdf</a>, <a href="https://arxiv.org/format/2102.06514">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Large-Scale Representation Learning on Graphs via Bootstrapping </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Thakoor%2C+S">Shantanu Thakoor</a>, <a href="/search/cs?searchtype=author&query=Tallec%2C+C">Corentin Tallec</a>, <a href="/search/cs?searchtype=author&query=Azar%2C+M+G">Mohammad Gheshlaghi Azar</a>, <a href="/search/cs?searchtype=author&query=Azabou%2C+M">Mehdi Azabou</a>, <a href="/search/cs?searchtype=author&query=Dyer%2C+E+L">Eva L. Dyer</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">R茅mi Munos</a>, <a href="/search/cs?searchtype=author&query=Veli%C4%8Dkovi%C4%87%2C+P">Petar Veli膷kovi膰</a>, <a href="/search/cs?searchtype=author&query=Valko%2C+M">Michal Valko</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2102.06514v3-abstract-short" style="display: inline;"> Self-supervised learning provides a promising path towards eliminating the need for costly label information in representation learning on graphs. However, to achieve state-of-the-art performance, methods often need large numbers of negative examples and rely on complex augmentations. This can be prohibitively expensive, especially for large graphs. To address these challenges, we introduce Bootst… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.06514v3-abstract-full').style.display = 'inline'; document.getElementById('2102.06514v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2102.06514v3-abstract-full" style="display: none;"> Self-supervised learning provides a promising path towards eliminating the need for costly label information in representation learning on graphs. However, to achieve state-of-the-art performance, methods often need large numbers of negative examples and rely on complex augmentations. This can be prohibitively expensive, especially for large graphs. To address these challenges, we introduce Bootstrapped Graph Latents (BGRL) - a graph representation learning method that learns by predicting alternative augmentations of the input. BGRL uses only simple augmentations and alleviates the need for contrasting with negative examples, and is thus scalable by design. BGRL outperforms or matches prior methods on several established benchmarks, while achieving a 2-10x reduction in memory costs. 
Furthermore, we show that BGRL can be scaled up to extremely large graphs with hundreds of millions of nodes in the semi-supervised regime - achieving state-of-the-art performance and improving over supervised baselines where representations are shaped only through label information. In particular, our solution centered on BGRL constituted one of the winning entries to the Open Graph Benchmark - Large Scale Challenge at KDD Cup 2021, on a graph orders of magnitudes larger than all previously available benchmarks, thus demonstrating the scalability and effectiveness of our approach. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 February, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published as a conference paper at ICLR 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2101.02055">arXiv:2101.02055</a> <span> [<a href="https://arxiv.org/pdf/2101.02055">pdf</a>, <a href="https://arxiv.org/format/2101.02055">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Geometric Entropic Exploration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+Z+D">Zhaohan Daniel Guo</a>, <a href="/search/cs?searchtype=author&query=Azar%2C+M+G">Mohammad Gheshlaghi Azar</a>, <a href="/search/cs?searchtype=author&query=Saade%2C+A">Alaa Saade</a>, <a href="/search/cs?searchtype=author&query=Thakoor%2C+S">Shantanu Thakoor</a>, <a href="/search/cs?searchtype=author&query=Piot%2C+B">Bilal Piot</a>, <a href="/search/cs?searchtype=author&query=Pires%2C+B+A">Bernardo Avila Pires</a>, <a href="/search/cs?searchtype=author&query=Valko%2C+M">Michal Valko</a>, <a href="/search/cs?searchtype=author&query=Mesnard%2C+T">Thomas Mesnard</a>, <a href="/search/cs?searchtype=author&query=Lattimore%2C+T">Tor Lattimore</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">Rémi Munos</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>:
<span class="abstract-full has-text-grey-dark mathjax" id="2101.02055v2-abstract-full" style="display: inline;"> Exploration is essential for solving complex Reinforcement Learning (RL) tasks. Maximum State-Visitation Entropy (MSVE) formulates the exploration problem as a well-defined policy optimization problem whose solution aims at visiting all states as uniformly as possible. This is in contrast to standard uncertainty-based approaches where exploration is transient and eventually vanishes. However, existing approaches to MSVE are theoretically justified only for discrete state-spaces as they are oblivious to the geometry of continuous domains. We address this challenge by introducing Geometric Entropy Maximisation (GEM), a new algorithm that maximises the geometry-aware Shannon entropy of state-visits in both discrete and continuous domains. Our key theoretical contribution is casting geometry-aware MSVE exploration as a tractable problem of optimising a simple and novel noise-contrastive objective function. In our experiments, we show the efficiency of GEM in solving several RL problems with sparse rewards, compared against other deep RL exploration approaches. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 January, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 January, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2021.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2011.09464">arXiv:2011.09464</a> <span> [<a href="https://arxiv.org/pdf/2011.09464">pdf</a>, <a href="https://arxiv.org/format/2011.09464">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Counterfactual Credit Assignment in Model-Free Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mesnard%2C+T">Thomas Mesnard</a>, <a href="/search/cs?searchtype=author&query=Weber%2C+T">Théophane Weber</a>, <a href="/search/cs?searchtype=author&query=Viola%2C+F">Fabio Viola</a>, <a href="/search/cs?searchtype=author&query=Thakoor%2C+S">Shantanu Thakoor</a>, <a href="/search/cs?searchtype=author&query=Saade%2C+A">Alaa Saade</a>, <a href="/search/cs?searchtype=author&query=Harutyunyan%2C+A">Anna Harutyunyan</a>, <a href="/search/cs?searchtype=author&query=Dabney%2C+W">Will Dabney</a>, <a href="/search/cs?searchtype=author&query=Stepleton%2C+T">Tom Stepleton</a>, <a href="/search/cs?searchtype=author&query=Heess%2C+N">Nicolas Heess</a>, <a href="/search/cs?searchtype=author&query=Guez%2C+A">Arthur Guez</a>, <a href="/search/cs?searchtype=author&query=Moulines%2C+%C3%89">Éric Moulines</a>, <a href="/search/cs?searchtype=author&query=Hutter%2C+M">Marcus Hutter</a>, <a href="/search/cs?searchtype=author&query=Buesing%2C+L">Lars Buesing</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">Rémi Munos</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2011.09464v2-abstract-full" style="display: inline;"> Credit assignment in reinforcement learning is the problem of measuring an action's influence on future rewards. In particular, this requires separating skill from luck, i.e. disentangling the effect of an action on rewards from that of external factors and subsequent actions. To achieve this, we adapt the notion of counterfactuals from causality theory to a model-free RL setup. The key idea is to condition value functions on future events, by learning to extract relevant information from a trajectory. We formulate a family of policy gradient algorithms that use these future-conditional value functions as baselines or critics, and show that they are provably low variance. To avoid the potential bias from conditioning on future information, we constrain the hindsight information to not contain information about the agent's actions.
We demonstrate the efficacy and validity of our algorithm on a number of illustrative and challenging problems. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 December, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 November, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2020. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2011.09192">arXiv:2011.09192</a> <span> [<a href="https://arxiv.org/pdf/2011.09192">pdf</a>, <a href="https://arxiv.org/format/2011.09192">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> Game Plan: What AI can do for Football, and What Football can do for AI </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tuyls%2C+K">Karl Tuyls</a>, <a href="/search/cs?searchtype=author&query=Omidshafiei%2C+S">Shayegan Omidshafiei</a>, <a href="/search/cs?searchtype=author&query=Muller%2C+P">Paul Muller</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhe Wang</a>, <a href="/search/cs?searchtype=author&query=Connor%2C+J">Jerome Connor</a>, <a href="/search/cs?searchtype=author&query=Hennes%2C+D">Daniel Hennes</a>, <a href="/search/cs?searchtype=author&query=Graham%2C+I">Ian Graham</a>, <a href="/search/cs?searchtype=author&query=Spearman%2C+W">William Spearman</a>, <a href="/search/cs?searchtype=author&query=Waskett%2C+T">Tim Waskett</a>, <a href="/search/cs?searchtype=author&query=Steele%2C+D">Dafydd Steele</a>, <a href="/search/cs?searchtype=author&query=Luc%2C+P">Pauline Luc</a>, <a href="/search/cs?searchtype=author&query=Recasens%2C+A">Adria Recasens</a>, <a href="/search/cs?searchtype=author&query=Galashov%2C+A">Alexandre Galashov</a>, <a href="/search/cs?searchtype=author&query=Thornton%2C+G">Gregory Thornton</a>, <a href="/search/cs?searchtype=author&query=Elie%2C+R">Romuald Elie</a>, <a href="/search/cs?searchtype=author&query=Sprechmann%2C+P">Pablo Sprechmann</a>, <a href="/search/cs?searchtype=author&query=Moreno%2C+P">Pol Moreno</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+K">Kris Cao</a>, <a href="/search/cs?searchtype=author&query=Garnelo%2C+M">Marta Garnelo</a>, <a href="/search/cs?searchtype=author&query=Dutta%2C+P">Praneet Dutta</a>, <a href="/search/cs?searchtype=author&query=Valko%2C+M">Michal Valko</a>, <a href="/search/cs?searchtype=author&query=Heess%2C+N">Nicolas Heess</a>, <a href="/search/cs?searchtype=author&query=Bridgland%2C+A">Alex Bridgland</a>, <a href="/search/cs?searchtype=author&query=Perolat%2C+J">Julien Perolat</a>, <a href="/search/cs?searchtype=author&query=De+Vylder%2C+B">Bart De Vylder</a> , et al.
(11 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2011.09192v1-abstract-full" style="display: inline;"> The rapid progress in artificial intelligence (AI) and machine learning has opened unprecedented analytics possibilities in various team and individual sports, including baseball, basketball, and tennis. More recently, AI techniques have been applied to football, due to a huge increase in data collection by professional teams, increased computational power, and advances in machine learning, with the goal of better addressing new scientific challenges involved in the analysis of both individual players' and coordinated teams' behaviors. The research challenges associated with predictive and prescriptive football analytics require new developments and progress at the intersection of statistical learning, game theory, and computer vision. In this paper, we provide an overarching perspective highlighting how the combination of these fields, in particular, forms a unique microcosm for AI research, while offering mutual benefits for professional teams, spectators, and broadcasters in the years to come. We illustrate that this duality makes football analytics a game changer of tremendous value, in terms of not only changing the game of football itself, but also in terms of what this domain can mean for the field of AI. We review the state-of-the-art and exemplify the types of analysis enabled by combining the aforementioned fields, including illustrative examples of counterfactual analysis using predictive models, and the combination of game-theoretic analysis of penalty kicks with statistical learning of player attributes. We conclude by highlighting envisioned downstream impacts, including possibilities for extensions to other sports (real and virtual). </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2020.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2008.12234">arXiv:2008.12234</a> <span> [<a href="https://arxiv.org/pdf/2008.12234">pdf</a>, <a href="https://arxiv.org/format/2008.12234">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> The Advantage Regret-Matching Actor-Critic </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gruslys%2C+A">Audrūnas Gruslys</a>, <a href="/search/cs?searchtype=author&query=Lanctot%2C+M">Marc Lanctot</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">Rémi Munos</a>, <a href="/search/cs?searchtype=author&query=Timbers%2C+F">Finbarr Timbers</a>, <a href="/search/cs?searchtype=author&query=Schmid%2C+M">Martin Schmid</a>, <a href="/search/cs?searchtype=author&query=Perolat%2C+J">Julien Perolat</a>, <a href="/search/cs?searchtype=author&query=Morrill%2C+D">Dustin Morrill</a>, <a href="/search/cs?searchtype=author&query=Zambaldi%2C+V">Vinicius Zambaldi</a>, <a href="/search/cs?searchtype=author&query=Lespiau%2C+J">Jean-Baptiste Lespiau</a>, <a href="/search/cs?searchtype=author&query=Schultz%2C+J">John Schultz</a>, <a href="/search/cs?searchtype=author&query=Azar%2C+M+G">Mohammad Gheshlaghi Azar</a>, <a href="/search/cs?searchtype=author&query=Bowling%2C+M">Michael Bowling</a>, <a href="/search/cs?searchtype=author&query=Tuyls%2C+K">Karl Tuyls</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2008.12234v1-abstract-full" style="display: inline;"> Regret minimization has played a key role in online learning, equilibrium computation in games, and reinforcement learning (RL). In this paper, we describe a general model-free RL method for no-regret learning based on repeated reconsideration of past behavior. We propose a model-free RL algorithm, the Advantage Regret-Matching Actor-Critic (ARMAC): rather than saving past state-action data, ARMAC saves a buffer of past policies, replaying through them to reconstruct hindsight assessments of past behavior. These retrospective value estimates are used to predict conditional advantages which, combined with regret matching, produces a new policy.
</li> <li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2007.12509">arXiv:2007.12509</a> <span> [<a href="https://arxiv.org/pdf/2007.12509">pdf</a>, <a href="https://arxiv.org/format/2007.12509">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div>
<p class="title is-5 mathjax"> Monte-Carlo Tree Search as Regularized Policy Optimization </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Grill%2C+J">Jean-Bastien Grill</a>, <a href="/search/cs?searchtype=author&query=Altch%C3%A9%2C+F">Florent Altché</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Y">Yunhao Tang</a>, <a href="/search/cs?searchtype=author&query=Hubert%2C+T">Thomas Hubert</a>, <a href="/search/cs?searchtype=author&query=Valko%2C+M">Michal Valko</a>, <a href="/search/cs?searchtype=author&query=Antonoglou%2C+I">Ioannis Antonoglou</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">Rémi Munos</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2007.12509v1-abstract-full" style="display: inline;"> The combination of Monte-Carlo tree search (MCTS) with deep reinforcement learning has led to significant advances in artificial intelligence. However, AlphaZero, the current state-of-the-art MCTS algorithm, still relies on handcrafted heuristics that are only partially understood. In this paper, we show that AlphaZero's search heuristics, along with other common ones such as UCT, are an approximation to the solution of a specific regularized policy optimization problem. With this insight, we propose a variant of AlphaZero which uses the exact solution to this policy optimization problem, and show experimentally that it reliably outperforms the original algorithm in multiple domains. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 July, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2020. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to International Conference on Machine Learning (ICML), 2020</span> </p>
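<p class="is-size-7">The abstract does not spell out the regularized objective, but a Python sketch of one common KL-regularized policy optimization problem shows what "using the exact solution" instead of count-based search heuristics can look like: maximizing q·pi minus a temperature-weighted KL(pi || prior) has the closed-form solution pi proportional to prior times exp(q / temperature). The objective and KL direction analysed in the paper itself may differ; this is an illustration, not the paper's formula.</p>
<pre><code>
import numpy as np

def kl_regularized_policy(q_values, prior, temperature):
    """Exact maximizer of  q . pi  -  temperature * KL(pi || prior).

    Stands in for the visit-count heuristics of MCTS action selection in
    this sketch; the objective analysed in the paper may differ.
    """
    q = np.asarray(q_values, dtype=np.float64)
    prior = np.asarray(prior, dtype=np.float64)
    logits = np.log(prior) + q / temperature
    logits = logits - logits.max()   # numerical stability
    weights = np.exp(logits)
    return weights / weights.sum()

# Value estimates for three actions and a network prior over them:
print(kl_regularized_policy([1.0, 0.5, 0.2], [0.2, 0.5, 0.3], temperature=1.0))
</code></pre>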
</li> <li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2006.07733">arXiv:2006.07733</a> <span> [<a href="https://arxiv.org/pdf/2006.07733">pdf</a>, <a href="https://arxiv.org/format/2006.07733">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div>
<p class="title is-5 mathjax"> Bootstrap your own latent: A new approach to self-supervised Learning </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Grill%2C+J">Jean-Bastien Grill</a>, <a href="/search/cs?searchtype=author&query=Strub%2C+F">Florian Strub</a>, <a href="/search/cs?searchtype=author&query=Altch%C3%A9%2C+F">Florent Altché</a>, <a href="/search/cs?searchtype=author&query=Tallec%2C+C">Corentin Tallec</a>, <a href="/search/cs?searchtype=author&query=Richemond%2C+P+H">Pierre H. Richemond</a>, <a href="/search/cs?searchtype=author&query=Buchatskaya%2C+E">Elena Buchatskaya</a>, <a href="/search/cs?searchtype=author&query=Doersch%2C+C">Carl Doersch</a>, <a href="/search/cs?searchtype=author&query=Pires%2C+B+A">Bernardo Avila Pires</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Z+D">Zhaohan Daniel Guo</a>, <a href="/search/cs?searchtype=author&query=Azar%2C+M+G">Mohammad Gheshlaghi Azar</a>, <a href="/search/cs?searchtype=author&query=Piot%2C+B">Bilal Piot</a>, <a href="/search/cs?searchtype=author&query=Kavukcuoglu%2C+K">Koray Kavukcuoglu</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">Rémi Munos</a>, <a href="/search/cs?searchtype=author&query=Valko%2C+M">Michal Valko</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2006.07733v3-abstract-full" style="display: inline;"> We introduce Bootstrap Your Own Latent (BYOL), a new approach to self-supervised image representation learning. BYOL relies on two neural networks, referred to as online and target networks, that interact and learn from each other. From an augmented view of an image, we train the online network to predict the target network representation of the same image under a different augmented view. At the same time, we update the target network with a slow-moving average of the online network. While state-of-the-art methods rely on negative pairs, BYOL achieves a new state of the art without them. BYOL reaches $74.3\%$ top-1 classification accuracy on ImageNet using a linear evaluation with a ResNet-50 architecture and $79.6\%$ with a larger ResNet. We show that BYOL performs on par or better than the current state of the art on both transfer and semi-supervised benchmarks. Our implementation and pretrained models are given on GitHub. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 September, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 June, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2020. </p>
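<p class="is-size-7">The two mechanisms named in this abstract, the slow-moving-average target network and the prediction of one view's target representation from the other view's online representation, can be sketched in a few lines of Python. The function names and the use of flat numpy arrays are illustrative assumptions; BYOL itself uses ResNet encoders with projection and prediction heads and a decay schedule.</p>
<pre><code>
import numpy as np

def ema_update(target_params, online_params, decay=0.99):
    """Slow-moving average update of the target network parameters."""
    return [decay * t + (1.0 - decay) * o
            for t, o in zip(target_params, online_params)]

def byol_prediction_loss(online_prediction, target_projection):
    """Squared distance between L2-normalized vectors, i.e. 2 - 2*cosine.

    online_prediction: online-network output (plus predictor) for one
    augmented view; target_projection: target-network output for the
    other augmented view.
    """
    p = online_prediction / np.linalg.norm(online_prediction)
    z = target_projection / np.linalg.norm(target_projection)
    return float(np.sum((p - z) ** 2))
</code></pre>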
</li> <li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2005.01642">arXiv:2005.01642</a> <span> [<a href="https://arxiv.org/pdf/2005.01642">pdf</a>, <a href="https://arxiv.org/format/2005.01642">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1038/s41467-020-19244-4">10.1038/s41467-020-19244-4 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div>
<p class="title is-5 mathjax"> Navigating the Landscape of Multiplayer Games </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Omidshafiei%2C+S">Shayegan Omidshafiei</a>, <a href="/search/cs?searchtype=author&query=Tuyls%2C+K">Karl Tuyls</a>, <a href="/search/cs?searchtype=author&query=Czarnecki%2C+W+M">Wojciech M. Czarnecki</a>, <a href="/search/cs?searchtype=author&query=Santos%2C+F+C">Francisco C. Santos</a>, <a href="/search/cs?searchtype=author&query=Rowland%2C+M">Mark Rowland</a>, <a href="/search/cs?searchtype=author&query=Connor%2C+J">Jerome Connor</a>, <a href="/search/cs?searchtype=author&query=Hennes%2C+D">Daniel Hennes</a>, <a href="/search/cs?searchtype=author&query=Muller%2C+P">Paul Muller</a>, <a href="/search/cs?searchtype=author&query=Perolat%2C+J">Julien Perolat</a>, <a href="/search/cs?searchtype=author&query=De+Vylder%2C+B">Bart De Vylder</a>, <a href="/search/cs?searchtype=author&query=Gruslys%2C+A">Audrunas Gruslys</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">Remi Munos</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2005.01642v3-abstract-full" style="display: inline;"> Multiplayer games have long been used as testbeds in artificial intelligence research, aptly referred to as the Drosophila of artificial intelligence. Traditionally, researchers have focused on using well-known games to build strong agents. This progress, however, can be better informed by characterizing games and their topological landscape. Tackling this latter question can facilitate understanding of agents and help determine what game an agent should target next as part of its training. Here, we show how network measures applied to response graphs of large-scale games enable the creation of a landscape of games, quantifying relationships between games of varying sizes and characteristics. We illustrate our findings in domains ranging from canonical games to complex empirical games capturing the performance of trained agents pitted against one another. Our results culminate in a demonstration leveraging this information to generate new and interesting games, including mixtures of empirical games synthesized from real world games. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 May, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2020. </p>
</li>
<li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2004.14646">arXiv:2004.14646</a> <span> [<a href="https://arxiv.org/pdf/2004.14646">pdf</a>, <a href="https://arxiv.org/format/2004.14646">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div>
<p class="title is-5 mathjax"> Bootstrap Latent-Predictive Representations for Multitask Reinforcement Learning </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+D">Daniel Guo</a>, <a href="/search/cs?searchtype=author&query=Pires%2C+B+A">Bernardo Avila Pires</a>, <a href="/search/cs?searchtype=author&query=Piot%2C+B">Bilal Piot</a>, <a href="/search/cs?searchtype=author&query=Grill%2C+J">Jean-Bastien Grill</a>, <a href="/search/cs?searchtype=author&query=Altch%C3%A9%2C+F">Florent Altché</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">Rémi Munos</a>, <a href="/search/cs?searchtype=author&query=Azar%2C+M+G">Mohammad Gheshlaghi Azar</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2004.14646v1-abstract-full" style="display: inline;"> Learning a good representation is an essential component for deep reinforcement learning (RL). Representation learning is especially important in multitask and partially observable settings where building a representation of the unknown environment is crucial to solve the tasks. Here we introduce Prediction of Bootstrap Latents (PBL), a simple and flexible self-supervised representation learning algorithm for multitask deep RL. PBL builds on multistep predictive representations of future observations, and focuses on capturing structured information about environment dynamics. Specifically, PBL trains its representation by predicting latent embeddings of future observations. These latent embeddings are themselves trained to be predictive of the aforementioned representations. These predictions form a bootstrapping effect, allowing the agent to learn more about the key aspects of the environment dynamics. In addition, by defining prediction tasks completely in latent space, PBL provides the flexibility of using multimodal observations involving pixel images, language instructions, rewards and more. We show in our experiments that PBL delivers across-the-board improved performance over state-of-the-art deep RL agents in the DMLab-30 and Atari-57 multitask setting. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 April, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2020. </p>
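<p class="is-size-7">The pair of prediction objectives described in this abstract can be sketched as two coupled losses in Python: the agent's representation predicts a latent embedding of a future observation, and that embedding is in turn trained to predict the representation. The predictor functions, the squared-error losses, and the stop-gradient placement below are assumptions made only for illustration; they are not taken from the paper.</p>
<pre><code>
import numpy as np

def stop_gradient(x):
    # Placeholder: in a deep-learning framework this would block gradients.
    return np.array(x, copy=True)

def pbl_losses(state_repr, future_obs_embedding, forward_predictor, reverse_predictor):
    """Two coupled prediction losses in the spirit of the abstract.

    forward: the representation predicts the latent embedding of a future
             observation; reverse: the embedding predicts the representation.
    """
    forward_loss = np.sum(
        (forward_predictor(state_repr) - stop_gradient(future_obs_embedding)) ** 2)
    reverse_loss = np.sum(
        (reverse_predictor(future_obs_embedding) - stop_gradient(state_repr)) ** 2)
    return forward_loss, reverse_loss
</code></pre>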
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2004.14646v1-abstract-full').style.display = 'none'; document.getElementById('2004.14646v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 April, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2020. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2003.14089">arXiv:2003.14089</a> <span> [<a href="https://arxiv.org/pdf/2003.14089">pdf</a>, <a href="https://arxiv.org/format/2003.14089">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Leverage the Average: an Analysis of KL Regularization in RL </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Vieillard%2C+N">Nino Vieillard</a>, <a href="/search/cs?searchtype=author&query=Kozuno%2C+T">Tadashi Kozuno</a>, <a href="/search/cs?searchtype=author&query=Scherrer%2C+B">Bruno Scherrer</a>, <a href="/search/cs?searchtype=author&query=Pietquin%2C+O">Olivier Pietquin</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">R茅mi Munos</a>, <a href="/search/cs?searchtype=author&query=Geist%2C+M">Matthieu Geist</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2003.14089v5-abstract-short" style="display: inline;"> Recent Reinforcement Learning (RL) algorithms making use of Kullback-Leibler (KL) regularization as a core component have shown outstanding performance. Yet, only little is understood theoretically about why KL regularization helps, so far. We study KL regularization within an approximate value iteration scheme and show that it implicitly averages q-values. Leveraging this insight, we provide a ve… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2003.14089v5-abstract-full').style.display = 'inline'; document.getElementById('2003.14089v5-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2003.14089v5-abstract-full" style="display: none;"> Recent Reinforcement Learning (RL) algorithms making use of Kullback-Leibler (KL) regularization as a core component have shown outstanding performance. Yet, only little is understood theoretically about why KL regularization helps, so far. We study KL regularization within an approximate value iteration scheme and show that it implicitly averages q-values. Leveraging this insight, we provide a very strong performance bound, the very first to combine two desirable aspects: a linear dependency to the horizon (instead of quadratic) and an error propagation term involving an averaging effect of the estimation errors (instead of an accumulation effect). We also study the more general case of an additional entropy regularizer. The resulting abstract scheme encompasses many existing RL algorithms. Some of our assumptions do not hold with neural networks, so we complement this theoretical analysis with an extensive empirical study. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2003.14089v5-abstract-full').style.display = 'none'; document.getElementById('2003.14089v5-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 January, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 March, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2020</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2003.06259">arXiv:2003.06259</a> <span> [<a href="https://arxiv.org/pdf/2003.06259">pdf</a>, <a href="https://arxiv.org/format/2003.06259">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Taylor Expansion Policy Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tang%2C+Y">Yunhao Tang</a>, <a href="/search/cs?searchtype=author&query=Valko%2C+M">Michal Valko</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">R茅mi Munos</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2003.06259v1-abstract-short" style="display: inline;"> In this work, we investigate the application of Taylor expansions in reinforcement learning. In particular, we propose Taylor expansion policy optimization, a policy optimization formalism that generalizes prior work (e.g., TRPO) as a first-order special case. We also show that Taylor expansions intimately relate to off-policy evaluation. Finally, we show that this new formulation entails modifica… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2003.06259v1-abstract-full').style.display = 'inline'; document.getElementById('2003.06259v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2003.06259v1-abstract-full" style="display: none;"> In this work, we investigate the application of Taylor expansions in reinforcement learning. In particular, we propose Taylor expansion policy optimization, a policy optimization formalism that generalizes prior work (e.g., TRPO) as a first-order special case. We also show that Taylor expansions intimately relate to off-policy evaluation. Finally, we show that this new formulation entails modifications which improve the performance of several state-of-the-art distributed algorithms. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2003.06259v1-abstract-full').style.display = 'none'; document.getElementById('2003.06259v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 March, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2020. 
<li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2002.08456">arXiv:2002.08456</a> <span> [<a href="https://arxiv.org/pdf/2002.08456">pdf</a>, <a href="https://arxiv.org/format/2002.08456">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div>
<p class="title is-5 mathjax"> From Poincaré Recurrence to Convergence in Imperfect Information Games: Finding Equilibrium via Regularization </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Perolat%2C+J">Julien Perolat</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">Remi Munos</a>, <a href="/search/cs?searchtype=author&query=Lespiau%2C+J">Jean-Baptiste Lespiau</a>, <a href="/search/cs?searchtype=author&query=Omidshafiei%2C+S">Shayegan Omidshafiei</a>, <a href="/search/cs?searchtype=author&query=Rowland%2C+M">Mark Rowland</a>, <a href="/search/cs?searchtype=author&query=Ortega%2C+P">Pedro Ortega</a>, <a href="/search/cs?searchtype=author&query=Burch%2C+N">Neil Burch</a>, <a href="/search/cs?searchtype=author&query=Anthony%2C+T">Thomas Anthony</a>, <a href="/search/cs?searchtype=author&query=Balduzzi%2C+D">David Balduzzi</a>, <a href="/search/cs?searchtype=author&query=De+Vylder%2C+B">Bart De Vylder</a>, <a href="/search/cs?searchtype=author&query=Piliouras%2C+G">Georgios Piliouras</a>, <a href="/search/cs?searchtype=author&query=Lanctot%2C+M">Marc Lanctot</a>, <a href="/search/cs?searchtype=author&query=Tuyls%2C+K">Karl Tuyls</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2002.08456v1-abstract-full" style="display: inline;"> In this paper we investigate the Follow the Regularized Leader dynamics in sequential imperfect information games (IIG). We generalize existing results of Poincaré recurrence from normal-form games to zero-sum two-player imperfect information games and other sequential game settings. We then investigate how adapting the reward (by adding a regularization term) of the game can give strong convergence guarantees in monotone games. We continue by showing how this reward adaptation technique can be leveraged to build algorithms that converge exactly to the Nash equilibrium. Finally, we show how these insights can be directly used to build state-of-the-art model-free algorithms for zero-sum two-player Imperfect Information Games (IIG). </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 February, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2020. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">43 pages</span> </p>
</li>
<li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1912.02503">arXiv:1912.02503</a> <span> [<a href="https://arxiv.org/pdf/1912.02503">pdf</a>, <a href="https://arxiv.org/format/1912.02503">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div>
<p class="title is-5 mathjax"> Hindsight Credit Assignment </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Harutyunyan%2C+A">Anna Harutyunyan</a>, <a href="/search/cs?searchtype=author&query=Dabney%2C+W">Will Dabney</a>, <a href="/search/cs?searchtype=author&query=Mesnard%2C+T">Thomas Mesnard</a>, <a href="/search/cs?searchtype=author&query=Azar%2C+M">Mohammad Azar</a>, <a href="/search/cs?searchtype=author&query=Piot%2C+B">Bilal Piot</a>, <a href="/search/cs?searchtype=author&query=Heess%2C+N">Nicolas Heess</a>, <a href="/search/cs?searchtype=author&query=van+Hasselt%2C+H">Hado van Hasselt</a>, <a href="/search/cs?searchtype=author&query=Wayne%2C+G">Greg Wayne</a>, <a href="/search/cs?searchtype=author&query=Singh%2C+S">Satinder Singh</a>, <a href="/search/cs?searchtype=author&query=Precup%2C+D">Doina Precup</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">Remi Munos</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="1912.02503v1-abstract-full" style="display: inline;"> We consider the problem of efficient credit assignment in reinforcement learning. In order to efficiently and meaningfully utilize new data, we propose to explicitly assign credit to past decisions based on the likelihood of them having led to the observed outcome. This approach uses new information in hindsight, rather than employing foresight. Somewhat surprisingly, we show that value functions can be rewritten through this lens, yielding a new family of algorithms. We study the properties of these algorithms, and empirically show that they successfully address important credit assignment challenges, through a set of illustrative tasks. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 December, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2019. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2019</span> </p>
</li>
<li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1910.07479">arXiv:1910.07479</a> <span> [<a href="https://arxiv.org/pdf/1910.07479">pdf</a>, <a href="https://arxiv.org/format/1910.07479">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div>
<p class="title is-5 mathjax"> Conditional Importance Sampling for Off-Policy Learning </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Rowland%2C+M">Mark Rowland</a>, <a href="/search/cs?searchtype=author&query=Harutyunyan%2C+A">Anna Harutyunyan</a>, <a href="/search/cs?searchtype=author&query=van+Hasselt%2C+H">Hado van Hasselt</a>, <a href="/search/cs?searchtype=author&query=Borsa%2C+D">Diana Borsa</a>, <a href="/search/cs?searchtype=author&query=Schaul%2C+T">Tom Schaul</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">Rémi Munos</a>, <a href="/search/cs?searchtype=author&query=Dabney%2C+W">Will Dabney</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="1910.07479v2-abstract-full" style="display: inline;"> The principal contribution of this paper is a conceptual framework for off-policy reinforcement learning, based on conditional expectations of importance sampling ratios. This framework yields new perspectives and understanding of existing off-policy algorithms, and reveals a broad space of unexplored algorithms. We theoretically analyse this space, and concretely investigate several algorithms that arise from this framework. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 July, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 October, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2019. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">AISTATS 2020 camera-ready version</span> </p>
</li>
<li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1910.07478">arXiv:1910.07478</a> <span> [<a href="https://arxiv.org/pdf/1910.07478">pdf</a>, <a href="https://arxiv.org/format/1910.07478">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div>
<p class="title is-5 mathjax"> Adaptive Trade-Offs in Off-Policy Learning </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Rowland%2C+M">Mark Rowland</a>, <a href="/search/cs?searchtype=author&query=Dabney%2C+W">Will Dabney</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">Rémi Munos</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="1910.07478v2-abstract-full" style="display: inline;"> A great variety of off-policy learning algorithms exist in the literature, and new breakthroughs in this area continue to be made, improving theoretical understanding and yielding state-of-the-art reinforcement learning algorithms. In this paper, we take a unifying view of this space of algorithms, and consider their trade-offs of three fundamental quantities: update variance, fixed-point bias, and contraction rate. This leads to new perspectives of existing methods, and also naturally yields novel algorithms for off-policy evaluation and control. We develop one such algorithm, C-trace, demonstrating that it is able to more efficiently make these trade-offs than existing methods in use, and that it can be scaled to yield state-of-the-art performance in large-scale environments. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 July, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 October, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2019. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">AISTATS 2020 camera-ready version</span> </p>
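<p class="is-size-7">The three quantities named in this abstract (update variance, fixed-point bias, contraction rate) are commonly traded off by truncating per-step importance ratios. The Python sketch below shows the generic Retrace-style truncation as an illustration only; the abstract does not specify C-trace's actual coefficients, and the function name and defaults here are assumptions.</p>
<pre><code>
import numpy as np

def truncated_trace_coefficients(target_probs, behaviour_probs, clip=1.0, lam=1.0):
    """Per-step trace coefficients  c_t = lam * min(clip, pi(a_t|s_t)/mu(a_t|s_t)).

    Retrace-style truncation, shown only to illustrate the trade-off named
    in the abstract: truncating the ratios lowers update variance at the
    cost of slower contraction and, depending on where the clipping enters
    the update, possibly a biased fixed point.
    """
    ratios = (np.asarray(target_probs, dtype=np.float64)
              / np.asarray(behaviour_probs, dtype=np.float64))
    return lam * np.minimum(clip, ratios)

# The target policy likes the first behaviour action (ratio 2.0) and
# dislikes the second (ratio 0.25):
print(truncated_trace_coefficients([0.8, 0.1], [0.4, 0.4]))  # [1.0, 0.25]
</code></pre>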
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1910.07478v2-abstract-full').style.display = 'none'; document.getElementById('1910.07478v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 July, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 October, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">AISTATS 2020 camera-ready version</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Munos%2C+R&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Munos%2C+R&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Munos%2C+R&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Munos%2C+R&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a 
href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>