
Search | arXiv e-print repository

Showing 1–39 of 39 results for author: Piot, B

Searching in archive cs. Results are sorted by announcement date (newest first), 50 per page.
1. arXiv:2410.04166  [pdf, other]  cs.LG; stat.ML
   Preference Optimization as Probabilistic Inference
   Authors: Abbas Abdolmaleki, Bilal Piot, Bobak Shahriari, Jost Tobias Springenberg, Tim Hertweck, Rishabh Joshi, Junhyuk Oh, Michael Bloesch, Thomas Lampe, Nicolas Heess, Jonas Buchli, Martin Riedmiller
   Abstract: Existing preference optimization methods are mainly designed for directly learning from human feedback with the assumption that paired examples (preferred vs. dis-preferred) are available. In contrast, we propose a method that can leverage unpaired preferred or dis-preferred examples, and works even when only one type of feedback (positive or negative) is available. This flexibility allows us to apply it in scenarios with varying forms of feedback and models, including training generative language models based on human feedback as well as training policies for sequential decision-making problems, where learned (value) functions are available. Our approach builds upon the probabilistic framework introduced in (Dayan and Hinton, 1997), which proposes to use expectation-maximization (EM) to directly optimize the probability of preferred outcomes (as opposed to classic expected reward maximization). To obtain a practical algorithm, we identify and address a key limitation in current EM-based methods: when applied to preference optimization, they solely maximize the likelihood of preferred examples, while neglecting dis-preferred samples. We show how one can extend EM algorithms to explicitly incorporate dis-preferred outcomes, leading to a novel, theoretically grounded, preference optimization algorithm that offers an intuitive and versatile way to learn from both positive and negative feedback.
   Submitted 5 October, 2024; originally announced October 2024.
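
Note: the abstract describes the algorithm only at a high level. As a rough, hypothetical illustration of learning from unpaired positive and negative examples (not the paper's EM-based method), the sketch below increases the likelihood of preferred samples and decreases that of dis-preferred ones; the weights and the shape of the inputs are illustrative assumptions.

```python
import torch

def unpaired_preference_loss(logp, labels, w_pos=1.0, w_neg=1.0):
    """Toy loss for unpaired feedback: `logp` holds the policy's log-probabilities
    of sampled responses, `labels` is +1 for preferred and -1 for dis-preferred
    examples. This is only a schematic stand-in for the EM-based algorithm."""
    pos, neg = labels > 0, labels < 0
    loss = torch.zeros(())
    if pos.any():
        loss = loss - w_pos * logp[pos].mean()  # raise likelihood of preferred samples
    if neg.any():
        loss = loss + w_neg * logp[neg].mean()  # lower likelihood of dis-preferred samples
    return loss

# Example: log-probs of four sampled responses, two positive and two negative.
logp = torch.tensor([-2.3, -1.7, -0.9, -3.1], requires_grad=True)
labels = torch.tensor([1, -1, 1, -1])
unpaired_preference_loss(logp, labels).backward()
```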
href="/search/cs?searchtype=author&amp;query=Kumar%2C+A">Aviral Kumar</a>, <a href="/search/cs?searchtype=author&amp;query=Saleh%2C+M">Mohammad Saleh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.13156v1-abstract-short" style="display: inline;"> Reward models (RMs) play a pivotal role in aligning large language models (LLMs) with human preferences. However, traditional RM training, which relies on response pairs tied to specific prompts, struggles to disentangle prompt-driven preferences from prompt-independent artifacts, such as response length and format. In this work, we expose a fundamental limitation of current RM training methods, w&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13156v1-abstract-full').style.display = 'inline'; document.getElementById('2409.13156v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.13156v1-abstract-full" style="display: none;"> Reward models (RMs) play a pivotal role in aligning large language models (LLMs) with human preferences. However, traditional RM training, which relies on response pairs tied to specific prompts, struggles to disentangle prompt-driven preferences from prompt-independent artifacts, such as response length and format. In this work, we expose a fundamental limitation of current RM training methods, where RMs fail to effectively distinguish between contextual signals and irrelevant artifacts when determining preferences. To address this, we introduce a causal framework that learns preferences independent of these artifacts and propose a novel data augmentation technique designed to eliminate them. Extensive experiments show that our approach successfully filters out undesirable artifacts, yielding a more robust reward model (RRM). Our RRM improves the performance of a pairwise reward model trained on Gemma-2-9b-it, on RewardBench, increasing accuracy from 80.61% to 84.15%. Additionally, we train two DPO policies using both the RM and RRM, demonstrating that the RRM significantly enhances DPO-aligned policies, improving MT-Bench scores from 7.27 to 8.31 and length-controlled win-rates in AlpacaEval-2 from 33.46% to 52.49%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13156v1-abstract-full').style.display = 'none'; document.getElementById('2409.13156v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 

3. arXiv:2409.02392  [pdf, other]  cs.LG; stat.ML
   Building Math Agents with Multi-Turn Iterative Preference Learning
   Authors: Wei Xiong, Chengshuai Shi, Jiaming Shen, Aviv Rosenberg, Zhen Qin, Daniele Calandriello, Misha Khalman, Rishabh Joshi, Bilal Piot, Mohammad Saleh, Chi Jin, Tong Zhang, Tianqi Liu
   Abstract: Recent studies have shown that large language models' (LLMs) mathematical problem-solving capabilities can be enhanced by integrating external tools, such as code interpreters, and employing multi-turn Chain-of-Thought (CoT) reasoning. While current methods focus on synthetic data generation and Supervised Fine-Tuning (SFT), this paper studies the complementary direct preference learning approach to further improve model performance. However, existing direct preference learning algorithms are originally designed for the single-turn chat task, and do not fully address the complexities of multi-turn reasoning and external tool integration required for tool-integrated mathematical reasoning tasks. To fill in this gap, we introduce a multi-turn direct preference learning framework, tailored for this context, that leverages feedback from code interpreters and optimizes trajectory-level preferences. This framework includes multi-turn DPO and multi-turn KTO as specific implementations. The effectiveness of our framework is validated through training of various language models using an augmented prompt set from the GSM8K and MATH datasets. Our results demonstrate substantial improvements: a supervised fine-tuned Gemma-1.1-it-7B model's performance increased from 77.5% to 83.9% on GSM8K and from 46.1% to 51.2% on MATH. Similarly, a Gemma-2-it-9B model improved from 84.1% to 86.3% on GSM8K and from 51.0% to 54.5% on MATH.
   Submitted 3 September, 2024; originally announced September 2024.
   Comments: A multi-turn direct preference learning framework for tool-integrated reasoning tasks
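
A hedged sketch of what "trajectory-level" direct preference learning can look like: the usual DPO log-ratio is summed over the model-generated turns of each trajectory before passing through the logistic loss, with external observations (e.g. interpreter outputs) masked out. The tensor layout and masking convention below are assumptions for illustration, not the paper's exact implementation.

```python
import torch
import torch.nn.functional as F

def multi_turn_dpo_loss(logp_w, logp_l, ref_logp_w, ref_logp_l, mask_w, mask_l, beta=0.1):
    """Sketch of a trajectory-level DPO loss.

    logp_*     : (batch, turns) policy log-probs of each turn of the preferred (w)
                 and dis-preferred (l) trajectories.
    ref_logp_* : the same quantities under the frozen reference policy.
    mask_*     : 1.0 for model-generated turns, 0.0 for external tool outputs,
                 which should not be optimised."""
    h_w = ((logp_w - ref_logp_w) * mask_w).sum(dim=-1)  # summed log-ratio, preferred trajectory
    h_l = ((logp_l - ref_logp_l) * mask_l).sum(dim=-1)  # summed log-ratio, dis-preferred trajectory
    return -F.logsigmoid(beta * (h_w - h_l)).mean()
```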

4. arXiv:2408.00118  [pdf, other]  cs.CL; cs.AI
   Gemma 2: Improving Open Language Models at a Practical Size
   Authors: Gemma Team, Morgane Riviere, Shreya Pathak, Pier Giuseppe Sessa, Cassidy Hardin, Surya Bhupatiraju, Léonard Hussenot, Thomas Mesnard, Bobak Shahriari, Alexandre Ramé, Johan Ferret, Peter Liu, Pouya Tafti, Abe Friesen, Michelle Casbon, Sabela Ramos, Ravin Kumar, Charline Le Lan, Sammy Jerome, Anton Tsitsulin, Nino Vieillard, Piotr Stanczyk, Sertan Girgin, Nikola Momchev, Matt Hoffman, et al. (173 additional authors not shown)
   Abstract: In this work, we introduce Gemma 2, a new addition to the Gemma family of lightweight, state-of-the-art open models, ranging in scale from 2 billion to 27 billion parameters. In this new version, we apply several known technical modifications to the Transformer architecture, such as interleaving local-global attentions (Beltagy et al., 2020a) and group-query attention (Ainslie et al., 2023). We also train the 2B and 9B models with knowledge distillation (Hinton et al., 2015) instead of next token prediction. The resulting models deliver the best performance for their size, and even offer competitive alternatives to models that are 2-3 times bigger. We release all our models to the community.
   Submitted 2 October, 2024; v1 submitted 31 July, 2024; originally announced August 2024.
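
The abstract mentions training the smaller models with knowledge distillation rather than next-token prediction. A minimal sketch of token-level distillation, assuming access to teacher logits: the student minimises the KL divergence from the teacher's next-token distribution instead of the cross-entropy against the observed token. The temperature and any mixing with the standard loss are illustrative choices, not details from the report.

```python
import torch
import torch.nn.functional as F

def distillation_loss(student_logits, teacher_logits, temperature=1.0):
    """Token-level knowledge distillation: KL(teacher || student) over the
    vocabulary, averaged over all (batch, position) entries.
    Both logit tensors have shape (batch, seq_len, vocab_size)."""
    t = temperature
    log_p_student = F.log_softmax(student_logits / t, dim=-1)
    p_teacher = F.softmax(teacher_logits / t, dim=-1)
    kl = (p_teacher * (p_teacher.clamp_min(1e-9).log() - log_p_student)).sum(dim=-1)
    return kl.mean() * (t ** 2)

# Example with random logits for a 2-sequence, 4-token, 8-word toy vocabulary.
student = torch.randn(2, 4, 8, requires_grad=True)
teacher = torch.randn(2, 4, 8)
distillation_loss(student, teacher).backward()
```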

5. arXiv:2405.19107  [pdf, ps, other]  cs.LG; cs.AI
   Offline Regularised Reinforcement Learning for Large Language Models Alignment
   Authors: Pierre Harvey Richemond, Yunhao Tang, Daniel Guo, Daniele Calandriello, Mohammad Gheshlaghi Azar, Rafael Rafailov, Bernardo Avila Pires, Eugene Tarassov, Lucas Spangher, Will Ellsworth, Aliaksei Severyn, Jonathan Mallinson, Lior Shani, Gil Shamir, Rishabh Joshi, Tianqi Liu, Remi Munos, Bilal Piot
   Abstract: The dominant framework for alignment of large language models (LLM), whether through reinforcement learning from human feedback or direct preference optimisation, is to learn from preference data. This involves building datasets where each element is a quadruplet composed of a prompt, two independent responses (completions of the prompt) and a human preference between the two independent responses, yielding a preferred and a dis-preferred response. Such data is typically scarce and expensive to collect. On the other hand, single-trajectory datasets where each element is a triplet composed of a prompt, a response and a human feedback is naturally more abundant. The canonical element of such datasets is for instance an LLM's response to a user's prompt followed by a user's feedback such as a thumbs-up/down. Consequently, in this work, we propose DRO, or Direct Reward Optimisation, as a framework and associated algorithms that do not require pairwise preferences. DRO uses a simple mean-squared objective that can be implemented in various ways. We validate our findings empirically, using T5 encoder-decoder language models, and show DRO's performance over selected baselines such as Kahneman-Tversky Optimization (KTO). Thus, we confirm that DRO is a simple and empirically compelling method for single-trajectory policy optimisation.
   Submitted 29 May, 2024; originally announced May 2024.
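
The abstract only says that DRO "uses a simple mean-squared objective". The sketch below shows one plausible form such an objective could take on single-trajectory data: a squared residual tying the scalar feedback to the KL-regularised log-ratio of policy and reference, with a learned value baseline. The exact objective and parameterisation in the paper may differ; `value` and `beta` are illustrative names.

```python
import torch

def dro_style_loss(reward, logp_policy, logp_ref, value, beta=1.0):
    """One plausible mean-squared single-trajectory objective: penalise the
    squared residual of the KL-regularised optimality condition
        reward - V(x) ~= beta * log(pi(y|x) / pi_ref(y|x)).
    `reward` is the scalar feedback (e.g. thumbs-up = 1, thumbs-down = 0),
    `value` a learned baseline V(x); all tensors have shape (batch,)."""
    residual = reward - value - beta * (logp_policy - logp_ref)
    return 0.5 * (residual ** 2).mean()
```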

6. arXiv:2405.14655  [pdf, other]  cs.LG
   Multi-turn Reinforcement Learning from Preference Human Feedback
   Authors: Lior Shani, Aviv Rosenberg, Asaf Cassel, Oran Lang, Daniele Calandriello, Avital Zipori, Hila Noga, Orgad Keller, Bilal Piot, Idan Szpektor, Avinatan Hassidim, Yossi Matias, Rémi Munos
   Abstract: Reinforcement Learning from Human Feedback (RLHF) has become the standard approach for aligning Large Language Models (LLMs) with human preferences, allowing LLMs to demonstrate remarkable abilities in various tasks. Existing methods work by emulating the preferences at the single decision (turn) level, limiting their capabilities in settings that require planning or multi-turn interactions to achieve a long-term goal. In this paper, we address this issue by developing novel methods for Reinforcement Learning (RL) from preference feedback between two full multi-turn conversations. In the tabular setting, we present a novel mirror-descent-based policy optimization algorithm for the general multi-turn preference-based RL problem, and prove its convergence to Nash equilibrium. To evaluate performance, we create a new environment, Education Dialogue, where a teacher agent guides a student in learning a random topic, and show that a deep RL variant of our algorithm outperforms RLHF baselines. Finally, we show that in an environment with explicit rewards, our algorithm recovers the same performance as a reward-based RL baseline, despite relying solely on a weaker preference signal.
   Submitted 23 May, 2024; originally announced May 2024.

7. arXiv:2403.08635  [pdf, other]  cs.LG; cs.AI; stat.ML
   Human Alignment of Large Language Models through Online Preference Optimisation
   Authors: Daniele Calandriello, Daniel Guo, Remi Munos, Mark Rowland, Yunhao Tang, Bernardo Avila Pires, Pierre Harvey Richemond, Charline Le Lan, Michal Valko, Tianqi Liu, Rishabh Joshi, Zeyu Zheng, Bilal Piot
   Abstract: Ensuring alignment of language models' outputs with human preferences is critical to guarantee a useful, safe, and pleasant user experience. Thus, human alignment has been extensively studied recently and several methods such as Reinforcement Learning from Human Feedback (RLHF), Direct Policy Optimisation (DPO) and Sequence Likelihood Calibration (SLiC) have emerged. In this paper, our contribution is two-fold. First, we show the equivalence between two recent alignment methods, namely Identity Policy Optimisation (IPO) and Nash Mirror Descent (Nash-MD). Second, we introduce a generalisation of IPO, named IPO-MD, that leverages the regularised sampling approach proposed by Nash-MD. This equivalence may seem surprising at first sight, since IPO is an offline method whereas Nash-MD is an online method using a preference model. However, this equivalence can be proven when we consider the online version of IPO, that is when both generations are sampled by the online policy and annotated by a trained preference model. Optimising the IPO loss with such a stream of data becomes then equivalent to finding the Nash equilibrium of the preference model through self-play. Building on this equivalence, we introduce the IPO-MD algorithm that generates data with a mixture policy (between the online and reference policy) similarly as the general Nash-MD algorithm. We compare online-IPO and IPO-MD to different online versions of existing losses on preference data such as DPO and SLiC on a summarisation task.
   Submitted 13 March, 2024; originally announced March 2024.
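
For reference, the IPO loss that the paper takes online is a squared regression of the log-likelihood-ratio gap towards a target set by the regularisation strength τ; in the online variants discussed above, both responses are sampled from the current (or mixture) policy and the winner is chosen by a trained preference model. The snippet shows only the per-pair loss, with the online data collection left abstract.

```python
import torch

def ipo_loss(logp_w, logp_l, ref_logp_w, ref_logp_l, tau=0.1):
    """IPO loss: regress the log-likelihood-ratio gap between the preferred (w)
    and dis-preferred (l) responses towards 1 / (2 * tau)."""
    h = (logp_w - ref_logp_w) - (logp_l - ref_logp_l)
    return ((h - 1.0 / (2.0 * tau)) ** 2).mean()
```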

8. arXiv:2402.05749  [pdf, other]  cs.LG; cs.AI
   Generalized Preference Optimization: A Unified Approach to Offline Alignment
   Authors: Yunhao Tang, Zhaohan Daniel Guo, Zeyu Zheng, Daniele Calandriello, Rémi Munos, Mark Rowland, Pierre Harvey Richemond, Michal Valko, Bernardo Ávila Pires, Bilal Piot
   Abstract: Offline preference optimization allows fine-tuning large models directly from offline data, and has proved effective in recent alignment practices. We propose generalized preference optimization (GPO), a family of offline losses parameterized by a general class of convex functions. GPO enables a unified view over preference optimization, encompassing existing algorithms such as DPO, IPO and SLiC as special cases, while naturally introducing new variants. The GPO framework also sheds light on how offline algorithms enforce regularization, through the design of the convex function that defines the loss. Our analysis and experiments reveal the connections and subtle differences between the offline regularization and the KL divergence regularization intended by the canonical RLHF formulation. In a controlled setting akin to Gao et al 2023, we also show that different GPO variants achieve similar trade-offs between regularization and performance, though the optimal values of hyper-parameter might differ as predicted by theory. In all, our results present new algorithmic toolkits and empirical insights to alignment practitioners.
   Submitted 28 May, 2024; v1 submitted 8 February, 2024; originally announced February 2024.
   Comments: Accepted at ICML 2024 main conference

9. arXiv:2402.04792  [pdf, other]  cs.AI; cs.CL; cs.HC
   Direct Language Model Alignment from Online AI Feedback
   Authors: Shangmin Guo, Biao Zhang, Tianlin Liu, Tianqi Liu, Misha Khalman, Felipe Llinares, Alexandre Rame, Thomas Mesnard, Yao Zhao, Bilal Piot, Johan Ferret, Mathieu Blondel
   Abstract: Direct alignment from preferences (DAP) methods, such as DPO, have recently emerged as efficient alternatives to reinforcement learning from human feedback (RLHF), that do not require a separate reward model. However, the preference datasets used in DAP methods are usually collected ahead of training and never updated, thus the feedback is purely offline. Moreover, responses in these datasets are often sampled from a language model distinct from the one being aligned, and since the model evolves over training, the alignment phase is inevitably off-policy. In this study, we posit that online feedback is key and improves DAP methods. Our method, online AI feedback (OAIF), uses an LLM as annotator: on each training iteration, we sample two responses from the current model and prompt the LLM annotator to choose which one is preferred, thus providing online feedback. Despite its simplicity, we demonstrate via human evaluation in several tasks that OAIF outperforms both offline DAP and RLHF methods. We further show that the feedback leveraged in OAIF is easily controllable, via instruction prompts to the LLM annotator.
   Submitted 29 February, 2024; v1 submitted 7 February, 2024; originally announced February 2024.
   Comments: 18 pages, 9 figures, 4 tables
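
The OAIF procedure described above is straightforward to sketch: at each step, sample two responses from the current policy, ask an LLM annotator which it prefers, and apply a DAP loss (DPO in this sketch) to the freshly labelled pair. The `generate`, `logprob` and `annotator_prefers` helpers are placeholders for illustration, not an actual API.

```python
import torch
import torch.nn.functional as F

def oaif_step(policy, ref_policy, annotator_prefers, prompt, optimizer, beta=0.1):
    """One schematic online-AI-feedback iteration: sample two on-policy
    candidates, obtain an online preference, take a DPO gradient step.
    `policy.generate`, `policy.logprob` and `annotator_prefers` are placeholders."""
    y1, y2 = policy.generate(prompt), policy.generate(prompt)           # on-policy samples
    y_w, y_l = (y1, y2) if annotator_prefers(prompt, y1, y2) else (y2, y1)

    h = (policy.logprob(prompt, y_w) - ref_policy.logprob(prompt, y_w)) \
      - (policy.logprob(prompt, y_l) - ref_policy.logprob(prompt, y_l))
    loss = -F.logsigmoid(beta * h)                                      # DPO loss on the online pair

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()
```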
href="/search/cs?searchtype=author&amp;query=Mankowitz%2C+D+J">Daniel J. Mankowitz</a>, <a href="/search/cs?searchtype=author&amp;query=Precup%2C+D">Doina Precup</a>, <a href="/search/cs?searchtype=author&amp;query=Piot%2C+B">Bilal Piot</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.00886v4-abstract-short" style="display: inline;"> Reinforcement learning from human feedback (RLHF) has emerged as the main paradigm for aligning large language models (LLMs) with human preferences. Typically, RLHF involves the initial step of learning a reward model from human feedback, often expressed as preferences between pairs of text generations produced by a pre-trained LLM. Subsequently, the LLM&#39;s policy is fine-tuned by optimizing it to&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.00886v4-abstract-full').style.display = 'inline'; document.getElementById('2312.00886v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.00886v4-abstract-full" style="display: none;"> Reinforcement learning from human feedback (RLHF) has emerged as the main paradigm for aligning large language models (LLMs) with human preferences. Typically, RLHF involves the initial step of learning a reward model from human feedback, often expressed as preferences between pairs of text generations produced by a pre-trained LLM. Subsequently, the LLM&#39;s policy is fine-tuned by optimizing it to maximize the reward model through a reinforcement learning algorithm. However, an inherent limitation of current reward models is their inability to fully represent the richness of human preferences and their dependency on the sampling distribution. In this study, we introduce an alternative pipeline for the fine-tuning of LLMs using pairwise human feedback. Our approach entails the initial learning of a preference model, which is conditioned on two inputs given a prompt, followed by the pursuit of a policy that consistently generates responses preferred over those generated by any competing policy, thus defining the Nash equilibrium of this preference model. We term this approach Nash learning from human feedback (NLHF). In the context of a tabular policy representation, we present a novel algorithmic solution, Nash-MD, founded on the principles of mirror descent. This algorithm produces a sequence of policies, with the last iteration converging to the regularized Nash equilibrium. Additionally, we explore parametric representations of policies and introduce gradient descent algorithms for deep-learning architectures. To demonstrate the effectiveness of our approach, we present experimental results involving the fine-tuning of a LLM for a text summarization task. We believe NLHF offers a compelling avenue for preference learning and policy optimization with the potential of advancing the field of aligning LLMs with human preferences. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.00886v4-abstract-full').style.display = 'none'; document.getElementById('2312.00886v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.12036">arXiv:2310.12036</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2310.12036">pdf</a>, <a href="https://arxiv.org/format/2310.12036">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> A General Theoretical Paradigm to Understand Learning from Human Preferences </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Azar%2C+M+G">Mohammad Gheshlaghi Azar</a>, <a href="/search/cs?searchtype=author&amp;query=Rowland%2C+M">Mark Rowland</a>, <a href="/search/cs?searchtype=author&amp;query=Piot%2C+B">Bilal Piot</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+D">Daniel Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Calandriello%2C+D">Daniele Calandriello</a>, <a href="/search/cs?searchtype=author&amp;query=Valko%2C+M">Michal Valko</a>, <a href="/search/cs?searchtype=author&amp;query=Munos%2C+R">R茅mi Munos</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.12036v2-abstract-short" style="display: inline;"> The prevalent deployment of learning from human preferences through reinforcement learning (RLHF) relies on two important approximations: the first assumes that pairwise preferences can be substituted with pointwise rewards. The second assumes that a reward model trained on these pointwise rewards can generalize from collected data to out-of-distribution data sampled by the policy. Recently, Direc&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.12036v2-abstract-full').style.display = 'inline'; document.getElementById('2310.12036v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.12036v2-abstract-full" style="display: none;"> The prevalent deployment of learning from human preferences through reinforcement learning (RLHF) relies on two important approximations: the first assumes that pairwise preferences can be substituted with pointwise rewards. The second assumes that a reward model trained on these pointwise rewards can generalize from collected data to out-of-distribution data sampled by the policy. Recently, Direct Preference Optimisation (DPO) has been proposed as an approach that bypasses the second approximation and learn directly a policy from collected data without the reward modelling stage. 
arXiv:2305.01521 (https://arxiv.org/abs/2305.01521) [pdf, other] cs.LG stat.ML

Unlocking the Power of Representations in Long-term Novelty-based Exploration

Authors: Alaa Saade, Steven Kapturowski, Daniele Calandriello, Charles Blundell, Pablo Sprechmann, Leopoldo Sarra, Oliver Groth, Michal Valko, Bilal Piot

Abstract: We introduce Robust Exploration via Clustering-based Online Density Estimation (RECODE), a non-parametric method for novelty-based exploration that estimates visitation counts for clusters of states based on their similarity in a chosen embedding space. By adapting classical clustering to the nonstationary setting of Deep RL, RECODE can efficiently track state visitation counts over thousands of episodes. We further propose a novel generalization of the inverse dynamics loss, which leverages masked transformer architectures for multi-step prediction and which, in conjunction with RECODE, achieves a new state-of-the-art in a suite of challenging 3D-exploration tasks in DM-Hard-8. RECODE also sets a new state-of-the-art in hard exploration Atari games, and is the first agent to reach the end screen in "Pitfall!".

Submitted 2 May, 2023; originally announced May 2023.
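A very rough sketch of the idea of clustering-based visitation counts driving a novelty bonus, as the abstract describes it; the thresholded nearest-cluster update and the $1/\sqrt{\text{count}}$ reward are illustrative approximations, not the RECODE algorithm itself.

```python
import numpy as np

def cluster_count_reward(embedding, centers, counts, radius=1.0):
    """Assign a new state embedding to its nearest cluster center if it lies within
    `radius`, otherwise open a new cluster; reward novelty as 1/sqrt(cluster count).
    (Illustrative approximation only; `centers` and `counts` are mutated in place.)"""
    if centers:
        dists = [float(np.linalg.norm(embedding - c)) for c in centers]
        i = int(np.argmin(dists))
        if dists[i] <= radius:
            counts[i] += 1
            return 1.0 / np.sqrt(counts[i])
    centers.append(np.array(embedding, copy=True))
    counts.append(1)
    return 1.0
```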
arXiv:2302.04817 (https://arxiv.org/abs/2302.04817) [pdf, other] cs.LG

The Edge of Orthogonality: A Simple View of What Makes BYOL Tick

Authors: Pierre H. Richemond, Allison Tam, Yunhao Tang, Florian Strub, Bilal Piot, Felix Hill

Abstract: Self-predictive unsupervised learning methods such as BYOL or SimSiam have shown impressive results, and counter-intuitively, do not collapse to trivial representations. In this work, we aim at exploring the simplest possible mathematical arguments towards explaining the underlying mechanisms behind self-predictive unsupervised learning. We start with the observation that those methods crucially rely on the presence of a predictor network (and stop-gradient). With simple linear algebra, we show that when using a linear predictor, the optimal predictor is close to an orthogonal projection, and propose a general framework based on orthonormalization that enables to interpret and give intuition on why BYOL works. In addition, this framework demonstrates the crucial role of the exponential moving average and stop-gradient operator in BYOL as an efficient orthonormalization mechanism. We use these insights to propose four new closed-form predictor variants of BYOL to support our analysis. Our closed-form predictors outperform standard linear trainable predictor BYOL at 100 and 300 epochs (top-1 linear accuracy on ImageNet).

Submitted 9 February, 2023; originally announced February 2023.
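To make the "optimal linear predictor" claim above concrete, here is a generic least-squares stand-in for a BYOL-style linear predictor computed in closed form from a batch of online and target embeddings; it is not one of the paper's four closed-form variants, just an assumed minimal illustration.

```python
import torch

def closed_form_linear_predictor(z_online, z_target, eps=1e-6):
    """Solve min_W ||z_online @ W - z_target||^2 over a batch (rows are embeddings).

    The abstract argues the optimal linear predictor is close to an orthogonal
    projection; this ridge-regularized least-squares solution is only a generic
    stand-in for that object, not the paper's proposed predictors.
    """
    d = z_online.shape[1]
    gram = z_online.T @ z_online + eps * torch.eye(d, dtype=z_online.dtype)
    return torch.linalg.solve(gram, z_online.T @ z_target)
```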
arXiv:2212.03319 (https://arxiv.org/abs/2212.03319) [pdf, other] cs.LG cs.AI

Understanding Self-Predictive Learning for Reinforcement Learning

Authors: Yunhao Tang, Zhaohan Daniel Guo, Pierre Harvey Richemond, Bernardo Ávila Pires, Yash Chandak, Rémi Munos, Mark Rowland, Mohammad Gheshlaghi Azar, Charline Le Lan, Clare Lyle, András György, Shantanu Thakoor, Will Dabney, Bilal Piot, Daniele Calandriello, Michal Valko

Abstract: We study the learning dynamics of self-predictive learning for reinforcement learning, a family of algorithms that learn representations by minimizing the prediction error of their own future latent representations. Despite its recent empirical success, such algorithms have an apparent defect: trivial representations (such as constants) minimize the prediction error, yet it is obviously undesirable to converge to such solutions. Our central insight is that careful designs of the optimization dynamics are critical to learning meaningful representations. We identify that a faster-paced optimization of the predictor and semi-gradient updates on the representation are crucial to preventing representation collapse. Then, in an idealized setup, we show that self-predictive learning dynamics carry out spectral decomposition on the state transition matrix, effectively capturing information about the transition dynamics. Building on the theoretical insights, we propose bidirectional self-predictive learning, a novel self-predictive algorithm that learns two representations simultaneously. We examine the robustness of our theoretical insights with a number of small-scale experiments and showcase the promise of the novel representation learning algorithm with large-scale experiments.

Submitted 6 December, 2022; originally announced December 2022.
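A small sketch of the latent self-prediction objective with a stop-gradient target, reflecting the semi-gradient update highlighted in the abstract; architectures and optimizers are left unspecified and the names are illustrative.

```python
import torch
import torch.nn.functional as F

def self_predictive_loss(encoder, predictor, obs, next_obs):
    """Latent self-prediction with a detached (stop-gradient) target latent.

    In practice the predictor would be optimized at a faster pace than the encoder,
    which the abstract identifies as key to avoiding representation collapse.
    (Illustrative sketch only.)
    """
    z_pred = predictor(encoder(obs))        # predicted next latent
    with torch.no_grad():
        z_next = encoder(next_obs)          # stop-gradient target latent
    return F.mse_loss(z_pred, z_next)
```

Using a larger learning rate (or more update steps) for `predictor` than for `encoder` mirrors the faster-paced predictor optimization the authors describe.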
arXiv:2206.15378 (https://arxiv.org/abs/2206.15378) [pdf, other] cs.AI cs.GT cs.MA; doi: 10.1126/science.add4679 (https://doi.org/10.1126/science.add4679)

Mastering the Game of Stratego with Model-Free Multiagent Reinforcement Learning

Authors: Julien Perolat, Bart de Vylder, Daniel Hennes, Eugene Tarassov, Florian Strub, Vincent de Boer, Paul Muller, Jerome T. Connor, Neil Burch, Thomas Anthony, Stephen McAleer, Romuald Elie, Sarah H. Cen, Zhe Wang, Audrunas Gruslys, Aleksandra Malysheva, Mina Khan, Sherjil Ozair, Finbarr Timbers, Toby Pohlen, Tom Eccles, Mark Rowland, Marc Lanctot, Jean-Baptiste Lespiau, Bilal Piot, et al. (9 additional authors not shown)

Abstract: We introduce DeepNash, an autonomous agent capable of learning to play the imperfect information game Stratego from scratch, up to a human expert level. Stratego is one of the few iconic board games that Artificial Intelligence (AI) has not yet mastered. This popular game has an enormous game tree on the order of $10^{535}$ nodes, i.e., $10^{175}$ times larger than that of Go. It has the additional complexity of requiring decision-making under imperfect information, similar to Texas hold'em poker, which has a significantly smaller game tree (on the order of $10^{164}$ nodes). Decisions in Stratego are made over a large number of discrete actions with no obvious link between action and outcome. Episodes are long, with often hundreds of moves before a player wins, and situations in Stratego cannot easily be broken down into manageably-sized sub-problems as in poker. For these reasons, Stratego has been a grand challenge for the field of AI for decades, and existing AI methods barely reach an amateur level of play. DeepNash uses a game-theoretic, model-free deep reinforcement learning method, without search, that learns to master Stratego via self-play. The Regularised Nash Dynamics (R-NaD) algorithm, a key component of DeepNash, converges to an approximate Nash equilibrium, instead of 'cycling' around it, by directly modifying the underlying multi-agent learning dynamics. DeepNash beats existing state-of-the-art AI methods in Stratego and achieved a yearly (2022) and all-time top-3 rank on the Gravon games platform, competing with human expert players.

Submitted 30 June, 2022; originally announced June 2022.
arXiv:2206.08332 (https://arxiv.org/abs/2206.08332) [pdf, other] cs.LG cs.AI stat.ML

BYOL-Explore: Exploration by Bootstrapped Prediction

Authors: Zhaohan Daniel Guo, Shantanu Thakoor, Miruna Pîslar, Bernardo Avila Pires, Florent Altché, Corentin Tallec, Alaa Saade, Daniele Calandriello, Jean-Bastien Grill, Yunhao Tang, Michal Valko, Rémi Munos, Mohammad Gheshlaghi Azar, Bilal Piot

Abstract: We present BYOL-Explore, a conceptually simple yet general approach for curiosity-driven exploration in visually-complex environments. BYOL-Explore learns a world representation, the world dynamics, and an exploration policy all together by optimizing a single prediction loss in the latent space with no additional auxiliary objective. We show that BYOL-Explore is effective in DM-HARD-8, a challenging partially-observable continuous-action hard-exploration benchmark with visually-rich 3-D environments. On this benchmark, we solve the majority of the tasks purely through augmenting the extrinsic reward with BYOL-Explore's intrinsic reward, whereas prior work could only get off the ground with human demonstrations. As further evidence of the generality of BYOL-Explore, we show that it achieves superhuman performance on the ten hardest exploration games in Atari while having a much simpler design than other competitive agents.

Submitted 16 June, 2022; originally announced June 2022.
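A hedged sketch of the core mechanism the abstract describes: the latent prediction error of a world model against a (stop-gradient) target embedding is reused as an intrinsic reward added to the extrinsic reward. Names and shapes are illustrative; the real agent's architecture and normalization are omitted.

```python
import torch
import torch.nn.functional as F

def curiosity_augmented_reward(world_model, target_encoder, latent, action,
                               next_obs, extrinsic_reward, scale=1.0):
    """Latent-prediction-error curiosity in the spirit of BYOL-Explore (sketch only).

    world_model(latent, action) -> predicted embedding of the next observation
    target_encoder(next_obs)    -> target embedding (no gradient flows through it)
    """
    pred = world_model(latent, action)
    with torch.no_grad():
        target = target_encoder(next_obs)
    intrinsic = F.mse_loss(pred, target)           # also serves as the training loss
    return extrinsic_reward + scale * float(intrinsic)
```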
arXiv:2110.10819 (https://arxiv.org/abs/2110.10819) [pdf, other] cs.LG cs.AI

Shaking the foundations: delusions in sequence models for interaction and control

Authors: Pedro A. Ortega, Markus Kunesch, Grégoire Delétang, Tim Genewein, Jordi Grau-Moya, Joel Veness, Jonas Buchli, Jonas Degrave, Bilal Piot, Julien Perolat, Tom Everitt, Corentin Tallec, Emilio Parisotto, Tom Erez, Yutian Chen, Scott Reed, Marcus Hutter, Nando de Freitas, Shane Legg

Abstract: The recent phenomenal success of language models has reinvigorated machine learning research, and large sequence models such as transformers are being applied to a variety of domains. One important problem class that has remained relatively elusive however is purposeful adaptive behavior. Currently there is a common perception that sequence models "lack the understanding of the cause and effect of their actions" leading them to draw incorrect inferences due to auto-suggestive delusions. In this report we explain where this mismatch originates, and show that it can be resolved by treating actions as causal interventions. Finally, we show that in supervised learning, one can teach a system to condition or intervene on data by training with factual and counterfactual error signals respectively.

Submitted 20 October, 2021; originally announced October 2021.

Comments: DeepMind Tech Report, 16 pages, 4 figures
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.10819v1-abstract-full').style.display = 'none'; document.getElementById('2110.10819v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">DeepMind Tech Report, 16 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2101.02055">arXiv:2101.02055</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2101.02055">pdf</a>, <a href="https://arxiv.org/format/2101.02055">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Geometric Entropic Exploration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Guo%2C+Z+D">Zhaohan Daniel Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Azar%2C+M+G">Mohammad Gheshlaghi Azar</a>, <a href="/search/cs?searchtype=author&amp;query=Saade%2C+A">Alaa Saade</a>, <a href="/search/cs?searchtype=author&amp;query=Thakoor%2C+S">Shantanu Thakoor</a>, <a href="/search/cs?searchtype=author&amp;query=Piot%2C+B">Bilal Piot</a>, <a href="/search/cs?searchtype=author&amp;query=Pires%2C+B+A">Bernardo Avila Pires</a>, <a href="/search/cs?searchtype=author&amp;query=Valko%2C+M">Michal Valko</a>, <a href="/search/cs?searchtype=author&amp;query=Mesnard%2C+T">Thomas Mesnard</a>, <a href="/search/cs?searchtype=author&amp;query=Lattimore%2C+T">Tor Lattimore</a>, <a href="/search/cs?searchtype=author&amp;query=Munos%2C+R">R茅mi Munos</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2101.02055v2-abstract-short" style="display: inline;"> Exploration is essential for solving complex Reinforcement Learning (RL) tasks. Maximum State-Visitation Entropy (MSVE) formulates the exploration problem as a well-defined policy optimization problem whose solution aims at visiting all states as uniformly as possible. This is in contrast to standard uncertainty-based approaches where exploration is transient and eventually vanishes. However, exis&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2101.02055v2-abstract-full').style.display = 'inline'; document.getElementById('2101.02055v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2101.02055v2-abstract-full" style="display: none;"> Exploration is essential for solving complex Reinforcement Learning (RL) tasks. Maximum State-Visitation Entropy (MSVE) formulates the exploration problem as a well-defined policy optimization problem whose solution aims at visiting all states as uniformly as possible. This is in contrast to standard uncertainty-based approaches where exploration is transient and eventually vanishes. 
arXiv:2010.10241 (https://arxiv.org/abs/2010.10241) [pdf, ps, other] stat.ML cs.CV cs.LG

BYOL works even without batch statistics

Authors: Pierre H. Richemond, Jean-Bastien Grill, Florent Altché, Corentin Tallec, Florian Strub, Andrew Brock, Samuel Smith, Soham De, Razvan Pascanu, Bilal Piot, Michal Valko

Abstract: Bootstrap Your Own Latent (BYOL) is a self-supervised learning approach for image representation. From an augmented view of an image, BYOL trains an online network to predict a target network representation of a different augmented view of the same image. Unlike contrastive methods, BYOL does not explicitly use a repulsion term built from negative pairs in its training objective. Yet, it avoids collapse to a trivial, constant representation. Thus, it has recently been hypothesized that batch normalization (BN) is critical to prevent collapse in BYOL. Indeed, BN flows gradients across batch elements, and could leak information about negative views in the batch, which could act as an implicit negative (contrastive) term. However, we experimentally show that replacing BN with a batch-independent normalization scheme (namely, a combination of group normalization and weight standardization) achieves performance comparable to vanilla BYOL (73.9% vs. 74.3% top-1 accuracy under the linear evaluation protocol on ImageNet with ResNet-50). Our finding disproves the hypothesis that the use of batch statistics is a crucial ingredient for BYOL to learn useful representations.

Submitted 20 October, 2020; originally announced October 2020.
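The batch-independent normalization the abstract refers to, group normalization plus weight standardization, can be sketched as a drop-in replacement for a Conv-BN-ReLU block; this is a generic PyTorch illustration, not the paper's training code.

```python
import torch.nn as nn
import torch.nn.functional as F

class WSConv2d(nn.Conv2d):
    """Conv2d with weight standardization: each output filter is rescaled to have
    zero mean and unit standard deviation over its (in_channels, kH, kW) weights."""
    def forward(self, x):
        w = self.weight
        mean = w.mean(dim=(1, 2, 3), keepdim=True)
        std = w.std(dim=(1, 2, 3), keepdim=True) + 1e-5
        return F.conv2d(x, (w - mean) / std, self.bias, self.stride,
                        self.padding, self.dilation, self.groups)

def gn_ws_block(in_ch, out_ch, groups=32):
    """Batch-independent replacement for a Conv-BN-ReLU block: weight-standardized
    convolution followed by group normalization (out_ch must be divisible by groups)."""
    return nn.Sequential(
        WSConv2d(in_ch, out_ch, kernel_size=3, padding=1, bias=False),
        nn.GroupNorm(groups, out_ch),
        nn.ReLU(inplace=True),
    )
```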
arXiv:2006.07733 (https://arxiv.org/abs/2006.07733) [pdf, other] cs.LG cs.CV stat.ML

Bootstrap your own latent: A new approach to self-supervised Learning

Authors: Jean-Bastien Grill, Florian Strub, Florent Altché, Corentin Tallec, Pierre H. Richemond, Elena Buchatskaya, Carl Doersch, Bernardo Avila Pires, Zhaohan Daniel Guo, Mohammad Gheshlaghi Azar, Bilal Piot, Koray Kavukcuoglu, Rémi Munos, Michal Valko

Abstract: We introduce Bootstrap Your Own Latent (BYOL), a new approach to self-supervised image representation learning. BYOL relies on two neural networks, referred to as online and target networks, that interact and learn from each other. From an augmented view of an image, we train the online network to predict the target network representation of the same image under a different augmented view. At the same time, we update the target network with a slow-moving average of the online network. While state-of-the-art methods rely on negative pairs, BYOL achieves a new state of the art without them. BYOL reaches 74.3% top-1 classification accuracy on ImageNet using a linear evaluation with a ResNet-50 architecture and 79.6% with a larger ResNet. We show that BYOL performs on par or better than the current state of the art on both transfer and semi-supervised benchmarks. Our implementation and pretrained models are given on GitHub.

Submitted 10 September, 2020; v1 submitted 13 June, 2020; originally announced June 2020.
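The two ingredients the abstract names, a prediction loss between online and target views and a slow-moving average target update, are commonly written as below; a minimal sketch with illustrative names, omitting augmentations, projectors and symmetrization.

```python
import torch
import torch.nn.functional as F

def byol_regression_loss(online_prediction, target_projection):
    """BYOL-style loss: mean squared error between l2-normalized vectors, which is
    equivalent (up to a constant) to negative cosine similarity. The target branch
    receives no gradient."""
    p = F.normalize(online_prediction, dim=-1)
    z = F.normalize(target_projection.detach(), dim=-1)   # stop-gradient target
    return 2.0 - 2.0 * (p * z).sum(dim=-1).mean()

@torch.no_grad()
def ema_update(target_net, online_net, tau=0.996):
    """Slow-moving (exponential moving average) update of the target network."""
    for t, o in zip(target_net.parameters(), online_net.parameters()):
        t.mul_(tau).add_(o, alpha=1.0 - tau)
```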
arXiv:2006.00979 (https://arxiv.org/abs/2006.00979) [pdf, other] cs.LG cs.AI

Acme: A Research Framework for Distributed Reinforcement Learning

Authors: Matthew W. Hoffman, Bobak Shahriari, John Aslanides, Gabriel Barth-Maron, Nikola Momchev, Danila Sinopalnikov, Piotr Stańczyk, Sabela Ramos, Anton Raichuk, Damien Vincent, Léonard Hussenot, Robert Dadashi, Gabriel Dulac-Arnold, Manu Orsini, Alexis Jacq, Johan Ferret, Nino Vieillard, Seyed Kamyar Seyed Ghasemipour, Sertan Girgin, Olivier Pietquin, Feryal Behbahani, Tamara Norman, Abbas Abdolmaleki, Albin Cassirer, Fan Yang, et al. (14 additional authors not shown)

Abstract: Deep reinforcement learning (RL) has led to many recent and groundbreaking advances. However, these advances have often come at the cost of both increased scale in the underlying architectures being trained as well as increased complexity of the RL algorithms used to train them. These increases have in turn made it more difficult for researchers to rapidly prototype new ideas or reproduce published RL algorithms. To address these concerns this work describes Acme, a framework for constructing novel RL algorithms that is specifically designed to enable agents that are built using simple, modular components that can be used at various scales of execution. While the primary goal of Acme is to provide a framework for algorithm development, a secondary goal is to provide simple reference implementations of important or state-of-the-art algorithms. These implementations serve both as a validation of our design decisions as well as an important contribution to reproducibility in RL research. In this work we describe the major design decisions made within Acme and give further details as to how its components can be used to implement various algorithms. Our experiments provide baselines for a number of common and state-of-the-art algorithms as well as showing how these algorithms can be scaled up for much larger and more complex environments. This highlights one of the primary advantages of Acme, namely that it can be used to implement large, distributed RL algorithms that can run at massive scales while still maintaining the inherent readability of that implementation. This work presents a second version of the paper which coincides with an increase in modularity, additional emphasis on offline, imitation and learning from demonstrations algorithms, as well as various new agents implemented as part of Acme.

Submitted 20 September, 2022; v1 submitted 1 June, 2020; originally announced June 2020.

Comments: This work presents a second version of the paper which coincides with an increase in modularity, additional emphasis on offline, imitation and learning from demonstrations algorithms, as well as various new agents implemented as part of Acme
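The modular actor/learner split described in this abstract can be illustrated with a generic environment loop; this is explicitly not the Acme API, just a duck-typed sketch of the kind of interface such a framework decomposes an agent into.

```python
def environment_loop(env, actor, learner, num_episodes=10):
    """Generic actor/learner loop (illustrative sketch, not Acme's actual classes).

    env     : reset() -> obs, step(action) -> (next_obs, reward, done)
    actor   : select_action(obs) -> action, observe(transition) -> None
    learner : step() -> None, one update from data the actor has written (e.g. replay)
    """
    for _ in range(num_episodes):
        obs, done = env.reset(), False
        while not done:
            action = actor.select_action(obs)
            next_obs, reward, done = env.step(action)
            actor.observe((obs, action, reward, next_obs, done))
            learner.step()              # in a distributed setup this runs elsewhere
            obs = next_obs
```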
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This work presents a second version of the paper which coincides with an increase in modularity, additional emphasis on offline, imitation and learning from demonstrations algorithms, as well as various new agents implemented as part of Acme</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2004.14646">arXiv:2004.14646</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2004.14646">pdf</a>, <a href="https://arxiv.org/format/2004.14646">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Bootstrap Latent-Predictive Representations for Multitask Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Guo%2C+D">Daniel Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Pires%2C+B+A">Bernardo Avila Pires</a>, <a href="/search/cs?searchtype=author&amp;query=Piot%2C+B">Bilal Piot</a>, <a href="/search/cs?searchtype=author&amp;query=Grill%2C+J">Jean-bastien Grill</a>, <a href="/search/cs?searchtype=author&amp;query=Altch%C3%A9%2C+F">Florent Altch茅</a>, <a href="/search/cs?searchtype=author&amp;query=Munos%2C+R">R茅mi Munos</a>, <a href="/search/cs?searchtype=author&amp;query=Azar%2C+M+G">Mohammad Gheshlaghi Azar</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2004.14646v1-abstract-short" style="display: inline;"> Learning a good representation is an essential component for deep reinforcement learning (RL). Representation learning is especially important in multitask and partially observable settings where building a representation of the unknown environment is crucial to solve the tasks. Here we introduce Prediction of Bootstrap Latents (PBL), a simple and flexible self-supervised representation learning a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2004.14646v1-abstract-full').style.display = 'inline'; document.getElementById('2004.14646v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2004.14646v1-abstract-full" style="display: none;"> Learning a good representation is an essential component for deep reinforcement learning (RL). Representation learning is especially important in multitask and partially observable settings where building a representation of the unknown environment is crucial to solve the tasks. Here we introduce Prediction of Bootstrap Latents (PBL), a simple and flexible self-supervised representation learning algorithm for multitask deep RL. PBL builds on multistep predictive representations of future observations, and focuses on capturing structured information about environment dynamics. Specifically, PBL trains its representation by predicting latent embeddings of future observations. These latent embeddings are themselves trained to be predictive of the aforementioned representations. 
arXiv:2003.13350 (https://arxiv.org/abs/2003.13350) [pdf, other] cs.LG stat.ML

Agent57: Outperforming the Atari Human Benchmark

Authors: Adrià Puigdomènech Badia, Bilal Piot, Steven Kapturowski, Pablo Sprechmann, Alex Vitvitskyi, Daniel Guo, Charles Blundell

Abstract: Atari games have been a long-standing benchmark in the reinforcement learning (RL) community for the past decade. This benchmark was proposed to test general competency of RL algorithms. Previous work has achieved good average performance by doing outstandingly well on many games of the set, but very poorly in several of the most challenging games. We propose Agent57, the first deep RL agent that outperforms the standard human benchmark on all 57 Atari games. To achieve this result, we train a neural network which parameterizes a family of policies ranging from very exploratory to purely exploitative. We propose an adaptive mechanism to choose which policy to prioritize throughout the training process. Additionally, we utilize a novel parameterization of the architecture that allows for more consistent and stable learning.

Submitted 30 March, 2020; originally announced March 2020.
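One common way to realize the "adaptive mechanism" mentioned above is to treat each member of the policy family (from very exploratory to purely exploitative) as a bandit arm scored on recent episode returns; the UCB-style selection below is an assumed illustration, not necessarily the exact mechanism used by Agent57.

```python
import math

def pick_policy_ucb(mean_returns, counts, c=1.0):
    """Choose which policy in the family to run next, UCB-style (illustrative sketch).

    mean_returns[i] : average episode return observed so far for policy i
    counts[i]       : number of episodes policy i has been selected
    """
    total = max(sum(counts), 1)
    best, best_score = 0, float("-inf")
    for i, (mu, n) in enumerate(zip(mean_returns, counts)):
        if n == 0:
            return i                                   # try each policy at least once
        score = mu + c * math.sqrt(math.log(total) / n)
        if score > best_score:
            best, best_score = i, score
    return best
```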
Previous work has achieved good average performance by doing outstandingly well on many games of the set, but very poorly in several of the most challenging games. We propose Agent57, the first deep RL agent that outperforms the standard human benchmark on all 57 Atari games. To achieve this result, we train a neural network which parameterizes a family of policies ranging from very exploratory to purely exploitative. We propose an adaptive mechanism to choose which policy to prioritize throughout the training process. Additionally, we utilize a novel parameterization of the architecture that allows for more consistent and stable learning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2003.13350v1-abstract-full').style.display = 'none'; document.getElementById('2003.13350v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 March, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2020. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2002.06038">arXiv:2002.06038</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2002.06038">pdf</a>, <a href="https://arxiv.org/format/2002.06038">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Never Give Up: Learning Directed Exploration Strategies </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Badia%2C+A+P">Adrià Puigdomènech Badia</a>, <a href="/search/cs?searchtype=author&amp;query=Sprechmann%2C+P">Pablo Sprechmann</a>, <a href="/search/cs?searchtype=author&amp;query=Vitvitskyi%2C+A">Alex Vitvitskyi</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+D">Daniel Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Piot%2C+B">Bilal Piot</a>, <a href="/search/cs?searchtype=author&amp;query=Kapturowski%2C+S">Steven Kapturowski</a>, <a href="/search/cs?searchtype=author&amp;query=Tieleman%2C+O">Olivier Tieleman</a>, <a href="/search/cs?searchtype=author&amp;query=Arjovsky%2C+M">Martín Arjovsky</a>, <a href="/search/cs?searchtype=author&amp;query=Pritzel%2C+A">Alexander Pritzel</a>, <a href="/search/cs?searchtype=author&amp;query=Bolt%2C+A">Andew Bolt</a>, <a href="/search/cs?searchtype=author&amp;query=Blundell%2C+C">Charles Blundell</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2002.06038v1-abstract-short" style="display: inline;"> We propose a reinforcement learning agent to solve hard exploration games by learning a range of directed exploratory policies. We construct an episodic memory-based intrinsic reward using k-nearest neighbors over the agent&#39;s recent experience to train the directed exploratory policies, thereby encouraging the agent to repeatedly revisit all states in its environment. 
A self-supervised inverse dyn&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2002.06038v1-abstract-full').style.display = 'inline'; document.getElementById('2002.06038v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2002.06038v1-abstract-full" style="display: none;"> We propose a reinforcement learning agent to solve hard exploration games by learning a range of directed exploratory policies. We construct an episodic memory-based intrinsic reward using k-nearest neighbors over the agent&#39;s recent experience to train the directed exploratory policies, thereby encouraging the agent to repeatedly revisit all states in its environment. A self-supervised inverse dynamics model is used to train the embeddings of the nearest neighbour lookup, biasing the novelty signal towards what the agent can control. We employ the framework of Universal Value Function Approximators (UVFA) to simultaneously learn many directed exploration policies with the same neural network, with different trade-offs between exploration and exploitation. By using the same neural network for different degrees of exploration/exploitation, transfer is demonstrated from predominantly exploratory policies yielding effective exploitative policies. The proposed method can be incorporated to run with modern distributed RL agents that collect large amounts of experience from many actors running in parallel on separate environment instances. Our method doubles the performance of the base agent in all hard exploration in the Atari-57 suite while maintaining a very high score across the remaining games, obtaining a median human normalised score of 1344.0%. Notably, the proposed method is the first algorithm to achieve non-zero rewards (with a mean score of 8,400) in the game of Pitfall! without using demonstrations or hand-crafted features. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2002.06038v1-abstract-full').style.display = 'none'; document.getElementById('2002.06038v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 February, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2020. 
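<p class="is-size-7">A minimal sketch, in the spirit of the episodic memory-based intrinsic reward described in the abstract above: novelty is scored against the k nearest embeddings already stored for the episode. The kernel, constants, and random embeddings below are illustrative assumptions, not the paper's exact design.</p> <pre><code># toy episodic k-nearest-neighbour novelty bonus (illustrative only)
import numpy as np

def episodic_bonus(memory, embedding, k=10, eps=1e-3):
    """Inverse similarity of `embedding` to its k nearest neighbours in memory."""
    if not memory:
        return 1.0
    mem = np.stack(memory)
    d2 = np.sum((mem - embedding) ** 2, axis=1)     # squared distances to memory
    nearest = np.sort(d2)[:k]
    nearest = nearest / (np.mean(nearest) + 1e-8)   # normalise by their mean scale
    similarity = eps / (nearest + eps)              # inverse kernel: close = similar
    return float(1.0 / np.sqrt(np.sum(similarity) + 1e-8))

memory = []                                         # cleared at the start of every episode
for step in range(5):
    z = np.random.randn(8)                          # stand-in for a learned controllable-state embedding
    print(step, round(episodic_bonus(memory, z), 3))
    memory.append(z)
</code></pre>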
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published as a conference paper in ICLR 2020</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1912.02503">arXiv:1912.02503</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1912.02503">pdf</a>, <a href="https://arxiv.org/format/1912.02503">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Hindsight Credit Assignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Harutyunyan%2C+A">Anna Harutyunyan</a>, <a href="/search/cs?searchtype=author&amp;query=Dabney%2C+W">Will Dabney</a>, <a href="/search/cs?searchtype=author&amp;query=Mesnard%2C+T">Thomas Mesnard</a>, <a href="/search/cs?searchtype=author&amp;query=Azar%2C+M">Mohammad Azar</a>, <a href="/search/cs?searchtype=author&amp;query=Piot%2C+B">Bilal Piot</a>, <a href="/search/cs?searchtype=author&amp;query=Heess%2C+N">Nicolas Heess</a>, <a href="/search/cs?searchtype=author&amp;query=van+Hasselt%2C+H">Hado van Hasselt</a>, <a href="/search/cs?searchtype=author&amp;query=Wayne%2C+G">Greg Wayne</a>, <a href="/search/cs?searchtype=author&amp;query=Singh%2C+S">Satinder Singh</a>, <a href="/search/cs?searchtype=author&amp;query=Precup%2C+D">Doina Precup</a>, <a href="/search/cs?searchtype=author&amp;query=Munos%2C+R">Remi Munos</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1912.02503v1-abstract-short" style="display: inline;"> We consider the problem of efficient credit assignment in reinforcement learning. In order to efficiently and meaningfully utilize new data, we propose to explicitly assign credit to past decisions based on the likelihood of them having led to the observed outcome. This approach uses new information in hindsight, rather than employing foresight. Somewhat surprisingly, we show that value functions&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1912.02503v1-abstract-full').style.display = 'inline'; document.getElementById('1912.02503v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1912.02503v1-abstract-full" style="display: none;"> We consider the problem of efficient credit assignment in reinforcement learning. In order to efficiently and meaningfully utilize new data, we propose to explicitly assign credit to past decisions based on the likelihood of them having led to the observed outcome. This approach uses new information in hindsight, rather than employing foresight. Somewhat surprisingly, we show that value functions can be rewritten through this lens, yielding a new family of algorithms. We study the properties of these algorithms, and empirically show that they successfully address important credit assignment challenges, through a set of illustrative tasks. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1912.02503v1-abstract-full').style.display = 'none'; document.getElementById('1912.02503v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 December, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2019</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1902.07685">arXiv:1902.07685</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1902.07685">pdf</a>, <a href="https://arxiv.org/format/1902.07685">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Applications">stat.AP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> World Discovery Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Azar%2C+M+G">Mohammad Gheshlaghi Azar</a>, <a href="/search/cs?searchtype=author&amp;query=Piot%2C+B">Bilal Piot</a>, <a href="/search/cs?searchtype=author&amp;query=Pires%2C+B+A">Bernardo Avila Pires</a>, <a href="/search/cs?searchtype=author&amp;query=Grill%2C+J">Jean-Bastien Grill</a>, <a href="/search/cs?searchtype=author&amp;query=Altch%C3%A9%2C+F">Florent Altché</a>, <a href="/search/cs?searchtype=author&amp;query=Munos%2C+R">Rémi Munos</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1902.07685v3-abstract-short" style="display: inline;"> As humans we are driven by a strong desire for seeking novelty in our world. Also upon observing a novel pattern we are capable of refining our understanding of the world based on the new information---humans can discover their world. The outstanding ability of the human mind for discovery has led to many breakthroughs in science, art and technology. Here we investigate the possibility of building&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1902.07685v3-abstract-full').style.display = 'inline'; document.getElementById('1902.07685v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1902.07685v3-abstract-full" style="display: none;"> As humans we are driven by a strong desire for seeking novelty in our world. Also upon observing a novel pattern we are capable of refining our understanding of the world based on the new information---humans can discover their world. The outstanding ability of the human mind for discovery has led to many breakthroughs in science, art and technology. Here we investigate the possibility of building an agent capable of discovering its world using the modern AI technology. 
In particular we introduce NDIGO, Neural Differential Information Gain Optimisation, a self-supervised discovery model that aims at seeking new information to construct a global view of its world from partial and noisy observations. Our experiments on some controlled 2-D navigation tasks show that NDIGO outperforms state-of-the-art information-seeking methods in terms of the quality of the learned representation. The improvement in performance is particularly significant in the presence of white or structured noise where other information-seeking methods follow the noise instead of discovering their world. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1902.07685v3-abstract-full').style.display = 'none'; document.getElementById('1902.07685v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 March, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 February, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2019. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1811.06407">arXiv:1811.06407</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1811.06407">pdf</a>, <a href="https://arxiv.org/format/1811.06407">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Neural Predictive Belief Representations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Guo%2C+Z+D">Zhaohan Daniel Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Azar%2C+M+G">Mohammad Gheshlaghi Azar</a>, <a href="/search/cs?searchtype=author&amp;query=Piot%2C+B">Bilal Piot</a>, <a href="/search/cs?searchtype=author&amp;query=Pires%2C+B+A">Bernardo A. Pires</a>, <a href="/search/cs?searchtype=author&amp;query=Munos%2C+R">Rémi Munos</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1811.06407v2-abstract-short" style="display: inline;"> Unsupervised representation learning has succeeded with excellent results in many applications. It is an especially powerful tool to learn a good representation of environments with partial or noisy observations. In partially observable domains it is important for the representation to encode a belief state, a sufficient statistic of the observations seen so far. In this paper, we investigate whet&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1811.06407v2-abstract-full').style.display = 'inline'; document.getElementById('1811.06407v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1811.06407v2-abstract-full" style="display: none;"> Unsupervised representation learning has succeeded with excellent results in many applications. It is an especially powerful tool to learn a good representation of environments with partial or noisy observations. 
In partially observable domains it is important for the representation to encode a belief state, a sufficient statistic of the observations seen so far. In this paper, we investigate whether it is possible to learn such a belief representation using modern neural architectures. Specifically, we focus on one-step frame prediction and two variants of contrastive predictive coding (CPC) as the objective functions to learn the representations. To evaluate these learned representations, we test how well they can predict various pieces of information about the underlying state of the environment, e.g., position of the agent in a 3D maze. We show that all three methods are able to learn belief representations of the environment, they encode not only the state information, but also its uncertainty, a crucial aspect of belief states. We also find that for CPC multi-step predictions and action-conditioning are critical for accurate belief representations in visually complex environments. The ability of neural representations to capture the belief information has the potential to spur new advances for learning and planning in partially observable domains, where leveraging uncertainty is essential for optimal decision making. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1811.06407v2-abstract-full').style.display = 'none'; document.getElementById('1811.06407v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 August, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 November, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2018. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1809.07802">arXiv:1809.07802</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1809.07802">pdf</a>, <a href="https://arxiv.org/format/1809.07802">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Playing the Game of Universal Adversarial Perturbations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Perolat%2C+J">Julien Perolat</a>, <a href="/search/cs?searchtype=author&amp;query=Malinowski%2C+M">Mateusz Malinowski</a>, <a href="/search/cs?searchtype=author&amp;query=Piot%2C+B">Bilal Piot</a>, <a href="/search/cs?searchtype=author&amp;query=Pietquin%2C+O">Olivier Pietquin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1809.07802v2-abstract-short" style="display: inline;"> We study the problem of learning classifiers robust to universal adversarial perturbations. While prior work approaches this problem via robust optimization, adversarial training, or input transformation, we instead phrase it as a two-player zero-sum game. 
In this new formulation, both players simultaneously play the same game, where one player chooses a classifier that minimizes a classification&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1809.07802v2-abstract-full').style.display = 'inline'; document.getElementById('1809.07802v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1809.07802v2-abstract-full" style="display: none;"> We study the problem of learning classifiers robust to universal adversarial perturbations. While prior work approaches this problem via robust optimization, adversarial training, or input transformation, we instead phrase it as a two-player zero-sum game. In this new formulation, both players simultaneously play the same game, where one player chooses a classifier that minimizes a classification loss whilst the other player creates an adversarial perturbation that increases the same loss when applied to every sample in the training set. By observing that performing a classification (respectively creating adversarial samples) is the best response to the other player, we propose a novel extension of a game-theoretic algorithm, namely fictitious play, to the domain of training robust classifiers. Finally, we empirically show the robustness and versatility of our approach in two defence scenarios where universal attacks are performed on several image classification datasets -- CIFAR10, CIFAR100 and ImageNet. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1809.07802v2-abstract-full').style.display = 'none'; document.getElementById('1809.07802v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 September, 2018; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 September, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2018. 
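<p class="is-size-7">A toy sketch of the alternating best-response (fictitious-play style) loop described in the abstract above, using a 2-D logistic-regression classifier and a single L-infinity-bounded universal perturbation; the data, model, and step sizes are illustrative assumptions rather than the paper's setup.</p> <pre><code># toy fictitious play between a classifier and a universal perturbation (illustrative)
import numpy as np

rng = np.random.default_rng(0)
X = np.vstack([rng.normal(-1.0, 1.0, (100, 2)), rng.normal(1.0, 1.0, (100, 2))])
y = np.hstack([np.zeros(100), np.ones(100)])

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def fit_classifier(deltas, steps=200, lr=0.1):
    """Classifier best response: fit on data shifted by past universal perturbations."""
    w, b = np.zeros(2), 0.0
    for _ in range(steps):
        d = deltas[rng.integers(len(deltas))] if deltas else np.zeros(2)
        p = sigmoid((X + d) @ w + b)
        w -= lr * (X + d).T @ (p - y) / len(y)      # logistic-loss gradient step
        b -= lr * float(np.mean(p - y))
    return w, b

def fit_perturbation(classifiers, eps=0.5, steps=100, lr=0.1):
    """Attacker best response: one bounded shift that raises the loss of past classifiers."""
    d = np.zeros(2)
    for _ in range(steps):
        w, b = classifiers[rng.integers(len(classifiers))]
        p = sigmoid((X + d) @ w + b)
        grad = float(np.mean(p - y)) * w            # d(loss)/d(delta) for the logistic loss
        d = np.clip(d + lr * np.sign(grad), -eps, eps)
    return d

classifiers, deltas = [], []
for _ in range(5):                                  # fictitious-play-style rounds
    classifiers.append(fit_classifier(deltas))
    deltas.append(fit_perturbation(classifiers))
print("final universal perturbation:", np.round(deltas[-1], 2))
</code></pre>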
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1805.11593">arXiv:1805.11593</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1805.11593">pdf</a>, <a href="https://arxiv.org/format/1805.11593">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Observe and Look Further: Achieving Consistent Performance on Atari </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Pohlen%2C+T">Tobias Pohlen</a>, <a href="/search/cs?searchtype=author&amp;query=Piot%2C+B">Bilal Piot</a>, <a href="/search/cs?searchtype=author&amp;query=Hester%2C+T">Todd Hester</a>, <a href="/search/cs?searchtype=author&amp;query=Azar%2C+M+G">Mohammad Gheshlaghi Azar</a>, <a href="/search/cs?searchtype=author&amp;query=Horgan%2C+D">Dan Horgan</a>, <a href="/search/cs?searchtype=author&amp;query=Budden%2C+D">David Budden</a>, <a href="/search/cs?searchtype=author&amp;query=Barth-Maron%2C+G">Gabriel Barth-Maron</a>, <a href="/search/cs?searchtype=author&amp;query=van+Hasselt%2C+H">Hado van Hasselt</a>, <a href="/search/cs?searchtype=author&amp;query=Quan%2C+J">John Quan</a>, <a href="/search/cs?searchtype=author&amp;query=Ve%C4%8Der%C3%ADk%2C+M">Mel Večerík</a>, <a href="/search/cs?searchtype=author&amp;query=Hessel%2C+M">Matteo Hessel</a>, <a href="/search/cs?searchtype=author&amp;query=Munos%2C+R">Rémi Munos</a>, <a href="/search/cs?searchtype=author&amp;query=Pietquin%2C+O">Olivier Pietquin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1805.11593v1-abstract-short" style="display: inline;"> Despite significant advances in the field of deep Reinforcement Learning (RL), today&#39;s algorithms still fail to learn human-level policies consistently over a set of diverse tasks such as Atari 2600 games. We identify three key challenges that any algorithm needs to master in order to perform well on all games: processing diverse reward distributions, reasoning over long time horizons, and explori&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1805.11593v1-abstract-full').style.display = 'inline'; document.getElementById('1805.11593v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1805.11593v1-abstract-full" style="display: none;"> Despite significant advances in the field of deep Reinforcement Learning (RL), today&#39;s algorithms still fail to learn human-level policies consistently over a set of diverse tasks such as Atari 2600 games. We identify three key challenges that any algorithm needs to master in order to perform well on all games: processing diverse reward distributions, reasoning over long time horizons, and exploring efficiently. In this paper, we propose an algorithm that addresses each of these challenges and is able to learn human-level policies on nearly all Atari games. 
A new transformed Bellman operator allows our algorithm to process rewards of varying densities and scales; an auxiliary temporal consistency loss allows us to train stably using a discount factor of $γ = 0.999$ (instead of $γ = 0.99$) extending the effective planning horizon by an order of magnitude; and we ease the exploration problem by using human demonstrations that guide the agent towards rewarding states. When tested on a set of 42 Atari games, our algorithm exceeds the performance of an average human on 40 games using a common set of hyper parameters. Furthermore, it is the first deep RL algorithm to solve the first level of Montezuma&#39;s Revenge. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1805.11593v1-abstract-full').style.display = 'none'; document.getElementById('1805.11593v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 May, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2018. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1710.02298">arXiv:1710.02298</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1710.02298">pdf</a>, <a href="https://arxiv.org/format/1710.02298">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Rainbow: Combining Improvements in Deep Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hessel%2C+M">Matteo Hessel</a>, <a href="/search/cs?searchtype=author&amp;query=Modayil%2C+J">Joseph Modayil</a>, <a href="/search/cs?searchtype=author&amp;query=van+Hasselt%2C+H">Hado van Hasselt</a>, <a href="/search/cs?searchtype=author&amp;query=Schaul%2C+T">Tom Schaul</a>, <a href="/search/cs?searchtype=author&amp;query=Ostrovski%2C+G">Georg Ostrovski</a>, <a href="/search/cs?searchtype=author&amp;query=Dabney%2C+W">Will Dabney</a>, <a href="/search/cs?searchtype=author&amp;query=Horgan%2C+D">Dan Horgan</a>, <a href="/search/cs?searchtype=author&amp;query=Piot%2C+B">Bilal Piot</a>, <a href="/search/cs?searchtype=author&amp;query=Azar%2C+M">Mohammad Azar</a>, <a href="/search/cs?searchtype=author&amp;query=Silver%2C+D">David Silver</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1710.02298v1-abstract-short" style="display: inline;"> The deep reinforcement learning community has made several independent improvements to the DQN algorithm. However, it is unclear which of these extensions are complementary and can be fruitfully combined. This paper examines six extensions to the DQN algorithm and empirically studies their combination. 
Our experiments show that the combination provides state-of-the-art performance on the Atari 260&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1710.02298v1-abstract-full').style.display = 'inline'; document.getElementById('1710.02298v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1710.02298v1-abstract-full" style="display: none;"> The deep reinforcement learning community has made several independent improvements to the DQN algorithm. However, it is unclear which of these extensions are complementary and can be fruitfully combined. This paper examines six extensions to the DQN algorithm and empirically studies their combination. Our experiments show that the combination provides state-of-the-art performance on the Atari 2600 benchmark, both in terms of data efficiency and final performance. We also provide results from a detailed ablation study that shows the contribution of each component to overall performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1710.02298v1-abstract-full').style.display = 'none'; document.getElementById('1710.02298v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 October, 2017; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2017. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under review as a conference paper at AAAI 2018</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1707.08817">arXiv:1707.08817</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1707.08817">pdf</a>, <a href="https://arxiv.org/format/1707.08817">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Leveraging Demonstrations for Deep Reinforcement Learning on Robotics Problems with Sparse Rewards </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Vecerik%2C+M">Mel Vecerik</a>, <a href="/search/cs?searchtype=author&amp;query=Hester%2C+T">Todd Hester</a>, <a href="/search/cs?searchtype=author&amp;query=Scholz%2C+J">Jonathan Scholz</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+F">Fumin Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Pietquin%2C+O">Olivier Pietquin</a>, <a href="/search/cs?searchtype=author&amp;query=Piot%2C+B">Bilal Piot</a>, <a href="/search/cs?searchtype=author&amp;query=Heess%2C+N">Nicolas Heess</a>, <a href="/search/cs?searchtype=author&amp;query=Roth%C3%B6rl%2C+T">Thomas Rothörl</a>, <a href="/search/cs?searchtype=author&amp;query=Lampe%2C+T">Thomas Lampe</a>, <a href="/search/cs?searchtype=author&amp;query=Riedmiller%2C+M">Martin Riedmiller</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1707.08817v2-abstract-short" style="display: inline;"> We propose a general and model-free approach for Reinforcement Learning (RL) on real robotics with sparse rewards. 
We build upon the Deep Deterministic Policy Gradient (DDPG) algorithm to use demonstrations. Both demonstrations and actual interactions are used to fill a replay buffer and the sampling ratio between demonstrations and transitions is automatically tuned via a prioritized replay mecha&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1707.08817v2-abstract-full').style.display = 'inline'; document.getElementById('1707.08817v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1707.08817v2-abstract-full" style="display: none;"> We propose a general and model-free approach for Reinforcement Learning (RL) on real robotics with sparse rewards. We build upon the Deep Deterministic Policy Gradient (DDPG) algorithm to use demonstrations. Both demonstrations and actual interactions are used to fill a replay buffer and the sampling ratio between demonstrations and transitions is automatically tuned via a prioritized replay mechanism. Typically, carefully engineered shaping rewards are required to enable the agents to efficiently explore on high dimensional control problems such as robotics. They are also required for model-based acceleration methods relying on local solvers such as iLQG (e.g. Guided Policy Search and Normalized Advantage Function). The demonstrations replace the need for carefully engineered rewards, and reduce the exploration problem encountered by classical RL approaches in these domains. Demonstrations are collected by a robot kinesthetically force-controlled by a human demonstrator. Results on four simulated insertion tasks show that DDPG from demonstrations out-performs DDPG, and does not require engineered rewards. Finally, we demonstrate the method on a real robotics task consisting of inserting a clip (flexible object) into a rigid object. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1707.08817v2-abstract-full').style.display = 'none'; document.getElementById('1707.08817v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2018; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 July, 2017; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2017. 
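<p class="is-size-7">A minimal sketch of a replay buffer that mixes demonstration and agent transitions and samples them by priority, in the spirit of the mechanism described in the abstract above; the priority formula and constants are illustrative assumptions, not the paper's exact scheme.</p> <pre><code># toy replay buffer mixing demonstrations and agent experience (illustrative)
import random

class MixedReplayBuffer:
    def __init__(self, demo_bonus=0.3):
        self.data, self.prio = [], []
        self.demo_bonus = demo_bonus            # extra priority for demonstration transitions

    def add(self, transition, td_error=1.0, is_demo=False):
        self.data.append((transition, is_demo))
        bonus = self.demo_bonus if is_demo else 0.0
        self.prio.append(abs(td_error) + bonus + 1e-3)   # never let a priority hit zero

    def sample(self, batch_size):
        # priority-weighted sampling decides the demo/agent ratio of each batch
        return random.choices(self.data, weights=self.prio, k=batch_size)

buf = MixedReplayBuffer()
for t in range(20):
    buf.add(("demo", t), td_error=0.5, is_demo=True)
    buf.add(("agent", t), td_error=0.5, is_demo=False)
batch = buf.sample(8)
print(sum(1 for _, is_demo in batch if is_demo), "demo transitions in the batch")
</code></pre>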
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1706.10295">arXiv:1706.10295</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1706.10295">pdf</a>, <a href="https://arxiv.org/format/1706.10295">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Noisy Networks for Exploration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Fortunato%2C+M">Meire Fortunato</a>, <a href="/search/cs?searchtype=author&amp;query=Azar%2C+M+G">Mohammad Gheshlaghi Azar</a>, <a href="/search/cs?searchtype=author&amp;query=Piot%2C+B">Bilal Piot</a>, <a href="/search/cs?searchtype=author&amp;query=Menick%2C+J">Jacob Menick</a>, <a href="/search/cs?searchtype=author&amp;query=Osband%2C+I">Ian Osband</a>, <a href="/search/cs?searchtype=author&amp;query=Graves%2C+A">Alex Graves</a>, <a href="/search/cs?searchtype=author&amp;query=Mnih%2C+V">Vlad Mnih</a>, <a href="/search/cs?searchtype=author&amp;query=Munos%2C+R">Remi Munos</a>, <a href="/search/cs?searchtype=author&amp;query=Hassabis%2C+D">Demis Hassabis</a>, <a href="/search/cs?searchtype=author&amp;query=Pietquin%2C+O">Olivier Pietquin</a>, <a href="/search/cs?searchtype=author&amp;query=Blundell%2C+C">Charles Blundell</a>, <a href="/search/cs?searchtype=author&amp;query=Legg%2C+S">Shane Legg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1706.10295v3-abstract-short" style="display: inline;"> We introduce NoisyNet, a deep reinforcement learning agent with parametric noise added to its weights, and show that the induced stochasticity of the agent&#39;s policy can be used to aid efficient exploration. The parameters of the noise are learned with gradient descent along with the remaining network weights. NoisyNet is straightforward to implement and adds little computational overhead. We find&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1706.10295v3-abstract-full').style.display = 'inline'; document.getElementById('1706.10295v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1706.10295v3-abstract-full" style="display: none;"> We introduce NoisyNet, a deep reinforcement learning agent with parametric noise added to its weights, and show that the induced stochasticity of the agent&#39;s policy can be used to aid efficient exploration. The parameters of the noise are learned with gradient descent along with the remaining network weights. NoisyNet is straightforward to implement and adds little computational overhead. We find that replacing the conventional exploration heuristics for A3C, DQN and dueling agents (entropy reward and $ε$-greedy respectively) with NoisyNet yields substantially higher scores for a wide range of Atari games, in some cases advancing the agent from sub to super-human performance. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1706.10295v3-abstract-full').style.display = 'none'; document.getElementById('1706.10295v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 July, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 June, 2017; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2017. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICLR 2018</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1706.06617">arXiv:1706.06617</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1706.06617">pdf</a>, <a href="https://arxiv.org/format/1706.06617">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Observational Learning by Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Borsa%2C+D">Diana Borsa</a>, <a href="/search/cs?searchtype=author&amp;query=Piot%2C+B">Bilal Piot</a>, <a href="/search/cs?searchtype=author&amp;query=Munos%2C+R">Rémi Munos</a>, <a href="/search/cs?searchtype=author&amp;query=Pietquin%2C+O">Olivier Pietquin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1706.06617v1-abstract-short" style="display: inline;"> Observational learning is a type of learning that occurs as a function of observing, retaining and possibly replicating or imitating the behaviour of another agent. It is a core mechanism appearing in various instances of social learning and has been found to be employed in several intelligent species, including humans. In this paper, we investigate to what extent the explicit modelling of other a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1706.06617v1-abstract-full').style.display = 'inline'; document.getElementById('1706.06617v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1706.06617v1-abstract-full" style="display: none;"> Observational learning is a type of learning that occurs as a function of observing, retaining and possibly replicating or imitating the behaviour of another agent. It is a core mechanism appearing in various instances of social learning and has been found to be employed in several intelligent species, including humans. In this paper, we investigate to what extent the explicit modelling of other agents is necessary to achieve observational learning through machine learning. Especially, we argue that observational learning can emerge from pure Reinforcement Learning (RL), potentially coupled with memory. 
Through simple scenarios, we demonstrate that an RL agent can leverage the information provided by the observations of an other agent performing a task in a shared environment. The other agent is only observed through the effect of its actions on the environment and never explicitly modeled. Two key aspects are borrowed from observational learning: i) the observer behaviour needs to change as a result of viewing a &#39;teacher&#39; (another agent) and ii) the observer needs to be motivated somehow to engage in making use of the other agent&#39;s behaviour. The later is naturally modeled by RL, by correlating the learning agent&#39;s reward with the teacher agent&#39;s behaviour. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1706.06617v1-abstract-full').style.display = 'none'; document.getElementById('1706.06617v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 June, 2017; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2017. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1704.04651">arXiv:1704.04651</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1704.04651">pdf</a>, <a href="https://arxiv.org/format/1704.04651">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> The Reactor: A fast and sample-efficient Actor-Critic agent for Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gruslys%2C+A">Audrunas Gruslys</a>, <a href="/search/cs?searchtype=author&amp;query=Dabney%2C+W">Will Dabney</a>, <a href="/search/cs?searchtype=author&amp;query=Azar%2C+M+G">Mohammad Gheshlaghi Azar</a>, <a href="/search/cs?searchtype=author&amp;query=Piot%2C+B">Bilal Piot</a>, <a href="/search/cs?searchtype=author&amp;query=Bellemare%2C+M">Marc Bellemare</a>, <a href="/search/cs?searchtype=author&amp;query=Munos%2C+R">Remi Munos</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1704.04651v2-abstract-short" style="display: inline;"> In this work we present a new agent architecture, called Reactor, which combines multiple algorithmic and architectural contributions to produce an agent with higher sample-efficiency than Prioritized Dueling DQN (Wang et al., 2016) and Categorical DQN (Bellemare et al., 2017), while giving better run-time performance than A3C (Mnih et al., 2016). 
Our first contribution is a new policy evaluation&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1704.04651v2-abstract-full').style.display = 'inline'; document.getElementById('1704.04651v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1704.04651v2-abstract-full" style="display: none;"> In this work we present a new agent architecture, called Reactor, which combines multiple algorithmic and architectural contributions to produce an agent with higher sample-efficiency than Prioritized Dueling DQN (Wang et al., 2016) and Categorical DQN (Bellemare et al., 2017), while giving better run-time performance than A3C (Mnih et al., 2016). Our first contribution is a new policy evaluation algorithm called Distributional Retrace, which brings multi-step off-policy updates to the distributional reinforcement learning setting. The same approach can be used to convert several classes of multi-step policy evaluation algorithms designed for expected value evaluation into distributional ones. Next, we introduce the β-leave-one-out policy gradient algorithm which improves the trade-off between variance and bias by using action values as a baseline. Our final algorithmic contribution is a new prioritized replay algorithm for sequences, which exploits the temporal locality of neighboring observations for more efficient replay prioritization. Using the Atari 2600 benchmarks, we show that each of these innovations contribute to both the sample efficiency and final agent performance. Finally, we demonstrate that Reactor reaches state-of-the-art performance after 200 million frames and less than a day of training. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1704.04651v2-abstract-full').style.display = 'none'; document.getElementById('1704.04651v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 June, 2018; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 April, 2017; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2017. 
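<p class="is-size-7">For context on the multi-step off-policy updates mentioned in the abstract above, here is a small sketch of the standard expected-value Retrace(λ) target that Distributional Retrace generalises to return distributions; the trajectory data below are random placeholders, not learned quantities.</p> <pre><code># expected-value Retrace(lambda) target for one short off-policy trajectory (illustrative)
import numpy as np

def retrace_target(q, pi, mu, actions, rewards, gamma=0.99, lam=1.0):
    """q: (T+1, A) action values, pi: (T+1, A) target-policy probabilities,
    mu: (T,) behaviour-policy probabilities of the actions actually taken."""
    T = len(rewards)
    target = q[0, actions[0]]
    c = 1.0
    for t in range(T):
        if t > 0:  # truncated importance weights c_s = lam * min(1, pi/mu)
            c *= lam * min(1.0, pi[t, actions[t]] / mu[t])
        expected_next = float(np.dot(pi[t + 1], q[t + 1]))   # E_pi Q(x_{t+1}, .)
        delta = rewards[t] + gamma * expected_next - q[t, actions[t]]
        target += (gamma ** t) * c * delta
    return target

rng = np.random.default_rng(0)
T, A = 4, 3
q = rng.normal(size=(T + 1, A))
pi = rng.dirichlet(np.ones(A), size=T + 1)
actions = rng.integers(A, size=T)
mu = np.full(T, 1.0 / A)                                     # uniform behaviour policy
rewards = rng.normal(size=T)
print(round(retrace_target(q, pi, mu, actions, rewards), 3))
</code></pre>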
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1704.03732">arXiv:1704.03732</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1704.03732">pdf</a>, <a href="https://arxiv.org/ps/1704.03732">ps</a>, <a href="https://arxiv.org/format/1704.03732">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Deep Q-learning from Demonstrations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hester%2C+T">Todd Hester</a>, <a href="/search/cs?searchtype=author&amp;query=Vecerik%2C+M">Matej Vecerik</a>, <a href="/search/cs?searchtype=author&amp;query=Pietquin%2C+O">Olivier Pietquin</a>, <a href="/search/cs?searchtype=author&amp;query=Lanctot%2C+M">Marc Lanctot</a>, <a href="/search/cs?searchtype=author&amp;query=Schaul%2C+T">Tom Schaul</a>, <a href="/search/cs?searchtype=author&amp;query=Piot%2C+B">Bilal Piot</a>, <a href="/search/cs?searchtype=author&amp;query=Horgan%2C+D">Dan Horgan</a>, <a href="/search/cs?searchtype=author&amp;query=Quan%2C+J">John Quan</a>, <a href="/search/cs?searchtype=author&amp;query=Sendonaris%2C+A">Andrew Sendonaris</a>, <a href="/search/cs?searchtype=author&amp;query=Dulac-Arnold%2C+G">Gabriel Dulac-Arnold</a>, <a href="/search/cs?searchtype=author&amp;query=Osband%2C+I">Ian Osband</a>, <a href="/search/cs?searchtype=author&amp;query=Agapiou%2C+J">John Agapiou</a>, <a href="/search/cs?searchtype=author&amp;query=Leibo%2C+J+Z">Joel Z. Leibo</a>, <a href="/search/cs?searchtype=author&amp;query=Gruslys%2C+A">Audrunas Gruslys</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1704.03732v4-abstract-short" style="display: inline;"> Deep reinforcement learning (RL) has achieved several high profile successes in difficult decision-making problems. However, these algorithms typically require a huge amount of data before they reach reasonable performance. In fact, their performance during learning can be extremely poor. This may be acceptable for a simulator, but it severely limits the applicability of deep RL to many real-world&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1704.03732v4-abstract-full').style.display = 'inline'; document.getElementById('1704.03732v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1704.03732v4-abstract-full" style="display: none;"> Deep reinforcement learning (RL) has achieved several high profile successes in difficult decision-making problems. However, these algorithms typically require a huge amount of data before they reach reasonable performance. In fact, their performance during learning can be extremely poor. This may be acceptable for a simulator, but it severely limits the applicability of deep RL to many real-world tasks, where the agent must learn in the real environment. In this paper we study a setting where the agent may access data from previous control of the system. 
We present an algorithm, Deep Q-learning from Demonstrations (DQfD), that leverages small sets of demonstration data to massively accelerate the learning process even from relatively small amounts of demonstration data and is able to automatically assess the necessary ratio of demonstration data while learning thanks to a prioritized replay mechanism. DQfD works by combining temporal difference updates with supervised classification of the demonstrator&#39;s actions. We show that DQfD has better initial performance than Prioritized Dueling Double Deep Q-Networks (PDD DQN) as it starts with better scores on the first million steps on 41 of 42 games and on average it takes PDD DQN 83 million steps to catch up to DQfD&#39;s performance. DQfD learns to out-perform the best demonstration given in 14 of 42 games. In addition, DQfD leverages human demonstrations to achieve state-of-the-art results for 11 games. Finally, we show that DQfD performs better than three related algorithms for incorporating demonstration data into DQN. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1704.03732v4-abstract-full').style.display = 'none'; document.getElementById('1704.03732v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2017; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 April, 2017; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2017. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published at AAAI 2018. Previously on arxiv as &#34;Learning from Demonstrations for Real World Reinforcement Learning&#34;</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1703.05423">arXiv:1703.05423</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1703.05423">pdf</a>, <a href="https://arxiv.org/format/1703.05423">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> End-to-end optimization of goal-driven and visually grounded dialogue systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Strub%2C+F">Florian Strub</a>, <a href="/search/cs?searchtype=author&amp;query=de+Vries%2C+H">Harm de Vries</a>, <a href="/search/cs?searchtype=author&amp;query=Mary%2C+J">Jeremie Mary</a>, <a href="/search/cs?searchtype=author&amp;query=Piot%2C+B">Bilal Piot</a>, <a href="/search/cs?searchtype=author&amp;query=Courville%2C+A">Aaron Courville</a>, <a href="/search/cs?searchtype=author&amp;query=Pietquin%2C+O">Olivier Pietquin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1703.05423v1-abstract-short" style="display: inline;"> End-to-end design of dialogue systems has recently become a popular research topic thanks to powerful tools such as encoder-decoder architectures for sequence-to-sequence learning. 
Yet, most current approaches cast human-machine dialogue management as a supervised learning problem, aiming at predicting the next utterance of a participant given the full history of the dialogue. This vision is too s&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1703.05423v1-abstract-full').style.display = 'inline'; document.getElementById('1703.05423v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1703.05423v1-abstract-full" style="display: none;"> End-to-end design of dialogue systems has recently become a popular research topic thanks to powerful tools such as encoder-decoder architectures for sequence-to-sequence learning. Yet, most current approaches cast human-machine dialogue management as a supervised learning problem, aiming at predicting the next utterance of a participant given the full history of the dialogue. This vision is too simplistic to render the intrinsic planning problem inherent to dialogue as well as its grounded nature, making the context of a dialogue larger than the sole history. This is why only chit-chat and question answering tasks have been addressed so far using end-to-end architectures. In this paper, we introduce a Deep Reinforcement Learning method to optimize visually grounded task-oriented dialogues, based on the policy gradient algorithm. This approach is tested on a dataset of 120k dialogues collected through Mechanical Turk and provides encouraging results at solving both the problem of generating natural dialogues and the task of discovering a specific object in a complex picture. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1703.05423v1-abstract-full').style.display = 'none'; document.getElementById('1703.05423v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 March, 2017; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2017. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1606.08718">arXiv:1606.08718</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1606.08718">pdf</a>, <a href="https://arxiv.org/ps/1606.08718">ps</a>, <a href="https://arxiv.org/format/1606.08718">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> </div> </div> <p class="title is-5 mathjax"> Learning Nash Equilibrium for General-Sum Markov Games from Batch Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=P%C3%A9rolat%2C+J">Julien Pérolat</a>, <a href="/search/cs?searchtype=author&amp;query=Strub%2C+F">Florian Strub</a>, <a href="/search/cs?searchtype=author&amp;query=Piot%2C+B">Bilal Piot</a>, <a href="/search/cs?searchtype=author&amp;query=Pietquin%2C+O">Olivier Pietquin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1606.08718v4-abstract-short" style="display: inline;"> This paper addresses the problem of learning a Nash equilibrium in $γ$-discounted multiplayer general-sum Markov Games (MG). 
A key component of this model is the possibility for the players to either collaborate or team apart to increase their rewards. Building an artificial player for general-sum MGs implies to learn more complex strategies which are impossible to obtain by using techniques devel&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1606.08718v4-abstract-full').style.display = 'inline'; document.getElementById('1606.08718v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1606.08718v4-abstract-full" style="display: none;"> This paper addresses the problem of learning a Nash equilibrium in $γ$-discounted multiplayer general-sum Markov Games (MG). A key component of this model is the possibility for the players to either collaborate or team apart to increase their rewards. Building an artificial player for general-sum MGs implies to learn more complex strategies which are impossible to obtain by using techniques developed for two-player zero-sum MGs. In this paper, we introduce a new definition of $ε$-Nash equilibrium in MGs which grasps the strategy&#39;s quality for multiplayer games. We prove that minimizing the norm of two Bellman-like residuals implies the convergence to such an $ε$-Nash equilibrium. Then, we show that minimizing an empirical estimate of the $L_p$ norm of these Bellman-like residuals allows learning for general-sum games within the batch setting. Finally, we introduce a neural network architecture named NashNetwork that successfully learns a Nash equilibrium in a generic multiplayer general-sum turn-based MG. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1606.08718v4-abstract-full').style.display = 'none'; document.getElementById('1606.08718v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 March, 2017; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 June, 2016; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2016. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20th International Conference on Artificial Intelligence and Statistics (AISTATS) 2017, Fort Lauderdale, Florida, USA. JMLR: W&amp;CP volume 54</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Report number:</span> CRIStAL, UMR 9189 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1606.07636">arXiv:1606.07636</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1606.07636">pdf</a>, <a href="https://arxiv.org/format/1606.07636">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Is the Bellman residual a bad proxy? 
</p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Geist%2C+M">Matthieu Geist</a>, <a href="/search/cs?searchtype=author&amp;query=Piot%2C+B">Bilal Piot</a>, <a href="/search/cs?searchtype=author&amp;query=Pietquin%2C+O">Olivier Pietquin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1606.07636v3-abstract-short" style="display: inline;"> This paper aims at theoretically and empirically comparing two standard optimization criteria for Reinforcement Learning: i) maximization of the mean value and ii) minimization of the Bellman residual. For that purpose, we place ourselves in the framework of policy search algorithms, that are usually designed to maximize the mean value, and derive a method that minimizes the residual&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1606.07636v3-abstract-full').style.display = 'inline'; document.getElementById('1606.07636v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1606.07636v3-abstract-full" style="display: none;"> This paper aims at theoretically and empirically comparing two standard optimization criteria for Reinforcement Learning: i) maximization of the mean value and ii) minimization of the Bellman residual. For that purpose, we place ourselves in the framework of policy search algorithms, that are usually designed to maximize the mean value, and derive a method that minimizes the residual $\|T_* v_蟺- v_蟺\|_{1,谓}$ over policies. A theoretical analysis shows how good this proxy is to policy optimization, and notably that it is better than its value-based counterpart. We also propose experiments on randomly generated generic Markov decision processes, specifically designed for studying the influence of the involved concentrability coefficient. They show that the Bellman residual is generally a bad proxy to policy optimization and that directly maximizing the mean value is much better, despite the current lack of deep theoretical analysis. This might seem obvious, as directly addressing the problem of interest is usually better, but given the prevalence of (projected) Bellman residual minimization in value-based reinforcement learning, we believe that this question is worth to be considered. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1606.07636v3-abstract-full').style.display = 'none'; document.getElementById('1606.07636v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 December, 2017; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 June, 2016; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2016. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Final NIPS 2017 version (title, among other things, changed)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1606.01128">arXiv:1606.01128</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1606.01128">pdf</a>, <a href="https://arxiv.org/format/1606.01128">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Difference of Convex Functions Programming Applied to Control with Expert Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Piot%2C+B">Bilal Piot</a>, <a href="/search/cs?searchtype=author&amp;query=Geist%2C+M">Matthieu Geist</a>, <a href="/search/cs?searchtype=author&amp;query=Pietquin%2C+O">Olivier Pietquin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1606.01128v2-abstract-short" style="display: inline;"> This paper reports applications of Difference of Convex functions (DC) programming to Learning from Demonstrations (LfD) and Reinforcement Learning (RL) with expert data. This is made possible because the norm of the Optimal Bellman Residual (OBR), which is at the heart of many RL and LfD algorithms, is DC. Improvement in performance is demonstrated on two specific algorithms, namely Reward-regula&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1606.01128v2-abstract-full').style.display = 'inline'; document.getElementById('1606.01128v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1606.01128v2-abstract-full" style="display: none;"> This paper reports applications of Difference of Convex functions (DC) programming to Learning from Demonstrations (LfD) and Reinforcement Learning (RL) with expert data. This is made possible because the norm of the Optimal Bellman Residual (OBR), which is at the heart of many RL and LfD algorithms, is DC. Improvement in performance is demonstrated on two specific algorithms, namely Reward-regularized Classification for Apprenticeship Learning (RCAL) and Reinforcement Learning with Expert Demonstrations (RLED), through experiments on generic Markov Decision Processes (MDP), called Garnets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1606.01128v2-abstract-full').style.display = 'none'; document.getElementById('1606.01128v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 September, 2016; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 June, 2016; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2016. 
