Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 63 results for author: <span class="mathjax">Riedmiller, M</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Riedmiller%2C+M">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Riedmiller, M"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Riedmiller%2C+M&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Riedmiller, M"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Riedmiller%2C+M&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Riedmiller%2C+M&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Riedmiller%2C+M&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.04166">arXiv:2410.04166</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.04166">pdf</a>, <a href="https://arxiv.org/format/2410.04166">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Preference Optimization as Probabilistic Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Abdolmaleki%2C+A">Abbas Abdolmaleki</a>, <a href="/search/cs?searchtype=author&amp;query=Piot%2C+B">Bilal Piot</a>, <a href="/search/cs?searchtype=author&amp;query=Shahriari%2C+B">Bobak Shahriari</a>, <a href="/search/cs?searchtype=author&amp;query=Springenberg%2C+J+T">Jost Tobias Springenberg</a>, <a href="/search/cs?searchtype=author&amp;query=Hertweck%2C+T">Tim Hertweck</a>, <a href="/search/cs?searchtype=author&amp;query=Joshi%2C+R">Rishabh Joshi</a>, <a href="/search/cs?searchtype=author&amp;query=Oh%2C+J">Junhyuk Oh</a>, <a href="/search/cs?searchtype=author&amp;query=Bloesch%2C+M">Michael Bloesch</a>, <a href="/search/cs?searchtype=author&amp;query=Lampe%2C+T">Thomas Lampe</a>, <a href="/search/cs?searchtype=author&amp;query=Heess%2C+N">Nicolas Heess</a>, <a href="/search/cs?searchtype=author&amp;query=Buchli%2C+J">Jonas Buchli</a>, <a href="/search/cs?searchtype=author&amp;query=Riedmiller%2C+M">Martin Riedmiller</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.04166v1-abstract-short" style="display: inline;"> Existing preference optimization methods are mainly designed for directly learning from human feedback with the assumption that paired examples (preferred vs. dis-preferred) are available. 
In contrast, we propose a method that can leverage unpaired preferred or dis-preferred examples, and works even when only one type of feedback (positive or negative) is available. This flexibility allows us to a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04166v1-abstract-full').style.display = 'inline'; document.getElementById('2410.04166v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.04166v1-abstract-full" style="display: none;"> Existing preference optimization methods are mainly designed for directly learning from human feedback with the assumption that paired examples (preferred vs. dis-preferred) are available. In contrast, we propose a method that can leverage unpaired preferred or dis-preferred examples, and works even when only one type of feedback (positive or negative) is available. This flexibility allows us to apply it in scenarios with varying forms of feedback and models, including training generative language models based on human feedback as well as training policies for sequential decision-making problems, where learned (value) functions are available. Our approach builds upon the probabilistic framework introduced in (Dayan and Hinton, 1997), which proposes to use expectation-maximization (EM) to directly optimize the probability of preferred outcomes (as opposed to classic expected reward maximization). To obtain a practical algorithm, we identify and address a key limitation in current EM-based methods: when applied to preference optimization, they solely maximize the likelihood of preferred examples, while neglecting dis-preferred samples. We show how one can extend EM algorithms to explicitly incorporate dis-preferred outcomes, leading to a novel, theoretically grounded, preference optimization algorithm that offers an intuitive and versatile way to learn from both positive and negative feedback. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04166v1-abstract-full').style.display = 'none'; document.getElementById('2410.04166v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
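For readers skimming the entry above, a minimal sketch can make the core idea concrete: an objective that raises the likelihood of preferred samples and lowers it for dis-preferred ones, with either side allowed to be empty (unpaired feedback). This is an illustrative approximation in PyTorch, not the paper's EM-derived algorithm; the per-example log-likelihood tensors and the fixed weighting `beta` are assumptions for the example.

```python
# Hedged sketch of the general idea (not the paper's derivation): combine
# positive and negative feedback even when examples are unpaired.
import torch

def preference_loss(logp_preferred: torch.Tensor,
                    logp_dispreferred: torch.Tensor,
                    beta: float = 1.0) -> torch.Tensor:
    """logp_* hold per-example log-likelihoods under the current model;
    either tensor may be empty when only one feedback type is available."""
    loss = torch.zeros(())
    if logp_preferred.numel() > 0:
        loss = loss - logp_preferred.mean()            # push preferred examples up
    if logp_dispreferred.numel() > 0:
        loss = loss + beta * logp_dispreferred.mean()  # push dis-preferred examples down
    return loss
```

In the paper the relative weighting falls out of the EM treatment of preferred outcomes; the constant `beta` here only stands in for that.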
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.06613">arXiv:2409.06613</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.06613">pdf</a>, <a href="https://arxiv.org/format/2409.06613">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> DemoStart: Demonstration-led auto-curriculum applied to sim-to-real with multi-fingered robots </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bauza%2C+M">Maria Bauza</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+J+E">Jose Enrique Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Dalibard%2C+V">Valentin Dalibard</a>, <a href="/search/cs?searchtype=author&amp;query=Gileadi%2C+N">Nimrod Gileadi</a>, <a href="/search/cs?searchtype=author&amp;query=Hafner%2C+R">Roland Hafner</a>, <a href="/search/cs?searchtype=author&amp;query=Martins%2C+M+F">Murilo F. Martins</a>, <a href="/search/cs?searchtype=author&amp;query=Moore%2C+J">Joss Moore</a>, <a href="/search/cs?searchtype=author&amp;query=Pevceviciute%2C+R">Rugile Pevceviciute</a>, <a href="/search/cs?searchtype=author&amp;query=Laurens%2C+A">Antoine Laurens</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+D">Dushyant Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Zambelli%2C+M">Martina Zambelli</a>, <a href="/search/cs?searchtype=author&amp;query=Riedmiller%2C+M">Martin Riedmiller</a>, <a href="/search/cs?searchtype=author&amp;query=Scholz%2C+J">Jon Scholz</a>, <a href="/search/cs?searchtype=author&amp;query=Bousmalis%2C+K">Konstantinos Bousmalis</a>, <a href="/search/cs?searchtype=author&amp;query=Nori%2C+F">Francesco Nori</a>, <a href="/search/cs?searchtype=author&amp;query=Heess%2C+N">Nicolas Heess</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.06613v2-abstract-short" style="display: inline;"> We present DemoStart, a novel auto-curriculum reinforcement learning method capable of learning complex manipulation behaviors on an arm equipped with a three-fingered robotic hand, from only a sparse reward and a handful of demonstrations in simulation. Learning from simulation drastically reduces the development cycle of behavior generation, and domain randomization techniques are leveraged to a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.06613v2-abstract-full').style.display = 'inline'; document.getElementById('2409.06613v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.06613v2-abstract-full" style="display: none;"> We present DemoStart, a novel auto-curriculum reinforcement learning method capable of learning complex manipulation behaviors on an arm equipped with a three-fingered robotic hand, from only a sparse reward and a handful of demonstrations in simulation. Learning from simulation drastically reduces the development cycle of behavior generation, and domain randomization techniques are leveraged to achieve successful zero-shot sim-to-real transfer. 
Transferred policies are learned directly from raw pixels from multiple cameras and robot proprioception. Our approach outperforms policies learned from demonstrations on the real robot and requires 100 times fewer demonstrations, collected in simulation. More details and videos in https://sites.google.com/view/demostart. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.06613v2-abstract-full').style.display = 'none'; document.getElementById('2409.06613v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages total with 7 pages of appendix. 9 Figures, 4 in the main text and 5 in the appendix</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.03402">arXiv:2409.03402</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.03402">pdf</a>, <a href="https://arxiv.org/format/2409.03402">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Game On: Towards Language Models as RL Experimenters </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jingwei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Lampe%2C+T">Thomas Lampe</a>, <a href="/search/cs?searchtype=author&amp;query=Abdolmaleki%2C+A">Abbas Abdolmaleki</a>, <a href="/search/cs?searchtype=author&amp;query=Springenberg%2C+J+T">Jost Tobias Springenberg</a>, <a href="/search/cs?searchtype=author&amp;query=Riedmiller%2C+M">Martin Riedmiller</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.03402v1-abstract-short" style="display: inline;"> We propose an agent architecture that automates parts of the common reinforcement learning experiment workflow, to enable automated mastery of control domains for embodied agents. To do so, it leverages a VLM to perform some of the capabilities normally required of a human experimenter, including the monitoring and analysis of experiment progress, the proposition of new tasks based on past success&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.03402v1-abstract-full').style.display = 'inline'; document.getElementById('2409.03402v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.03402v1-abstract-full" style="display: none;"> We propose an agent architecture that automates parts of the common reinforcement learning experiment workflow, to enable automated mastery of control domains for embodied agents. 
To do so, it leverages a VLM to perform some of the capabilities normally required of a human experimenter, including the monitoring and analysis of experiment progress, the proposition of new tasks based on past successes and failures of the agent, decomposing tasks into a sequence of subtasks (skills), and retrieval of the skill to execute - enabling our system to build automated curricula for learning. We believe this is one of the first proposals for a system that leverages a VLM throughout the full experiment cycle of reinforcement learning. We provide a first prototype of this system, and examine the feasibility of current models and techniques for the desired level of automation. For this, we use a standard Gemini model, without additional fine-tuning, to provide a curriculum of skills to a language-conditioned Actor-Critic algorithm, in order to steer data collection so as to aid learning new skills. Data collected in this way is shown to be useful for learning and iteratively improving control policies in a robotics domain. Additional examination of the ability of the system to build a growing library of skills, and to judge the progress of the training of those skills, also shows promising results, suggesting that the proposed architecture provides a potential recipe for fully automated mastery of tasks and domains for embodied agents. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.03402v1-abstract-full').style.display = 'none'; document.getElementById('2409.03402v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
4. arXiv:2409.01369 [pdf, other] cs.LG; cs.AI; cs.CL; stat.ML
Imitating Language via Scalable Inverse Reinforcement Learning
Authors: Markus Wulfmeier, Michael Bloesch, Nino Vieillard, Arun Ahuja, Jorg Bornschein, Sandy Huang, Artem Sokolov, Matt Barnes, Guillaume Desjardins, Alex Bewley, Sarah Maria Elisabeth Bechtle, Jost Tobias Springenberg, Nikola Momchev, Olivier Bachem, Matthieu Geist, Martin Riedmiller
Abstract: The majority of language model training builds on imitation learning. It covers pretraining, supervised fine-tuning, and affects the starting conditions for reinforcement learning from human feedback (RLHF). The simplicity and scalability of maximum likelihood estimation (MLE) for next token prediction led to its role as predominant paradigm. However, the broader field of imitation learning can more effectively utilize the sequential structure underlying autoregressive generation. We focus on investigating the inverse reinforcement learning (IRL) perspective to imitation, extracting rewards and directly optimizing sequences instead of individual token likelihoods and evaluate its benefits for fine-tuning large language models. We provide a new angle, reformulating inverse soft-Q-learning as a temporal difference regularized extension of MLE. This creates a principled connection between MLE and IRL and allows trading off added complexity with increased performance and diversity of generations in the supervised fine-tuning (SFT) setting. We find clear advantages for IRL-based imitation, in particular for retaining diversity while maximizing task performance, rendering IRL a strong alternative on fixed SFT datasets even without online data generation. Our analysis of IRL-extracted reward functions further indicates benefits for more robust reward functions via tighter integration of supervised and preference-based LLM post-training.
Submitted 2 September, 2024; originally announced September 2024.

5. arXiv:2405.02425 [pdf, other] cs.RO; cs.AI
Learning Robot Soccer from Egocentric Vision with Deep Reinforcement Learning
Authors: Dhruva Tirumala, Markus Wulfmeier, Ben Moran, Sandy Huang, Jan Humplik, Guy Lever, Tuomas Haarnoja, Leonard Hasenclever, Arunkumar Byravan, Nathan Batchelor, Neil Sreendra, Kushal Patel, Marlon Gwira, Francesco Nori, Martin Riedmiller, Nicolas Heess
Abstract: We apply multi-agent deep reinforcement learning (RL) to train end-to-end robot soccer policies with fully onboard computation and sensing via egocentric RGB vision. This setting reflects many challenges of real-world robotics, including active perception, agile full-body control, and long-horizon planning in a dynamic, partially-observable, multi-agent domain. We rely on large-scale, simulation-based data generation to obtain complex behaviors from egocentric vision which can be successfully transferred to physical robots using low-cost sensors. To achieve adequate visual realism, our simulation combines rigid-body physics with learned, realistic rendering via multiple Neural Radiance Fields (NeRFs). We combine teacher-based multi-agent RL and cross-experiment data reuse to enable the discovery of sophisticated soccer strategies. We analyze active-perception behaviors including object tracking and ball seeking that emerge when simply optimizing perception-agnostic soccer play. The agents display equivalent levels of performance and agility as policies with access to privileged, ground-truth state. To our knowledge, this paper constitutes a first demonstration of end-to-end training for multi-agent robot soccer, mapping raw pixel observations to joint-level actions, that can be deployed in the real world. Videos of the game-play and analyses can be seen on our website https://sites.google.com/view/vision-soccer .
Submitted 3 May, 2024; originally announced May 2024.
6. arXiv:2402.06102 [pdf, other] cs.RO; cs.LG
Real-World Fluid Directed Rigid Body Control via Deep Reinforcement Learning
Authors: Mohak Bhardwaj, Thomas Lampe, Michael Neunert, Francesco Romano, Abbas Abdolmaleki, Arunkumar Byravan, Markus Wulfmeier, Martin Riedmiller, Jonas Buchli
Abstract: Recent advances in real-world applications of reinforcement learning (RL) have relied on the ability to accurately simulate systems at scale. However, domains such as fluid dynamical systems exhibit complex dynamic phenomena that are hard to simulate at high integration rates, limiting the direct application of modern deep RL algorithms to often expensive or safety critical hardware. In this work, we introduce "Box o Flows", a novel benchtop experimental control system for systematically evaluating RL algorithms in dynamic real-world scenarios. We describe the key components of the Box o Flows, and through a series of experiments demonstrate how state-of-the-art model-free RL algorithms can synthesize a variety of complex behaviors via simple reward specifications. Furthermore, we explore the role of offline RL in data-efficient hypothesis testing by reusing past experiences. We believe that the insights gained from this preliminary study and the availability of systems like the Box o Flows support the way forward for developing systematic RL algorithms that can be generally applied to complex, dynamical systems. Supplementary material and videos of experiments are available at https://sites.google.com/view/box-o-flows/home.
Submitted 8 February, 2024; originally announced February 2024.

7. arXiv:2402.05546 [pdf, other] cs.LG; cs.AI; cs.RO
Offline Actor-Critic Reinforcement Learning Scales to Large Models
Authors: Jost Tobias Springenberg, Abbas Abdolmaleki, Jingwei Zhang, Oliver Groth, Michael Bloesch, Thomas Lampe, Philemon Brakel, Sarah Bechtle, Steven Kapturowski, Roland Hafner, Nicolas Heess, Martin Riedmiller
Abstract: We show that offline actor-critic reinforcement learning can scale to large models - such as transformers - and follows similar scaling laws as supervised learning. We find that offline actor-critic algorithms can outperform strong, supervised, behavioral cloning baselines for multi-task training on a large dataset containing both sub-optimal and expert behavior on 132 continuous control tasks. We introduce a Perceiver-based actor-critic model and elucidate the key model features needed to make offline RL work with self- and cross-attention modules. Overall, we find that: i) simple offline actor critic algorithms are a natural choice for gradually moving away from the currently predominant paradigm of behavioral cloning, and ii) via offline RL it is possible to learn multi-task policies that master many domains simultaneously, including real robotics tasks, from sub-optimal demonstrations or self-generated data.
Submitted 8 February, 2024; originally announced February 2024.

8. arXiv:2312.11374 [pdf, other] cs.RO
Mastering Stacking of Diverse Shapes with Large-Scale Iterative Reinforcement Learning on Real Robots
Authors: Thomas Lampe, Abbas Abdolmaleki, Sarah Bechtle, Sandy H. Huang, Jost Tobias Springenberg, Michael Bloesch, Oliver Groth, Roland Hafner, Tim Hertweck, Michael Neunert, Markus Wulfmeier, Jingwei Zhang, Francesco Nori, Nicolas Heess, Martin Riedmiller
Abstract: Reinforcement learning solely from an agent's self-generated data is often believed to be infeasible for learning on real robots, due to the amount of data needed. However, if done right, agents learning from real data can be surprisingly efficient through re-using previously collected sub-optimal data. In this paper we demonstrate how the increased understanding of off-policy learning methods and their embedding in an iterative online/offline scheme ("collect and infer") can drastically improve data-efficiency by using all the collected experience, which empowers learning from real robot experience only. Moreover, the resulting policy improves significantly over the state of the art on a recently proposed real robot manipulation benchmark. Our approach learns end-to-end, directly from pixels, and does not rely on additional human domain knowledge such as a simulator or demonstrations.
Submitted 18 December, 2023; originally announced December 2023.
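The "collect and infer" scheme referenced in the entry above alternates online data collection with offline retraining on everything gathered so far. The loop below is a hedged sketch under assumed placeholder interfaces (`collect_episodes`, `offline_update`), not the authors' implementation.

```python
# Hedged sketch of an iterative online/offline ("collect and infer") loop:
# nothing is discarded; every update sees all experience collected so far.
def collect_and_infer(policy, collect_episodes, offline_update, iterations=10):
    dataset = []                                   # grows across iterations
    for _ in range(iterations):
        dataset += collect_episodes(policy)        # online phase: run the current policy
        policy = offline_update(policy, dataset)   # offline phase: off-policy update on all data
    return policy
```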
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.09120">arXiv:2312.09120</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2312.09120">pdf</a>, <a href="https://arxiv.org/format/2312.09120">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Less is more -- the Dispatcher/ Executor principle for multi-task Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Riedmiller%2C+M">Martin Riedmiller</a>, <a href="/search/cs?searchtype=author&amp;query=Hertweck%2C+T">Tim Hertweck</a>, <a href="/search/cs?searchtype=author&amp;query=Hafner%2C+R">Roland Hafner</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.09120v1-abstract-short" style="display: inline;"> Humans instinctively know how to neglect details when it comes to solve complex decision making problems in environments with unforeseeable variations. This abstraction process seems to be a vital property for most biological systems and helps to &#39;abstract away&#39; unnecessary details and boost generalisation. In this work we introduce the dispatcher/ executor principle for the design of multi-task R&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.09120v1-abstract-full').style.display = 'inline'; document.getElementById('2312.09120v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.09120v1-abstract-full" style="display: none;"> Humans instinctively know how to neglect details when it comes to solve complex decision making problems in environments with unforeseeable variations. This abstraction process seems to be a vital property for most biological systems and helps to &#39;abstract away&#39; unnecessary details and boost generalisation. In this work we introduce the dispatcher/ executor principle for the design of multi-task Reinforcement Learning controllers. It suggests to partition the controller in two entities, one that understands the task (the dispatcher) and one that computes the controls for the specific device (the executor) - and to connect these two by a strongly regularizing communication channel. The core rationale behind this position paper is that changes in structure and design principles can improve generalisation properties and drastically enforce data-efficiency. It is in some sense a &#39;yes, and ...&#39; response to the current trend of using large neural networks trained on vast amounts of data and bet on emerging generalisation properties. While we agree on the power of scaling - in the sense of Sutton&#39;s &#39;bitter lesson&#39; - we will give some evidence, that considering structure and adding design principles can be a valuable and critical component in particular when data is not abundant and infinite, but is a precious resource. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.09120v1-abstract-full').style.display = 'none'; document.getElementById('2312.09120v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 9 figures. Videos showing the results can be found at https://sites.google.com/view/dispatcher-executor</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.15951">arXiv:2311.15951</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2311.15951">pdf</a>, <a href="https://arxiv.org/format/2311.15951">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Replay across Experiments: A Natural Extension of Off-Policy RL </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tirumala%2C+D">Dhruva Tirumala</a>, <a href="/search/cs?searchtype=author&amp;query=Lampe%2C+T">Thomas Lampe</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+J+E">Jose Enrique Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Haarnoja%2C+T">Tuomas Haarnoja</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+S">Sandy Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Lever%2C+G">Guy Lever</a>, <a href="/search/cs?searchtype=author&amp;query=Moran%2C+B">Ben Moran</a>, <a href="/search/cs?searchtype=author&amp;query=Hertweck%2C+T">Tim Hertweck</a>, <a href="/search/cs?searchtype=author&amp;query=Hasenclever%2C+L">Leonard Hasenclever</a>, <a href="/search/cs?searchtype=author&amp;query=Riedmiller%2C+M">Martin Riedmiller</a>, <a href="/search/cs?searchtype=author&amp;query=Heess%2C+N">Nicolas Heess</a>, <a href="/search/cs?searchtype=author&amp;query=Wulfmeier%2C+M">Markus Wulfmeier</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.15951v2-abstract-short" style="display: inline;"> Replaying data is a principal mechanism underlying the stability and data efficiency of off-policy reinforcement learning (RL). We present an effective yet simple framework to extend the use of replays across multiple experiments, minimally adapting the RL workflow for sizeable improvements in controller performance and research iteration times. 
At its core, Replay Across Experiments (RaE) involve&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.15951v2-abstract-full').style.display = 'inline'; document.getElementById('2311.15951v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.15951v2-abstract-full" style="display: none;"> Replaying data is a principal mechanism underlying the stability and data efficiency of off-policy reinforcement learning (RL). We present an effective yet simple framework to extend the use of replays across multiple experiments, minimally adapting the RL workflow for sizeable improvements in controller performance and research iteration times. At its core, Replay Across Experiments (RaE) involves reusing experience from previous experiments to improve exploration and bootstrap learning while reducing required changes to a minimum in comparison to prior work. We empirically show benefits across a number of RL algorithms and challenging control domains spanning both locomotion and manipulation, including hard exploration tasks from egocentric vision. Through comprehensive ablations, we demonstrate robustness to the quality and amount of data available and various hyperparameter choices. Finally, we discuss how our approach can be applied more broadly across research life cycles and can increase resilience by reloading data across random seeds or hyperparameter variations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.15951v2-abstract-full').style.display = 'none'; document.getElementById('2311.15951v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.07578">arXiv:2309.07578</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2309.07578">pdf</a>, <a href="https://arxiv.org/format/2309.07578">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Equivariant Data Augmentation for Generalization in Offline Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Pinneri%2C+C">Cristina Pinneri</a>, <a href="/search/cs?searchtype=author&amp;query=Bechtle%2C+S">Sarah Bechtle</a>, <a href="/search/cs?searchtype=author&amp;query=Wulfmeier%2C+M">Markus Wulfmeier</a>, <a href="/search/cs?searchtype=author&amp;query=Byravan%2C+A">Arunkumar Byravan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jingwei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Whitney%2C+W+F">William F. 
Whitney</a>, <a href="/search/cs?searchtype=author&amp;query=Riedmiller%2C+M">Martin Riedmiller</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.07578v1-abstract-short" style="display: inline;"> We present a novel approach to address the challenge of generalization in offline reinforcement learning (RL), where the agent learns from a fixed dataset without any additional interaction with the environment. Specifically, we aim to improve the agent&#39;s ability to generalize to out-of-distribution goals. To achieve this, we propose to learn a dynamics model and check if it is equivariant with re&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.07578v1-abstract-full').style.display = 'inline'; document.getElementById('2309.07578v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.07578v1-abstract-full" style="display: none;"> We present a novel approach to address the challenge of generalization in offline reinforcement learning (RL), where the agent learns from a fixed dataset without any additional interaction with the environment. Specifically, we aim to improve the agent&#39;s ability to generalize to out-of-distribution goals. To achieve this, we propose to learn a dynamics model and check if it is equivariant with respect to a fixed type of transformation, namely translations in the state space. We then use an entropy regularizer to increase the equivariant set and augment the dataset with the resulting transformed samples. Finally, we learn a new policy offline based on the augmented dataset, with an off-the-shelf offline RL algorithm. Our experimental results demonstrate that our approach can greatly improve the test performance of the policy on the considered environments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.07578v1-abstract-full').style.display = 'none'; document.getElementById('2309.07578v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. 
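<p class="is-size-7">The augmentation step described in this abstract can be sketched in a few lines of Python; the toy dynamics model, the helper names, and the tolerance test (standing in for the paper's entropy-regularised equivariant set) are illustrative assumptions, not the authors' implementation.</p>
<pre><code>
# Illustrative sketch only: translation-equivariance check plus dataset
# augmentation for offline RL, in the spirit of the abstract above.
import numpy as np

def dynamics_model(state, action):
    # Stand-in for a learned model predicting the next state.
    return state + 0.1 * action

def equivariance_error(model, state, action, delta):
    # For a translation delta, compare f(s + delta, a) with f(s, a) + delta.
    return float(np.linalg.norm(model(state + delta, action) - (model(state, action) + delta)))

def augment(dataset, model, deltas, tol=1e-3):
    # Keep only translations under which the model behaves equivariantly,
    # and add the translated transitions to the offline dataset.
    augmented = list(dataset)
    for s, a, r, s_next in dataset:
        for delta in deltas:
            if equivariance_error(model, s, a, delta) <= tol:
                augmented.append((s + delta, a, r, s_next + delta))
    return augmented

# Tiny usage example on random transitions.
rng = np.random.default_rng(0)
data = [(rng.normal(size=2), rng.normal(size=2), 0.0, rng.normal(size=2)) for _ in range(4)]
shifts = [np.array([1.0, 0.0]), np.array([0.0, -1.0])]
print(len(augment(data, dynamics_model, shifts)))
</code></pre>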
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.15470">arXiv:2308.15470</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2308.15470">pdf</a>, <a href="https://arxiv.org/format/2308.15470">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Policy composition in reinforcement learning via multi-objective policy optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Mishra%2C+S">Shruti Mishra</a>, <a href="/search/cs?searchtype=author&amp;query=Anand%2C+A">Ankit Anand</a>, <a href="/search/cs?searchtype=author&amp;query=Hoffmann%2C+J">Jordan Hoffmann</a>, <a href="/search/cs?searchtype=author&amp;query=Heess%2C+N">Nicolas Heess</a>, <a href="/search/cs?searchtype=author&amp;query=Riedmiller%2C+M">Martin Riedmiller</a>, <a href="/search/cs?searchtype=author&amp;query=Abdolmaleki%2C+A">Abbas Abdolmaleki</a>, <a href="/search/cs?searchtype=author&amp;query=Precup%2C+D">Doina Precup</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.15470v2-abstract-short" style="display: inline;"> We enable reinforcement learning agents to learn successful behavior policies by utilizing relevant pre-existing teacher policies. The teacher policies are introduced as objectives, in addition to the task objective, in a multi-objective policy optimization setting. Using the Multi-Objective Maximum a Posteriori Policy Optimization algorithm (Abdolmaleki et al. 2020), we show that teacher policies&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.15470v2-abstract-full').style.display = 'inline'; document.getElementById('2308.15470v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.15470v2-abstract-full" style="display: none;"> We enable reinforcement learning agents to learn successful behavior policies by utilizing relevant pre-existing teacher policies. The teacher policies are introduced as objectives, in addition to the task objective, in a multi-objective policy optimization setting. Using the Multi-Objective Maximum a Posteriori Policy Optimization algorithm (Abdolmaleki et al. 2020), we show that teacher policies can help speed up learning, particularly in the absence of shaping rewards. In two domains with continuous observation and action spaces, our agents successfully compose teacher policies in sequence and in parallel, and are also able to further extend the policies of the teachers in order to solve the task. Depending on the specified combination of task and teacher(s), teacher(s) may naturally act to limit the final performance of an agent. The extent to which agents are required to adhere to teacher policies are determined by hyperparameters which determine both the effect of teachers on learning speed and the eventual performance of the agent on the task. In the humanoid domain (Tassa et al. 2018), we also equip agents with the ability to control the selection of teachers. 
With this ability, agents are able to meaningfully compose from the teacher policies to achieve a superior task reward on the walk task than in cases without access to the teacher policies. We show the resemblance of composed task policies with the corresponding teacher policies through videos. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.15470v2-abstract-full').style.display = 'none'; document.getElementById('2308.15470v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.07741">arXiv:2308.07741</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2308.07741">pdf</a>, <a href="https://arxiv.org/format/2308.07741">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Real Robot Challenge 2022: Learning Dexterous Manipulation from Offline Data in the Real World </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=G%C3%BCrtler%2C+N">Nico Gürtler</a>, <a href="/search/cs?searchtype=author&amp;query=Widmaier%2C+F">Felix Widmaier</a>, <a href="/search/cs?searchtype=author&amp;query=Sancaktar%2C+C">Cansu Sancaktar</a>, <a href="/search/cs?searchtype=author&amp;query=Blaes%2C+S">Sebastian Blaes</a>, <a href="/search/cs?searchtype=author&amp;query=Kolev%2C+P">Pavel Kolev</a>, <a href="/search/cs?searchtype=author&amp;query=Bauer%2C+S">Stefan Bauer</a>, <a href="/search/cs?searchtype=author&amp;query=W%C3%BCthrich%2C+M">Manuel Wüthrich</a>, <a href="/search/cs?searchtype=author&amp;query=Wulfmeier%2C+M">Markus Wulfmeier</a>, <a href="/search/cs?searchtype=author&amp;query=Riedmiller%2C+M">Martin Riedmiller</a>, <a href="/search/cs?searchtype=author&amp;query=Allshire%2C+A">Arthur Allshire</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Q">Qiang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=McCarthy%2C+R">Robert McCarthy</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+H">Hangyeol Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Baek%2C+J">Jongchan Baek</a>, <a href="/search/cs?searchtype=author&amp;query=Kwon%2C+W">Wookyong Kwon</a>, <a href="/search/cs?searchtype=author&amp;query=Qian%2C+S">Shanliang Qian</a>, <a href="/search/cs?searchtype=author&amp;query=Toshimitsu%2C+Y">Yasunori Toshimitsu</a>, <a href="/search/cs?searchtype=author&amp;query=Michelis%2C+M+Y">Mike Yan Michelis</a>, <a href="/search/cs?searchtype=author&amp;query=Kazemipour%2C+A">Amirhossein Kazemipour</a>, <a href="/search/cs?searchtype=author&amp;query=Raayatsanati%2C+A">Arman Raayatsanati</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+H">Hehui Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Cangan%2C+B+G">Barnabas Gavin Cangan</a>, <a href="/search/cs?searchtype=author&amp;query=Sch%C3%B6lkopf%2C+B">Bernhard Schölkopf</a>, <a
href="/search/cs?searchtype=author&amp;query=Martius%2C+G">Georg Martius</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.07741v3-abstract-short" style="display: inline;"> Experimentation on real robots is demanding in terms of time and costs. For this reason, a large part of the reinforcement learning (RL) community uses simulators to develop and benchmark algorithms. However, insights gained in simulation do not necessarily translate to real robots, in particular for tasks involving complex interactions with the environment. The Real Robot Challenge 2022 therefore&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.07741v3-abstract-full').style.display = 'inline'; document.getElementById('2308.07741v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.07741v3-abstract-full" style="display: none;"> Experimentation on real robots is demanding in terms of time and costs. For this reason, a large part of the reinforcement learning (RL) community uses simulators to develop and benchmark algorithms. However, insights gained in simulation do not necessarily translate to real robots, in particular for tasks involving complex interactions with the environment. The Real Robot Challenge 2022 therefore served as a bridge between the RL and robotics communities by allowing participants to experiment remotely with a real robot - as easily as in simulation. In the last years, offline reinforcement learning has matured into a promising paradigm for learning from pre-collected datasets, alleviating the reliance on expensive online interactions. We therefore asked the participants to learn two dexterous manipulation tasks involving pushing, grasping, and in-hand orientation from provided real-robot datasets. An extensive software documentation and an initial stage based on a simulation of the real set-up made the competition particularly accessible. By giving each team plenty of access budget to evaluate their offline-learned policies on a cluster of seven identical real TriFinger platforms, we organized an exciting competition for machine learners and roboticists alike. In this work we state the rules of the competition, present the methods used by the winning teams and compare their results with a benchmark of state-of-the-art offline RL algorithms on the challenge datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.07741v3-abstract-full').style.display = 'none'; document.getElementById('2308.07741v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Typo in author list fixed</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.11546">arXiv:2307.11546</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2307.11546">pdf</a>, <a href="https://arxiv.org/format/2307.11546">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Plasma Physics">physics.plasm-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Towards practical reinforcement learning for tokamak magnetic control </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tracey%2C+B+D">Brendan D. Tracey</a>, <a href="/search/cs?searchtype=author&amp;query=Michi%2C+A">Andrea Michi</a>, <a href="/search/cs?searchtype=author&amp;query=Chervonyi%2C+Y">Yuri Chervonyi</a>, <a href="/search/cs?searchtype=author&amp;query=Davies%2C+I">Ian Davies</a>, <a href="/search/cs?searchtype=author&amp;query=Paduraru%2C+C">Cosmin Paduraru</a>, <a href="/search/cs?searchtype=author&amp;query=Lazic%2C+N">Nevena Lazic</a>, <a href="/search/cs?searchtype=author&amp;query=Felici%2C+F">Federico Felici</a>, <a href="/search/cs?searchtype=author&amp;query=Ewalds%2C+T">Timo Ewalds</a>, <a href="/search/cs?searchtype=author&amp;query=Donner%2C+C">Craig Donner</a>, <a href="/search/cs?searchtype=author&amp;query=Galperti%2C+C">Cristian Galperti</a>, <a href="/search/cs?searchtype=author&amp;query=Buchli%2C+J">Jonas Buchli</a>, <a href="/search/cs?searchtype=author&amp;query=Neunert%2C+M">Michael Neunert</a>, <a href="/search/cs?searchtype=author&amp;query=Huber%2C+A">Andrea Huber</a>, <a href="/search/cs?searchtype=author&amp;query=Evens%2C+J">Jonathan Evens</a>, <a href="/search/cs?searchtype=author&amp;query=Kurylowicz%2C+P">Paula Kurylowicz</a>, <a href="/search/cs?searchtype=author&amp;query=Mankowitz%2C+D+J">Daniel J. Mankowitz</a>, <a href="/search/cs?searchtype=author&amp;query=Riedmiller%2C+M">Martin Riedmiller</a>, <a href="/search/cs?searchtype=author&amp;query=Team%2C+T+T">The TCV Team</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.11546v2-abstract-short" style="display: inline;"> Reinforcement learning (RL) has shown promising results for real-time control systems, including the domain of plasma magnetic control. However, there are still significant drawbacks compared to traditional feedback control approaches for magnetic confinement. In this work, we address key drawbacks of the RL method; achieving higher control accuracy for desired plasma properties, reducing the stea&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.11546v2-abstract-full').style.display = 'inline'; document.getElementById('2307.11546v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.11546v2-abstract-full" style="display: none;"> Reinforcement learning (RL) has shown promising results for real-time control systems, including the domain of plasma magnetic control. 
However, there are still significant drawbacks compared to traditional feedback control approaches for magnetic confinement. In this work, we address key drawbacks of the RL method; achieving higher control accuracy for desired plasma properties, reducing the steady-state error, and decreasing the required time to learn new tasks. We build on top of \cite{degrave2022magnetic}, and present algorithmic improvements to the agent architecture and training procedure. We present simulation results that show up to 65\% improvement in shape accuracy, achieve substantial reduction in the long-term bias of the plasma current, and additionally reduce the training time required to learn new tasks by a factor of 3 or more. We present new experiments using the upgraded RL-based controllers on the TCV tokamak, which validate the simulation results achieved, and point the way towards routinely achieving accurate discharges using the RL approach. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.11546v2-abstract-full').style.display = 'none'; document.getElementById('2307.11546v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.09668">arXiv:2307.09668</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2307.09668">pdf</a>, <a href="https://arxiv.org/format/2307.09668">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Towards A Unified Agent with Foundation Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Di+Palo%2C+N">Norman Di Palo</a>, <a href="/search/cs?searchtype=author&amp;query=Byravan%2C+A">Arunkumar Byravan</a>, <a href="/search/cs?searchtype=author&amp;query=Hasenclever%2C+L">Leonard Hasenclever</a>, <a href="/search/cs?searchtype=author&amp;query=Wulfmeier%2C+M">Markus Wulfmeier</a>, <a href="/search/cs?searchtype=author&amp;query=Heess%2C+N">Nicolas Heess</a>, <a href="/search/cs?searchtype=author&amp;query=Riedmiller%2C+M">Martin Riedmiller</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.09668v1-abstract-short" style="display: inline;"> Language Models and Vision Language Models have recently demonstrated unprecedented capabilities in terms of understanding human intentions, reasoning, scene understanding, and planning-like behaviour, in text form, among many others. In this work, we investigate how to embed and leverage such abilities in Reinforcement Learning (RL) agents. 
We design a framework that uses language as the core rea&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.09668v1-abstract-full').style.display = 'inline'; document.getElementById('2307.09668v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.09668v1-abstract-full" style="display: none;"> Language Models and Vision Language Models have recently demonstrated unprecedented capabilities in terms of understanding human intentions, reasoning, scene understanding, and planning-like behaviour, in text form, among many others. In this work, we investigate how to embed and leverage such abilities in Reinforcement Learning (RL) agents. We design a framework that uses language as the core reasoning tool, exploring how this enables an agent to tackle a series of fundamental RL challenges, such as efficient exploration, reusing experience data, scheduling skills, and learning from observations, which traditionally require separate, vertically designed algorithms. We test our method on a sparse-reward simulated robotic manipulation environment, where a robot needs to stack a set of objects. We demonstrate substantial performance improvements over baselines in exploration efficiency and ability to reuse data from offline datasets, and illustrate how to reuse learned skills to solve novel tasks or imitate videos of human experts. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.09668v1-abstract-full').style.display = 'none'; document.getElementById('2307.09668v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.11706">arXiv:2306.11706</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2306.11706">pdf</a>, <a href="https://arxiv.org/format/2306.11706">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> RoboCat: A Self-Improving Generalist Agent for Robotic Manipulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bousmalis%2C+K">Konstantinos Bousmalis</a>, <a href="/search/cs?searchtype=author&amp;query=Vezzani%2C+G">Giulia Vezzani</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+D">Dushyant Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Devin%2C+C">Coline Devin</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+A+X">Alex X. 
Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Bauza%2C+M">Maria Bauza</a>, <a href="/search/cs?searchtype=author&amp;query=Davchev%2C+T">Todor Davchev</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Y">Yuxiang Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Gupta%2C+A">Agrim Gupta</a>, <a href="/search/cs?searchtype=author&amp;query=Raju%2C+A">Akhil Raju</a>, <a href="/search/cs?searchtype=author&amp;query=Laurens%2C+A">Antoine Laurens</a>, <a href="/search/cs?searchtype=author&amp;query=Fantacci%2C+C">Claudio Fantacci</a>, <a href="/search/cs?searchtype=author&amp;query=Dalibard%2C+V">Valentin Dalibard</a>, <a href="/search/cs?searchtype=author&amp;query=Zambelli%2C+M">Martina Zambelli</a>, <a href="/search/cs?searchtype=author&amp;query=Martins%2C+M">Murilo Martins</a>, <a href="/search/cs?searchtype=author&amp;query=Pevceviciute%2C+R">Rugile Pevceviciute</a>, <a href="/search/cs?searchtype=author&amp;query=Blokzijl%2C+M">Michiel Blokzijl</a>, <a href="/search/cs?searchtype=author&amp;query=Denil%2C+M">Misha Denil</a>, <a href="/search/cs?searchtype=author&amp;query=Batchelor%2C+N">Nathan Batchelor</a>, <a href="/search/cs?searchtype=author&amp;query=Lampe%2C+T">Thomas Lampe</a>, <a href="/search/cs?searchtype=author&amp;query=Parisotto%2C+E">Emilio Parisotto</a>, <a href="/search/cs?searchtype=author&amp;query=%C5%BBo%C5%82na%2C+K">Konrad Żołna</a>, <a href="/search/cs?searchtype=author&amp;query=Reed%2C+S">Scott Reed</a>, <a href="/search/cs?searchtype=author&amp;query=Colmenarejo%2C+S+G">Sergio Gómez Colmenarejo</a>, <a href="/search/cs?searchtype=author&amp;query=Scholz%2C+J">Jon Scholz</a> , et al. (14 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.11706v2-abstract-short" style="display: inline;"> The ability to leverage heterogeneous robotic experience from different robots and tasks to quickly master novel skills and embodiments has the potential to transform robot learning. Inspired by recent advances in foundation models for vision and language, we propose a multi-embodiment, multi-task generalist agent for robotic manipulation. This agent, named RoboCat, is a visual goal-conditioned de&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.11706v2-abstract-full').style.display = 'inline'; document.getElementById('2306.11706v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.11706v2-abstract-full" style="display: none;"> The ability to leverage heterogeneous robotic experience from different robots and tasks to quickly master novel skills and embodiments has the potential to transform robot learning. Inspired by recent advances in foundation models for vision and language, we propose a multi-embodiment, multi-task generalist agent for robotic manipulation. This agent, named RoboCat, is a visual goal-conditioned decision transformer capable of consuming action-labelled visual experience. This data spans a large repertoire of motor control skills from simulated and real robotic arms with varying sets of observations and actions. With RoboCat, we demonstrate the ability to generalise to new tasks and robots, both zero-shot as well as through adaptation using only 100-1000 examples for the target task.
We also show how a trained model itself can be used to generate data for subsequent training iterations, thus providing a basic building block for an autonomous improvement loop. We investigate the agent&#39;s capabilities, with large-scale evaluations both in simulation and on three different real robot embodiments. We find that as we grow and diversify its training data, RoboCat not only shows signs of cross-task transfer, but also becomes more efficient at adapting to new tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.11706v2-abstract-full').style.display = 'none'; document.getElementById('2306.11706v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Transactions on Machine Learning Research (12/2023)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.10912">arXiv:2305.10912</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.10912">pdf</a>, <a href="https://arxiv.org/format/2305.10912">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> A Generalist Dynamics Model for Control </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Schubert%2C+I">Ingmar Schubert</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jingwei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Bruce%2C+J">Jake Bruce</a>, <a href="/search/cs?searchtype=author&amp;query=Bechtle%2C+S">Sarah Bechtle</a>, <a href="/search/cs?searchtype=author&amp;query=Parisotto%2C+E">Emilio Parisotto</a>, <a href="/search/cs?searchtype=author&amp;query=Riedmiller%2C+M">Martin Riedmiller</a>, <a href="/search/cs?searchtype=author&amp;query=Springenberg%2C+J+T">Jost Tobias Springenberg</a>, <a href="/search/cs?searchtype=author&amp;query=Byravan%2C+A">Arunkumar Byravan</a>, <a href="/search/cs?searchtype=author&amp;query=Hasenclever%2C+L">Leonard Hasenclever</a>, <a href="/search/cs?searchtype=author&amp;query=Heess%2C+N">Nicolas Heess</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.10912v2-abstract-short" style="display: inline;"> We investigate the use of transformer sequence models as dynamics models (TDMs) for control. 
We find that TDMs exhibit strong generalization capabilities to unseen environments, both in a few-shot setting, where a generalist TDM is fine-tuned with small amounts of data from the target environment, and in a zero-shot setting, where a generalist TDM is applied to an unseen environment without any fu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.10912v2-abstract-full').style.display = 'inline'; document.getElementById('2305.10912v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.10912v2-abstract-full" style="display: none;"> We investigate the use of transformer sequence models as dynamics models (TDMs) for control. We find that TDMs exhibit strong generalization capabilities to unseen environments, both in a few-shot setting, where a generalist TDM is fine-tuned with small amounts of data from the target environment, and in a zero-shot setting, where a generalist TDM is applied to an unseen environment without any further training. Here, we demonstrate that generalizing system dynamics can work much better than generalizing optimal behavior directly as a policy. Additional results show that TDMs also perform well in a single-environment learning setting when compared to a number of baseline models. These properties make TDMs a promising ingredient for a foundation model of control. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.10912v2-abstract-full').style.display = 'none'; document.getElementById('2305.10912v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. 
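<p class="is-size-7">As a rough illustration of a transformer dynamics model, the sketch below maps a history of (state, action) pairs to next-state predictions with a small causal transformer; all layer sizes and names are assumptions, not the architecture used in the paper.</p>
<pre><code>
# Illustrative sketch (not the paper's code): a tiny transformer dynamics model.
import torch
import torch.nn as nn

class TinyTDM(nn.Module):
    def __init__(self, state_dim, action_dim, d_model=64, nhead=4, num_layers=2):
        super().__init__()
        self.embed = nn.Linear(state_dim + action_dim, d_model)
        layer = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward=128, batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, num_layers)
        self.head = nn.Linear(d_model, state_dim)

    def forward(self, states, actions):
        # states: (B, T, state_dim), actions: (B, T, action_dim)
        tokens = self.embed(torch.cat([states, actions], dim=-1))
        mask = nn.Transformer.generate_square_subsequent_mask(tokens.size(1))
        hidden = self.encoder(tokens, mask=mask)   # causal: step t sees steps 0..t
        return self.head(hidden)                   # predicted next state at every step

# Toy training step on random data.
model = TinyTDM(state_dim=3, action_dim=2)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
s, a, s_next = torch.randn(8, 10, 3), torch.randn(8, 10, 2), torch.randn(8, 10, 3)
loss = nn.functional.mse_loss(model(s, a), s_next)
opt.zero_grad()
loss.backward()
opt.step()
print(float(loss))
</code></pre>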
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2302.12617">arXiv:2302.12617</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2302.12617">pdf</a>, <a href="https://arxiv.org/format/2302.12617">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Leveraging Jumpy Models for Planning and Fast Learning in Robotic Domains </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jingwei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Springenberg%2C+J+T">Jost Tobias Springenberg</a>, <a href="/search/cs?searchtype=author&amp;query=Byravan%2C+A">Arunkumar Byravan</a>, <a href="/search/cs?searchtype=author&amp;query=Hasenclever%2C+L">Leonard Hasenclever</a>, <a href="/search/cs?searchtype=author&amp;query=Abdolmaleki%2C+A">Abbas Abdolmaleki</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+D">Dushyant Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Heess%2C+N">Nicolas Heess</a>, <a href="/search/cs?searchtype=author&amp;query=Riedmiller%2C+M">Martin Riedmiller</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2302.12617v1-abstract-short" style="display: inline;"> In this paper we study the problem of learning multi-step dynamics prediction models (jumpy models) from unlabeled experience and their utility for fast inference of (high-level) plans in downstream tasks. In particular we propose to learn a jumpy model alongside a skill embedding space offline, from previously collected experience for which no labels or reward annotations are required. We then in&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.12617v1-abstract-full').style.display = 'inline'; document.getElementById('2302.12617v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2302.12617v1-abstract-full" style="display: none;"> In this paper we study the problem of learning multi-step dynamics prediction models (jumpy models) from unlabeled experience and their utility for fast inference of (high-level) plans in downstream tasks. In particular we propose to learn a jumpy model alongside a skill embedding space offline, from previously collected experience for which no labels or reward annotations are required. We then investigate several options of harnessing those learned components in combination with model-based planning or model-free reinforcement learning (RL) to speed up learning on downstream tasks. We conduct a set of experiments in the RGB-stacking environment, showing that planning with the learned skills and the associated model can enable zero-shot generalization to new tasks, and can further speed up training of policies via reinforcement learning. These experiments demonstrate that jumpy models which incorporate temporal abstraction can facilitate planning in long-horizon tasks in which standard dynamics models fail. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.12617v1-abstract-full').style.display = 'none'; document.getElementById('2302.12617v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.13743">arXiv:2211.13743</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2211.13743">pdf</a>, <a href="https://arxiv.org/format/2211.13743">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> SkillS: Adaptive Skill Sequencing for Efficient Temporally-Extended Exploration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Vezzani%2C+G">Giulia Vezzani</a>, <a href="/search/cs?searchtype=author&amp;query=Tirumala%2C+D">Dhruva Tirumala</a>, <a href="/search/cs?searchtype=author&amp;query=Wulfmeier%2C+M">Markus Wulfmeier</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+D">Dushyant Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Abdolmaleki%2C+A">Abbas Abdolmaleki</a>, <a href="/search/cs?searchtype=author&amp;query=Moran%2C+B">Ben Moran</a>, <a href="/search/cs?searchtype=author&amp;query=Haarnoja%2C+T">Tuomas Haarnoja</a>, <a href="/search/cs?searchtype=author&amp;query=Humplik%2C+J">Jan Humplik</a>, <a href="/search/cs?searchtype=author&amp;query=Hafner%2C+R">Roland Hafner</a>, <a href="/search/cs?searchtype=author&amp;query=Neunert%2C+M">Michael Neunert</a>, <a href="/search/cs?searchtype=author&amp;query=Fantacci%2C+C">Claudio Fantacci</a>, <a href="/search/cs?searchtype=author&amp;query=Hertweck%2C+T">Tim Hertweck</a>, <a href="/search/cs?searchtype=author&amp;query=Lampe%2C+T">Thomas Lampe</a>, <a href="/search/cs?searchtype=author&amp;query=Sadeghi%2C+F">Fereshteh Sadeghi</a>, <a href="/search/cs?searchtype=author&amp;query=Heess%2C+N">Nicolas Heess</a>, <a href="/search/cs?searchtype=author&amp;query=Riedmiller%2C+M">Martin Riedmiller</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.13743v3-abstract-short" style="display: inline;"> The ability to effectively reuse prior knowledge is a key requirement when building general and flexible Reinforcement Learning (RL) agents. Skill reuse is one of the most common approaches, but current methods have considerable limitations.For example, fine-tuning an existing policy frequently fails, as the policy can degrade rapidly early in training. 
In a similar vein, distillation of expert be&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.13743v3-abstract-full').style.display = 'inline'; document.getElementById('2211.13743v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.13743v3-abstract-full" style="display: none;"> The ability to effectively reuse prior knowledge is a key requirement when building general and flexible Reinforcement Learning (RL) agents. Skill reuse is one of the most common approaches, but current methods have considerable limitations. For example, fine-tuning an existing policy frequently fails, as the policy can degrade rapidly early in training. In a similar vein, distillation of expert behavior can lead to poor results when given sub-optimal experts. We compare several common approaches for skill transfer on multiple domains including changes in task and system dynamics. We identify how existing methods can fail and introduce an alternative approach to mitigate these problems. Our approach learns to sequence existing temporally-extended skills for exploration but learns the final policy directly from the raw experience. This conceptual split enables rapid adaptation and thus efficient data collection but without constraining the final solution. It significantly outperforms many classical methods across a suite of evaluation tasks and we use a broad set of ablations to highlight the importance of different components of our method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.13743v3-abstract-full').style.display = 'none'; document.getElementById('2211.13743v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 January, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022.
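<p class="is-size-7">The split described in this abstract, sequencing frozen skills to collect data while the final policy learns from the raw transitions, can be caricatured as follows; the toy environment, the random high-level scheduler, and the constant-action skills are purely illustrative assumptions.</p>
<pre><code>
# Illustrative sketch of "collect with sequenced skills, learn from raw experience".
import random

def make_skills():
    # Frozen, pre-trained skills are stood in for by constant-action policies.
    return [lambda s: -1.0, lambda s: 0.0, lambda s: 1.0]

def env_step(state, action):
    next_state = state + 0.1 * action
    reward = -abs(next_state)              # drive the state towards zero
    return next_state, reward

def collect_with_skills(skills, episodes=5, horizon=20, hold=5):
    buffer = []                            # raw (s, a, r, s') tuples
    for _ in range(episodes):
        state = random.uniform(-1.0, 1.0)
        t = 0
        while t < horizon:
            skill = random.choice(skills)  # high-level choice of skill
            for _ in range(hold):          # temporally-extended execution
                action = skill(state)
                next_state, reward = env_step(state, action)
                buffer.append((state, action, reward, next_state))
                state = next_state
                t += 1
    return buffer

data = collect_with_skills(make_skills())
print(len(data), "raw transitions available for off-policy learning of the final policy")
</code></pre>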
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2210.12566">arXiv:2210.12566</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2210.12566">pdf</a>, <a href="https://arxiv.org/format/2210.12566">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Solving Continuous Control via Q-learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Seyde%2C+T">Tim Seyde</a>, <a href="/search/cs?searchtype=author&amp;query=Werner%2C+P">Peter Werner</a>, <a href="/search/cs?searchtype=author&amp;query=Schwarting%2C+W">Wilko Schwarting</a>, <a href="/search/cs?searchtype=author&amp;query=Gilitschenski%2C+I">Igor Gilitschenski</a>, <a href="/search/cs?searchtype=author&amp;query=Riedmiller%2C+M">Martin Riedmiller</a>, <a href="/search/cs?searchtype=author&amp;query=Rus%2C+D">Daniela Rus</a>, <a href="/search/cs?searchtype=author&amp;query=Wulfmeier%2C+M">Markus Wulfmeier</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2210.12566v2-abstract-short" style="display: inline;"> While there has been substantial success for solving continuous control with actor-critic methods, simpler critic-only methods such as Q-learning find limited application in the associated high-dimensional action spaces. However, most actor-critic methods come at the cost of added complexity: heuristics for stabilisation, compute requirements and wider hyperparameter search spaces. We show that a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.12566v2-abstract-full').style.display = 'inline'; document.getElementById('2210.12566v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2210.12566v2-abstract-full" style="display: none;"> While there has been substantial success for solving continuous control with actor-critic methods, simpler critic-only methods such as Q-learning find limited application in the associated high-dimensional action spaces. However, most actor-critic methods come at the cost of added complexity: heuristics for stabilisation, compute requirements and wider hyperparameter search spaces. We show that a simple modification of deep Q-learning largely alleviates these issues. By combining bang-bang action discretization with value decomposition, framing single-agent control as cooperative multi-agent reinforcement learning (MARL), this simple critic-only approach matches performance of state-of-the-art continuous actor-critic methods when learning from features or pixels. We extend classical bandit examples from cooperative MARL to provide intuition for how decoupled critics leverage state information to coordinate joint optimization, and demonstrate surprisingly strong performance across a variety of continuous control tasks. 
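<p class="is-size-7">A minimal sketch of the kind of critic-only controller described in the abstract above: one Q-head per action dimension over the two action extremes, with greedy per-dimension argmax acting as the joint policy. Layer sizes, the averaging used for the joint value, and all names are assumptions rather than the paper's agent.</p>
<pre><code>
# Illustrative sketch of a decoupled, bang-bang Q-critic (not the paper's code).
import torch
import torch.nn as nn

class DecoupledBangBangCritic(nn.Module):
    def __init__(self, state_dim, action_dims, hidden=64):
        super().__init__()
        self.action_dims = action_dims
        self.net = nn.Sequential(
            nn.Linear(state_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, action_dims * 2),   # 2 bins per dimension: {-1, +1}
        )

    def q_values(self, state):
        return self.net(state).view(-1, self.action_dims, 2)

    def act(self, state):
        # Greedy joint action: independent argmax per action dimension.
        idx = self.q_values(state).argmax(dim=-1)
        return idx.float() * 2.0 - 1.0            # map bin {0, 1} to {-1, +1}

critic = DecoupledBangBangCritic(state_dim=4, action_dims=3)
s = torch.randn(5, 4)
print(critic.act(s))                              # (5, 3) bang-bang actions
# A TD target could use the per-dimension max Q averaged across dimensions:
target_value = critic.q_values(s).max(dim=-1).values.mean(dim=-1)
print(target_value.shape)
</code></pre>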
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.12566v2-abstract-full').style.display = 'none'; document.getElementById('2210.12566v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2209.01947">arXiv:2209.01947</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2209.01947">pdf</a>, <a href="https://arxiv.org/format/2209.01947">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> MO2: Model-Based Offline Options </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Salter%2C+S">Sasha Salter</a>, <a href="/search/cs?searchtype=author&amp;query=Wulfmeier%2C+M">Markus Wulfmeier</a>, <a href="/search/cs?searchtype=author&amp;query=Tirumala%2C+D">Dhruva Tirumala</a>, <a href="/search/cs?searchtype=author&amp;query=Heess%2C+N">Nicolas Heess</a>, <a href="/search/cs?searchtype=author&amp;query=Riedmiller%2C+M">Martin Riedmiller</a>, <a href="/search/cs?searchtype=author&amp;query=Hadsell%2C+R">Raia Hadsell</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+D">Dushyant Rao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2209.01947v1-abstract-short" style="display: inline;"> The ability to discover useful behaviours from past experience and transfer them to new tasks is considered a core component of natural embodied intelligence. Inspired by neuroscience, discovering behaviours that switch at bottleneck states have been long sought after for inducing plans of minimum description length across tasks. Prior approaches have either only supported online, on-policy, bottl&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.01947v1-abstract-full').style.display = 'inline'; document.getElementById('2209.01947v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2209.01947v1-abstract-full" style="display: none;"> The ability to discover useful behaviours from past experience and transfer them to new tasks is considered a core component of natural embodied intelligence. Inspired by neuroscience, discovering behaviours that switch at bottleneck states have been long sought after for inducing plans of minimum description length across tasks. Prior approaches have either only supported online, on-policy, bottleneck state discovery, limiting sample-efficiency, or discrete state-action domains, restricting applicability. 
To address this, we introduce Model-Based Offline Options (MO2), an offline hindsight framework supporting sample-efficient bottleneck option discovery over continuous state-action spaces. Once bottleneck options are learnt offline over source domains, they are transferred online to improve exploration and value estimation on the transfer domain. Our experiments show that on complex long-horizon continuous control tasks with sparse, delayed rewards, MO2&#39;s properties are essential and lead to performance exceeding recent option learning methods. Additional ablations further demonstrate the impact on option predictability and credit assignment. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.01947v1-abstract-full').style.display = 'none'; document.getElementById('2209.01947v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 September, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at 1st Conference on Lifelong Learning Agents (CoLLAs) Conference Track, 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2204.10256">arXiv:2204.10256</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2204.10256">pdf</a>, <a href="https://arxiv.org/format/2204.10256">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Revisiting Gaussian mixture critics in off-policy reinforcement learning: a sample-based approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Shahriari%2C+B">Bobak Shahriari</a>, <a href="/search/cs?searchtype=author&amp;query=Abdolmaleki%2C+A">Abbas Abdolmaleki</a>, <a href="/search/cs?searchtype=author&amp;query=Byravan%2C+A">Arunkumar Byravan</a>, <a href="/search/cs?searchtype=author&amp;query=Friesen%2C+A">Abe Friesen</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+S">Siqi Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Springenberg%2C+J+T">Jost Tobias Springenberg</a>, <a href="/search/cs?searchtype=author&amp;query=Heess%2C+N">Nicolas Heess</a>, <a href="/search/cs?searchtype=author&amp;query=Hoffman%2C+M">Matt Hoffman</a>, <a href="/search/cs?searchtype=author&amp;query=Riedmiller%2C+M">Martin Riedmiller</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2204.10256v2-abstract-short" style="display: inline;"> Actor-critic algorithms that make use of distributional policy evaluation have frequently been shown to outperform their non-distributional counterparts on many challenging control tasks. Examples of this behavior include the D4PG and DMPO algorithms as compared to DDPG and MPO, respectively [Barth-Maron et al., 2018; Hoffman et al., 2020]. 
However, both agents rely on the C51 critic for value est&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.10256v2-abstract-full').style.display = 'inline'; document.getElementById('2204.10256v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2204.10256v2-abstract-full" style="display: none;"> Actor-critic algorithms that make use of distributional policy evaluation have frequently been shown to outperform their non-distributional counterparts on many challenging control tasks. Examples of this behavior include the D4PG and DMPO algorithms as compared to DDPG and MPO, respectively [Barth-Maron et al., 2018; Hoffman et al., 2020]. However, both agents rely on the C51 critic for value estimation. One major drawback of the C51 approach is its requirement of prior knowledge about the minimum and maximum values a policy can attain as well as the number of bins used, which fixes the resolution of the distributional estimate. While the DeepMind control suite of tasks utilizes standardized rewards and episode lengths, thus enabling the entire suite to be solved with a single setting of these hyperparameters, this is often not the case. This paper revisits a natural alternative that removes this requirement, namely a mixture of Gaussians, and a simple sample-based loss function to train it in an off-policy regime. We empirically evaluate its performance on a broad range of continuous control tasks and demonstrate that it eliminates the need for these distributional hyperparameters and achieves state-of-the-art performance on a variety of challenging tasks (e.g. the humanoid, dog, quadruped, and manipulator domains). Finally we provide an implementation in the Acme agent repository. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.10256v2-abstract-full').style.display = 'none'; document.getElementById('2204.10256v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 April, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 April, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2022.
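<p class="is-size-7">A minimal sketch of a mixture-of-Gaussians critic trained with a sample-based negative log-likelihood loss, which, unlike C51, needs no fixed support bounds; the layer sizes and names are illustrative assumptions, not the Acme implementation referenced above.</p>
<pre><code>
# Illustrative sketch of a Gaussian-mixture distributional critic with a sample-based loss.
import torch
import torch.nn as nn
import torch.distributions as D

class GaussianMixtureCritic(nn.Module):
    def __init__(self, state_dim, action_dim, num_components=5, hidden=64):
        super().__init__()
        self.trunk = nn.Sequential(nn.Linear(state_dim + action_dim, hidden), nn.ReLU())
        self.logits = nn.Linear(hidden, num_components)     # mixture weights
        self.means = nn.Linear(hidden, num_components)      # component means
        self.log_scales = nn.Linear(hidden, num_components) # component scales (log)

    def distribution(self, state, action):
        h = self.trunk(torch.cat([state, action], dim=-1))
        components = D.Normal(self.means(h), self.log_scales(h).exp())
        return D.MixtureSameFamily(D.Categorical(logits=self.logits(h)), components)

critic = GaussianMixtureCritic(state_dim=4, action_dim=2)
s, a = torch.randn(8, 4), torch.randn(8, 2)
sampled_returns = torch.randn(8)       # e.g. bootstrapped TD target samples
loss = -critic.distribution(s, a).log_prob(sampled_returns).mean()
loss.backward()
print(float(loss))
</code></pre>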
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2201.11861">arXiv:2201.11861</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2201.11861">pdf</a>, <a href="https://arxiv.org/format/2201.11861">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> The Challenges of Exploration for Offline Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lambert%2C+N">Nathan Lambert</a>, <a href="/search/cs?searchtype=author&amp;query=Wulfmeier%2C+M">Markus Wulfmeier</a>, <a href="/search/cs?searchtype=author&amp;query=Whitney%2C+W">William Whitney</a>, <a href="/search/cs?searchtype=author&amp;query=Byravan%2C+A">Arunkumar Byravan</a>, <a href="/search/cs?searchtype=author&amp;query=Bloesch%2C+M">Michael Bloesch</a>, <a href="/search/cs?searchtype=author&amp;query=Dasagi%2C+V">Vibhavari Dasagi</a>, <a href="/search/cs?searchtype=author&amp;query=Hertweck%2C+T">Tim Hertweck</a>, <a href="/search/cs?searchtype=author&amp;query=Riedmiller%2C+M">Martin Riedmiller</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2201.11861v2-abstract-short" style="display: inline;"> Offline Reinforcement Learning (ORL) enables us to separately study the two interlinked processes of reinforcement learning: collecting informative experience and inferring optimal behaviour. The second step has been widely studied in the offline setting, but just as critical to data-efficient RL is the collection of informative data. The task-agnostic setting for data collection, where the task is&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2201.11861v2-abstract-full').style.display = 'inline'; document.getElementById('2201.11861v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2201.11861v2-abstract-full" style="display: none;"> Offline Reinforcement Learning (ORL) enables us to separately study the two interlinked processes of reinforcement learning: collecting informative experience and inferring optimal behaviour. The second step has been widely studied in the offline setting, but just as critical to data-efficient RL is the collection of informative data. The task-agnostic setting for data collection, where the task is not known a priori, is of particular interest due to the possibility of collecting a single dataset and using it to solve several downstream tasks as they arise. We investigate this setting via curiosity-based intrinsic motivation, a family of exploration methods which encourage the agent to explore those states or transitions it has not yet learned to model. With Explore2Offline, we propose to evaluate the quality of collected data by transferring the collected data and inferring policies with reward relabelling and standard offline RL algorithms. We evaluate a wide variety of data collection strategies, including a new exploration agent, Intrinsic Model Predictive Control (IMPC), using this scheme and demonstrate their performance on various tasks.
We use this decoupled framework to strengthen intuitions about exploration and the data prerequisites for effective offline RL. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2201.11861v2-abstract-full').style.display = 'none'; document.getElementById('2201.11861v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 January, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2111.02552">arXiv:2111.02552</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2111.02552">pdf</a>, <a href="https://arxiv.org/format/2111.02552">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Is Bang-Bang Control All You Need? Solving Continuous Control with Bernoulli Policies </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Seyde%2C+T">Tim Seyde</a>, <a href="/search/cs?searchtype=author&amp;query=Gilitschenski%2C+I">Igor Gilitschenski</a>, <a href="/search/cs?searchtype=author&amp;query=Schwarting%2C+W">Wilko Schwarting</a>, <a href="/search/cs?searchtype=author&amp;query=Stellato%2C+B">Bartolomeo Stellato</a>, <a href="/search/cs?searchtype=author&amp;query=Riedmiller%2C+M">Martin Riedmiller</a>, <a href="/search/cs?searchtype=author&amp;query=Wulfmeier%2C+M">Markus Wulfmeier</a>, <a href="/search/cs?searchtype=author&amp;query=Rus%2C+D">Daniela Rus</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2111.02552v1-abstract-short" style="display: inline;"> Reinforcement learning (RL) for continuous control typically employs distributions whose support covers the entire action space. In this work, we investigate the colloquially known phenomenon that trained agents often prefer actions at the boundaries of that space. We draw theoretical connections to the emergence of bang-bang behavior in optimal control, and provide extensive empirical evaluation&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.02552v1-abstract-full').style.display = 'inline'; document.getElementById('2111.02552v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2111.02552v1-abstract-full" style="display: none;"> Reinforcement learning (RL) for continuous control typically employs distributions whose support covers the entire action space. In this work, we investigate the colloquially known phenomenon that trained agents often prefer actions at the boundaries of that space. We draw theoretical connections to the emergence of bang-bang behavior in optimal control, and provide extensive empirical evaluation across a variety of recent RL algorithms. 
We replace the normal Gaussian by a Bernoulli distribution that solely considers the extremes along each action dimension - a bang-bang controller. Surprisingly, this achieves state-of-the-art performance on several continuous control benchmarks - in contrast to robotic hardware, where energy and maintenance cost affect controller choices. Since exploration, learning, and the final solution are entangled in RL, we provide additional imitation learning experiments to reduce the impact of exploration on our analysis. Finally, we show that our observations generalize to environments that aim to model real-world challenges and evaluate factors to mitigate the emergence of bang-bang solutions. Our findings emphasize challenges for benchmarking continuous control algorithms, particularly in light of potential real-world applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.02552v1-abstract-full').style.display = 'none'; document.getElementById('2111.02552v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 November, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2110.06192">arXiv:2110.06192</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2110.06192">pdf</a>, <a href="https://arxiv.org/format/2110.06192">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Beyond Pick-and-Place: Tackling Robotic Stacking of Diverse Shapes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lee%2C+A+X">Alex X.
Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Devin%2C+C">Coline Devin</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Y">Yuxiang Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Lampe%2C+T">Thomas Lampe</a>, <a href="/search/cs?searchtype=author&amp;query=Bousmalis%2C+K">Konstantinos Bousmalis</a>, <a href="/search/cs?searchtype=author&amp;query=Springenberg%2C+J+T">Jost Tobias Springenberg</a>, <a href="/search/cs?searchtype=author&amp;query=Byravan%2C+A">Arunkumar Byravan</a>, <a href="/search/cs?searchtype=author&amp;query=Abdolmaleki%2C+A">Abbas Abdolmaleki</a>, <a href="/search/cs?searchtype=author&amp;query=Gileadi%2C+N">Nimrod Gileadi</a>, <a href="/search/cs?searchtype=author&amp;query=Khosid%2C+D">David Khosid</a>, <a href="/search/cs?searchtype=author&amp;query=Fantacci%2C+C">Claudio Fantacci</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+J+E">Jose Enrique Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Raju%2C+A">Akhil Raju</a>, <a href="/search/cs?searchtype=author&amp;query=Jeong%2C+R">Rae Jeong</a>, <a href="/search/cs?searchtype=author&amp;query=Neunert%2C+M">Michael Neunert</a>, <a href="/search/cs?searchtype=author&amp;query=Laurens%2C+A">Antoine Laurens</a>, <a href="/search/cs?searchtype=author&amp;query=Saliceti%2C+S">Stefano Saliceti</a>, <a href="/search/cs?searchtype=author&amp;query=Casarini%2C+F">Federico Casarini</a>, <a href="/search/cs?searchtype=author&amp;query=Riedmiller%2C+M">Martin Riedmiller</a>, <a href="/search/cs?searchtype=author&amp;query=Hadsell%2C+R">Raia Hadsell</a>, <a href="/search/cs?searchtype=author&amp;query=Nori%2C+F">Francesco Nori</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2110.06192v2-abstract-short" style="display: inline;"> We study the problem of robotic stacking with objects of complex geometry. We propose a challenging and diverse set of such objects that was carefully designed to require strategies beyond a simple &#34;pick-and-place&#34; solution. Our method is a reinforcement learning (RL) approach combined with vision-based interactive policy distillation and simulation-to-reality transfer. Our learned policies can ef&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.06192v2-abstract-full').style.display = 'inline'; document.getElementById('2110.06192v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2110.06192v2-abstract-full" style="display: none;"> We study the problem of robotic stacking with objects of complex geometry. We propose a challenging and diverse set of such objects that was carefully designed to require strategies beyond a simple &#34;pick-and-place&#34; solution. Our method is a reinforcement learning (RL) approach combined with vision-based interactive policy distillation and simulation-to-reality transfer. Our learned policies can efficiently handle multiple object combinations in the real world and exhibit a large variety of stacking skills. In a large experimental study, we investigate what choices matter for learning such general vision-based agents in simulation, and what affects optimal transfer to the real robot. We then leverage data collected by such policies and improve upon them with offline RL. A video and a blog post of our work are provided as supplementary material. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.06192v2-abstract-full').style.display = 'none'; document.getElementById('2110.06192v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 November, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CoRL 2021. Video: https://dpmd.ai/robotics-stacking-YT . Blog: https://dpmd.ai/robotics-stacking . Code: https://github.com/deepmind/rgb_stacking</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2110.03363">arXiv:2110.03363</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2110.03363">pdf</a>, <a href="https://arxiv.org/format/2110.03363">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Evaluating model-based planning and planner amortization for continuous control </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Byravan%2C+A">Arunkumar Byravan</a>, <a href="/search/cs?searchtype=author&amp;query=Hasenclever%2C+L">Leonard Hasenclever</a>, <a href="/search/cs?searchtype=author&amp;query=Trochim%2C+P">Piotr Trochim</a>, <a href="/search/cs?searchtype=author&amp;query=Mirza%2C+M">Mehdi Mirza</a>, <a href="/search/cs?searchtype=author&amp;query=Ialongo%2C+A+D">Alessandro Davide Ialongo</a>, <a href="/search/cs?searchtype=author&amp;query=Tassa%2C+Y">Yuval Tassa</a>, <a href="/search/cs?searchtype=author&amp;query=Springenberg%2C+J+T">Jost Tobias Springenberg</a>, <a href="/search/cs?searchtype=author&amp;query=Abdolmaleki%2C+A">Abbas Abdolmaleki</a>, <a href="/search/cs?searchtype=author&amp;query=Heess%2C+N">Nicolas Heess</a>, <a href="/search/cs?searchtype=author&amp;query=Merel%2C+J">Josh Merel</a>, <a href="/search/cs?searchtype=author&amp;query=Riedmiller%2C+M">Martin Riedmiller</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2110.03363v1-abstract-short" style="display: inline;"> There is a widespread intuition that model-based control methods should be able to surpass the data efficiency of model-free approaches. In this paper we attempt to evaluate this intuition on various challenging locomotion tasks. 
We take a hybrid approach, combining model predictive control (MPC) with a learned model and model-free policy learning; the learned policy serves as a proposal for MPC.&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.03363v1-abstract-full').style.display = 'inline'; document.getElementById('2110.03363v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2110.03363v1-abstract-full" style="display: none;"> There is a widespread intuition that model-based control methods should be able to surpass the data efficiency of model-free approaches. In this paper we attempt to evaluate this intuition on various challenging locomotion tasks. We take a hybrid approach, combining model predictive control (MPC) with a learned model and model-free policy learning; the learned policy serves as a proposal for MPC. We find that well-tuned model-free agents are strong baselines even for high DoF control problems but MPC with learned proposals and models (trained on the fly or transferred from related tasks) can significantly improve performance and data efficiency in hard multi-task/multi-goal settings. Finally, we show that it is possible to distil a model-based planner into a policy that amortizes the planning computation without any loss of performance. Videos of agents performing different tasks can be seen at https://sites.google.com/view/mbrl-amortization/home. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.03363v1-abstract-full').style.display = 'none'; document.getElementById('2110.03363v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages main text, 30 pages with references and appendix including several ablations and additional experiments. Submitted to ICLR 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2109.08603">arXiv:2109.08603</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2109.08603">pdf</a>, <a href="https://arxiv.org/format/2109.08603">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Is Curiosity All You Need? 
On the Utility of Emergent Behaviours from Curious Exploration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Groth%2C+O">Oliver Groth</a>, <a href="/search/cs?searchtype=author&amp;query=Wulfmeier%2C+M">Markus Wulfmeier</a>, <a href="/search/cs?searchtype=author&amp;query=Vezzani%2C+G">Giulia Vezzani</a>, <a href="/search/cs?searchtype=author&amp;query=Dasagi%2C+V">Vibhavari Dasagi</a>, <a href="/search/cs?searchtype=author&amp;query=Hertweck%2C+T">Tim Hertweck</a>, <a href="/search/cs?searchtype=author&amp;query=Hafner%2C+R">Roland Hafner</a>, <a href="/search/cs?searchtype=author&amp;query=Heess%2C+N">Nicolas Heess</a>, <a href="/search/cs?searchtype=author&amp;query=Riedmiller%2C+M">Martin Riedmiller</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2109.08603v1-abstract-short" style="display: inline;"> Curiosity-based reward schemes can present powerful exploration mechanisms which facilitate the discovery of solutions for complex, sparse or long-horizon tasks. However, as the agent learns to reach previously unexplored spaces and the objective adapts to reward new areas, many behaviours emerge only to disappear due to being overwritten by the constantly shifting objective. We argue that merely&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2109.08603v1-abstract-full').style.display = 'inline'; document.getElementById('2109.08603v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2109.08603v1-abstract-full" style="display: none;"> Curiosity-based reward schemes can present powerful exploration mechanisms which facilitate the discovery of solutions for complex, sparse or long-horizon tasks. However, as the agent learns to reach previously unexplored spaces and the objective adapts to reward new areas, many behaviours emerge only to disappear due to being overwritten by the constantly shifting objective. We argue that merely using curiosity for fast environment exploration or as a bonus reward for a specific task does not harness the full potential of this technique and misses useful skills. Instead, we propose to shift the focus towards retaining the behaviours which emerge during curiosity-based learning. We posit that these self-discovered behaviours serve as valuable skills in an agent&#39;s repertoire to solve related tasks. Our experiments demonstrate the continuous shift in behaviour throughout training and the benefits of a simple policy snapshot method to reuse discovered behaviour for transfer tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2109.08603v1-abstract-full').style.display = 'none'; document.getElementById('2109.08603v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 September, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2021. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 7 figures, 2 tables</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.6; I.2.9 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2108.10273">arXiv:2108.10273</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2108.10273">pdf</a>, <a href="https://arxiv.org/format/2108.10273">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Collect &amp; Infer -- a fresh look at data-efficient Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Riedmiller%2C+M">Martin Riedmiller</a>, <a href="/search/cs?searchtype=author&amp;query=Springenberg%2C+J+T">Jost Tobias Springenberg</a>, <a href="/search/cs?searchtype=author&amp;query=Hafner%2C+R">Roland Hafner</a>, <a href="/search/cs?searchtype=author&amp;query=Heess%2C+N">Nicolas Heess</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2108.10273v1-abstract-short" style="display: inline;"> This position paper proposes a fresh look at Reinforcement Learning (RL) from the perspective of data-efficiency. Data-efficient RL has gone through three major stages: pure on-line RL where every data-point is considered only once, RL with a replay buffer where additional learning is done on a portion of the experience, and finally transition memory based RL, where, conceptually, all transitions&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2108.10273v1-abstract-full').style.display = 'inline'; document.getElementById('2108.10273v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2108.10273v1-abstract-full" style="display: none;"> This position paper proposes a fresh look at Reinforcement Learning (RL) from the perspective of data-efficiency. Data-efficient RL has gone through three major stages: pure on-line RL where every data-point is considered only once, RL with a replay buffer where additional learning is done on a portion of the experience, and finally transition memory based RL, where, conceptually, all transitions are stored and re-used in every update step. While inferring knowledge from all explicitly stored experience has led to a tremendous gain in data-efficiency, the question of how this data is collected has been vastly understudied. We argue that data-efficiency can only be achieved through careful consideration of both aspects. We propose to make this insight explicit via a paradigm that we call &#39;Collect and Infer&#39;, which explicitly models RL as two separate but interconnected processes, concerned with data collection and knowledge inference respectively. We discuss implications of the paradigm, how its ideas are reflected in the literature, and how it can guide future research into data efficient RL.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2108.10273v1-abstract-full').style.display = 'none'; document.getElementById('2108.10273v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 August, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2106.08199">arXiv:2106.08199</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2106.08199">pdf</a>, <a href="https://arxiv.org/format/2106.08199">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> On Multi-objective Policy Optimization as a Tool for Reinforcement Learning: Case Studies in Offline RL and Finetuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Abdolmaleki%2C+A">Abbas Abdolmaleki</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+S+H">Sandy H. Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Vezzani%2C+G">Giulia Vezzani</a>, <a href="/search/cs?searchtype=author&amp;query=Shahriari%2C+B">Bobak Shahriari</a>, <a href="/search/cs?searchtype=author&amp;query=Springenberg%2C+J+T">Jost Tobias Springenberg</a>, <a href="/search/cs?searchtype=author&amp;query=Mishra%2C+S">Shruti Mishra</a>, <a href="/search/cs?searchtype=author&amp;query=TB%2C+D">Dhruva TB</a>, <a href="/search/cs?searchtype=author&amp;query=Byravan%2C+A">Arunkumar Byravan</a>, <a href="/search/cs?searchtype=author&amp;query=Bousmalis%2C+K">Konstantinos Bousmalis</a>, <a href="/search/cs?searchtype=author&amp;query=Gyorgy%2C+A">Andras Gyorgy</a>, <a href="/search/cs?searchtype=author&amp;query=Szepesvari%2C+C">Csaba Szepesvari</a>, <a href="/search/cs?searchtype=author&amp;query=Hadsell%2C+R">Raia Hadsell</a>, <a href="/search/cs?searchtype=author&amp;query=Heess%2C+N">Nicolas Heess</a>, <a href="/search/cs?searchtype=author&amp;query=Riedmiller%2C+M">Martin Riedmiller</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2106.08199v2-abstract-short" style="display: inline;"> Many advances that have improved the robustness and efficiency of deep reinforcement learning (RL) algorithms can, in one way or another, be understood as introducing additional objectives or constraints in the policy optimization step. This includes ideas as far ranging as exploration bonuses, entropy regularization, and regularization toward teachers or data priors. 
Often, the task reward and au&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.08199v2-abstract-full').style.display = 'inline'; document.getElementById('2106.08199v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2106.08199v2-abstract-full" style="display: none;"> Many advances that have improved the robustness and efficiency of deep reinforcement learning (RL) algorithms can, in one way or another, be understood as introducing additional objectives or constraints in the policy optimization step. This includes ideas as far ranging as exploration bonuses, entropy regularization, and regularization toward teachers or data priors. Often, the task reward and auxiliary objectives are in conflict, and in this paper we argue that this makes it natural to treat these cases as instances of multi-objective (MO) optimization problems. We demonstrate how this perspective allows us to develop novel and more effective RL algorithms. In particular, we focus on offline RL and finetuning as case studies, and show that existing approaches can be understood as MO algorithms relying on linear scalarization. We hypothesize that replacing linear scalarization with a better algorithm can improve performance. We introduce Distillation of a Mixture of Experts (DiME), a new MORL algorithm that outperforms linear scalarization and can be applied to these non-standard MO problems. We demonstrate that for offline RL, DiME leads to a simple new algorithm that outperforms state-of-the-art. For finetuning, we derive new algorithms that learn to outperform the teacher policy. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.08199v2-abstract-full').style.display = 'none'; document.getElementById('2106.08199v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2101.09458">arXiv:2101.09458</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2101.09458">pdf</a>, <a href="https://arxiv.org/format/2101.09458">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Decoupled Exploration and Exploitation Policies for Sample-Efficient Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Whitney%2C+W+F">William F. 
Whitney</a>, <a href="/search/cs?searchtype=author&amp;query=Bloesch%2C+M">Michael Bloesch</a>, <a href="/search/cs?searchtype=author&amp;query=Springenberg%2C+J+T">Jost Tobias Springenberg</a>, <a href="/search/cs?searchtype=author&amp;query=Abdolmaleki%2C+A">Abbas Abdolmaleki</a>, <a href="/search/cs?searchtype=author&amp;query=Cho%2C+K">Kyunghyun Cho</a>, <a href="/search/cs?searchtype=author&amp;query=Riedmiller%2C+M">Martin Riedmiller</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2101.09458v2-abstract-short" style="display: inline;"> Despite the close connection between exploration and sample efficiency, most state of the art reinforcement learning algorithms include no considerations for exploration beyond maximizing the entropy of the policy. In this work we address this seeming missed opportunity. We observe that the most common formulation of directed exploration in deep RL, known as bonus-based exploration (BBE), suffers&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2101.09458v2-abstract-full').style.display = 'inline'; document.getElementById('2101.09458v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2101.09458v2-abstract-full" style="display: none;"> Despite the close connection between exploration and sample efficiency, most state of the art reinforcement learning algorithms include no considerations for exploration beyond maximizing the entropy of the policy. In this work we address this seeming missed opportunity. We observe that the most common formulation of directed exploration in deep RL, known as bonus-based exploration (BBE), suffers from bias and slow coverage in the few-sample regime. This causes BBE to be actively detrimental to policy learning in many control tasks. We show that by decoupling the task policy from the exploration policy, directed exploration can be highly effective for sample-efficient continuous control. Our method, Decoupled Exploration and Exploitation Policies (DEEP), can be combined with any off-policy RL algorithm without modification. When used in conjunction with soft actor-critic, DEEP incurs no performance penalty in densely-rewarding environments. On sparse environments, DEEP gives a several-fold improvement in data efficiency due to better exploration. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2101.09458v2-abstract-full').style.display = 'none'; document.getElementById('2101.09458v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 July, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 January, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2021. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2011.01758">arXiv:2011.01758</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2011.01758">pdf</a>, <a href="https://arxiv.org/format/2011.01758">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Representation Matters: Improving Perception and Exploration for Robotics </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wulfmeier%2C+M">Markus Wulfmeier</a>, <a href="/search/cs?searchtype=author&amp;query=Byravan%2C+A">Arunkumar Byravan</a>, <a href="/search/cs?searchtype=author&amp;query=Hertweck%2C+T">Tim Hertweck</a>, <a href="/search/cs?searchtype=author&amp;query=Higgins%2C+I">Irina Higgins</a>, <a href="/search/cs?searchtype=author&amp;query=Gupta%2C+A">Ankush Gupta</a>, <a href="/search/cs?searchtype=author&amp;query=Kulkarni%2C+T">Tejas Kulkarni</a>, <a href="/search/cs?searchtype=author&amp;query=Reynolds%2C+M">Malcolm Reynolds</a>, <a href="/search/cs?searchtype=author&amp;query=Teplyashin%2C+D">Denis Teplyashin</a>, <a href="/search/cs?searchtype=author&amp;query=Hafner%2C+R">Roland Hafner</a>, <a href="/search/cs?searchtype=author&amp;query=Lampe%2C+T">Thomas Lampe</a>, <a href="/search/cs?searchtype=author&amp;query=Riedmiller%2C+M">Martin Riedmiller</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2011.01758v2-abstract-short" style="display: inline;"> Projecting high-dimensional environment observations into lower-dimensional structured representations can considerably improve data-efficiency for reinforcement learning in domains with limited data such as robotics. Can a single generally useful representation be found? In order to answer this question, it is important to understand how the representation will be used by the agent and what prope&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2011.01758v2-abstract-full').style.display = 'inline'; document.getElementById('2011.01758v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2011.01758v2-abstract-full" style="display: none;"> Projecting high-dimensional environment observations into lower-dimensional structured representations can considerably improve data-efficiency for reinforcement learning in domains with limited data such as robotics. Can a single generally useful representation be found? In order to answer this question, it is important to understand how the representation will be used by the agent and what properties such a &#39;good&#39; representation should have. In this paper we systematically evaluate a number of common learnt and hand-engineered representations in the context of three robotics tasks: lifting, stacking and pushing of 3D blocks. 
The representations are evaluated in two use-cases: as input to the agent, or as a source of auxiliary tasks. Furthermore, the value of each representation is evaluated in terms of three properties: dimensionality, observability and disentanglement. We can significantly improve performance in both use-cases and demonstrate that some representations can perform commensurate to simulator states as agent inputs. Finally, our results challenge common intuitions by demonstrating that: 1) dimensionality strongly matters for task generation, but is negligible for inputs, 2) observability of task-relevant aspects mostly affects the input representation use-case, and 3) disentanglement leads to better auxiliary tasks, but has only limited benefits for input representations. This work serves as a step towards a more systematic understanding of what makes a &#39;good&#39; representation for control in robotics, enabling practitioners to make more informed choices for developing new learned or hand-engineered representations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2011.01758v2-abstract-full').style.display = 'none'; document.getElementById('2011.01758v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 November, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published at ICRA 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2010.15492">arXiv:2010.15492</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2010.15492">pdf</a>, <a href="https://arxiv.org/format/2010.15492">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> &#34;What, not how&#34;: Solving an under-actuated insertion task from scratch </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Vezzani%2C+G">Giulia Vezzani</a>, <a href="/search/cs?searchtype=author&amp;query=Neunert%2C+M">Michael Neunert</a>, <a href="/search/cs?searchtype=author&amp;query=Wulfmeier%2C+M">Markus Wulfmeier</a>, <a href="/search/cs?searchtype=author&amp;query=Jeong%2C+R">Rae Jeong</a>, <a href="/search/cs?searchtype=author&amp;query=Lampe%2C+T">Thomas Lampe</a>, <a href="/search/cs?searchtype=author&amp;query=Siegel%2C+N">Noah Siegel</a>, <a href="/search/cs?searchtype=author&amp;query=Hafner%2C+R">Roland Hafner</a>, <a href="/search/cs?searchtype=author&amp;query=Abdolmaleki%2C+A">Abbas Abdolmaleki</a>, <a href="/search/cs?searchtype=author&amp;query=Riedmiller%2C+M">Martin Riedmiller</a>, <a href="/search/cs?searchtype=author&amp;query=Nori%2C+F">Francesco Nori</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2010.15492v2-abstract-short" style="display: inline;"> Robot manipulation requires a complex set of skills that need to be carefully combined and coordinated to solve a 
task. Yet, most Reinforcement Learning (RL) approaches in robotics study tasks which actually consist only of a single manipulation skill, such as grasping an object or inserting a pre-grasped object. As a result, the skill (&#39;how&#39; to solve the task) but not the actual goal of a complete&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2010.15492v2-abstract-full').style.display = 'inline'; document.getElementById('2010.15492v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2010.15492v2-abstract-full" style="display: none;"> Robot manipulation requires a complex set of skills that need to be carefully combined and coordinated to solve a task. Yet, most Reinforcement Learning (RL) approaches in robotics study tasks which actually consist only of a single manipulation skill, such as grasping an object or inserting a pre-grasped object. As a result, the skill (&#39;how&#39; to solve the task) but not the actual goal of a complete manipulation (&#39;what&#39; to solve) is specified. In contrast, we study a complex manipulation goal that requires an agent to learn and combine diverse manipulation skills. We propose a challenging, highly under-actuated peg-in-hole task with a free, rotationally asymmetric peg, requiring a broad range of manipulation skills. While correct peg (re-)orientation is a requirement for successful insertion, there is no reward associated with it. Hence an agent needs to understand this pre-condition and learn the skill to fulfil it. The final insertion reward is sparse, allowing freedom in the solution and leading to complex emerging behaviour not envisioned during the task design. We tackle the problem in a multi-task RL framework using Scheduled Auxiliary Control (SAC-X) combined with Regularized Hierarchical Policy Optimization (RHPO) which successfully solves the task in simulation and from scratch on a single robot where data is severely limited. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2010.15492v2-abstract-full').style.display = 'none'; document.getElementById('2010.15492v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 October, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2020. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2010.10644">arXiv:2010.10644</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2010.10644">pdf</a>, <a href="https://arxiv.org/format/2010.10644">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Robust Constrained Reinforcement Learning for Continuous Control with Model Misspecification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Mankowitz%2C+D+J">Daniel J.
Mankowitz</a>, <a href="/search/cs?searchtype=author&amp;query=Calian%2C+D+A">Dan A. Calian</a>, <a href="/search/cs?searchtype=author&amp;query=Jeong%2C+R">Rae Jeong</a>, <a href="/search/cs?searchtype=author&amp;query=Paduraru%2C+C">Cosmin Paduraru</a>, <a href="/search/cs?searchtype=author&amp;query=Heess%2C+N">Nicolas Heess</a>, <a href="/search/cs?searchtype=author&amp;query=Dathathri%2C+S">Sumanth Dathathri</a>, <a href="/search/cs?searchtype=author&amp;query=Riedmiller%2C+M">Martin Riedmiller</a>, <a href="/search/cs?searchtype=author&amp;query=Mann%2C+T">Timothy Mann</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2010.10644v4-abstract-short" style="display: inline;"> Many real-world physical control systems are required to satisfy constraints upon deployment. Furthermore, real-world systems are often subject to effects such as non-stationarity, wear-and-tear, uncalibrated sensors and so on. Such effects effectively perturb the system dynamics and can cause a policy trained successfully in one domain to perform poorly when deployed to a perturbed version of the&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2010.10644v4-abstract-full').style.display = 'inline'; document.getElementById('2010.10644v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2010.10644v4-abstract-full" style="display: none;"> Many real-world physical control systems are required to satisfy constraints upon deployment. Furthermore, real-world systems are often subject to effects such as non-stationarity, wear-and-tear, uncalibrated sensors and so on. Such effects effectively perturb the system dynamics and can cause a policy trained successfully in one domain to perform poorly when deployed to a perturbed version of the same domain. This can affect a policy&#39;s ability to maximize future rewards as well as the extent to which it satisfies constraints. We refer to this as constrained model misspecification. We present an algorithm that mitigates this form of misspecification, and showcase its performance in multiple simulated Mujoco tasks from the Real World Reinforcement Learning (RWRL) suite. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2010.10644v4-abstract-full').style.display = 'none'; document.getElementById('2010.10644v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 March, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 October, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2020. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2010.05545">arXiv:2010.05545</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2010.05545">pdf</a>, <a href="https://arxiv.org/format/2010.05545">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Local Search for Policy Iteration in Continuous Control </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Springenberg%2C+J+T">Jost Tobias Springenberg</a>, <a href="/search/cs?searchtype=author&amp;query=Heess%2C+N">Nicolas Heess</a>, <a href="/search/cs?searchtype=author&amp;query=Mankowitz%2C+D">Daniel Mankowitz</a>, <a href="/search/cs?searchtype=author&amp;query=Merel%2C+J">Josh Merel</a>, <a href="/search/cs?searchtype=author&amp;query=Byravan%2C+A">Arunkumar Byravan</a>, <a href="/search/cs?searchtype=author&amp;query=Abdolmaleki%2C+A">Abbas Abdolmaleki</a>, <a href="/search/cs?searchtype=author&amp;query=Kay%2C+J">Jackie Kay</a>, <a href="/search/cs?searchtype=author&amp;query=Degrave%2C+J">Jonas Degrave</a>, <a href="/search/cs?searchtype=author&amp;query=Schrittwieser%2C+J">Julian Schrittwieser</a>, <a href="/search/cs?searchtype=author&amp;query=Tassa%2C+Y">Yuval Tassa</a>, <a href="/search/cs?searchtype=author&amp;query=Buchli%2C+J">Jonas Buchli</a>, <a href="/search/cs?searchtype=author&amp;query=Belov%2C+D">Dan Belov</a>, <a href="/search/cs?searchtype=author&amp;query=Riedmiller%2C+M">Martin Riedmiller</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2010.05545v1-abstract-short" style="display: inline;"> We present an algorithm for local, regularized, policy improvement in reinforcement learning (RL) that allows us to formulate model-based and model-free variants in a single framework. Our algorithm can be interpreted as a natural extension of work on KL-regularized RL and introduces a form of tree search for continuous action spaces. We demonstrate that additional computation spent on model-based&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2010.05545v1-abstract-full').style.display = 'inline'; document.getElementById('2010.05545v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2010.05545v1-abstract-full" style="display: none;"> We present an algorithm for local, regularized, policy improvement in reinforcement learning (RL) that allows us to formulate model-based and model-free variants in a single framework. Our algorithm can be interpreted as a natural extension of work on KL-regularized RL and introduces a form of tree search for continuous action spaces. We demonstrate that additional computation spent on model-based policy improvement during learning can improve data efficiency, and confirm that model-based policy improvement during action selection can also be beneficial. 
Quantitatively, our algorithm improves data efficiency on several continuous control benchmarks (when a model is learned in parallel), and it provides significant improvements in wall-clock time in high-dimensional domains (when a ground truth model is available). The unified framework also helps us to better understand the space of model-based and model-free algorithms. In particular, we demonstrate that some benefits attributed to model-based RL can be obtained without a model, simply by utilizing more computation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2010.05545v1-abstract-full').style.display = 'none'; document.getElementById('2010.05545v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 October, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2020. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2008.12228">arXiv:2008.12228</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2008.12228">pdf</a>, <a href="https://arxiv.org/format/2008.12228">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Towards General and Autonomous Learning of Core Skills: A Case Study in Locomotion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hafner%2C+R">Roland Hafner</a>, <a href="/search/cs?searchtype=author&amp;query=Hertweck%2C+T">Tim Hertweck</a>, <a href="/search/cs?searchtype=author&amp;query=Kl%C3%B6ppner%2C+P">Philipp Klöppner</a>, <a href="/search/cs?searchtype=author&amp;query=Bloesch%2C+M">Michael Bloesch</a>, <a href="/search/cs?searchtype=author&amp;query=Neunert%2C+M">Michael Neunert</a>, <a href="/search/cs?searchtype=author&amp;query=Wulfmeier%2C+M">Markus Wulfmeier</a>, <a href="/search/cs?searchtype=author&amp;query=Tunyasuvunakool%2C+S">Saran Tunyasuvunakool</a>, <a href="/search/cs?searchtype=author&amp;query=Heess%2C+N">Nicolas Heess</a>, <a href="/search/cs?searchtype=author&amp;query=Riedmiller%2C+M">Martin Riedmiller</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2008.12228v1-abstract-short" style="display: inline;"> Modern Reinforcement Learning (RL) algorithms promise to solve difficult motor control problems directly from raw sensory inputs. Their attraction is due in part to the fact that they can represent a general class of methods that allow to learn a solution with a reasonably set reward and minimal prior knowledge, even in situations where it is difficult or expensive for a human expert.
For RL to tr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2008.12228v1-abstract-full').style.display = 'inline'; document.getElementById('2008.12228v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2008.12228v1-abstract-full" style="display: none;"> Modern Reinforcement Learning (RL) algorithms promise to solve difficult motor control problems directly from raw sensory inputs. Their attraction is due in part to the fact that they can represent a general class of methods that allow to learn a solution with a reasonably set reward and minimal prior knowledge, even in situations where it is difficult or expensive for a human expert. For RL to truly make good on this promise, however, we need algorithms and learning setups that can work across a broad range of problems with minimal problem specific adjustments or engineering. In this paper, we study this idea of generality in the locomotion domain. We develop a learning framework that can learn sophisticated locomotion behavior for a wide spectrum of legged robots, such as bipeds, tripeds, quadrupeds and hexapods, including wheeled variants. Our learning framework relies on a data-efficient, off-policy multi-task RL algorithm and a small set of reward functions that are semantically identical across robots. To underline the general applicability of the method, we keep the hyper-parameter settings and reward definitions constant across experiments and rely exclusively on on-board sensing. For nine different types of robots, including a real-world quadruped robot, we demonstrate that the same algorithm can rapidly learn diverse and reusable locomotion skills without any platform specific adjustments or additional instrumentation of the learning setup. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2008.12228v1-abstract-full').style.display = 'none'; document.getElementById('2008.12228v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 August, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2020. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2007.15588">arXiv:2007.15588</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2007.15588">pdf</a>, <a href="https://arxiv.org/format/2007.15588">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Data-efficient Hindsight Off-policy Option Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wulfmeier%2C+M">Markus Wulfmeier</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+D">Dushyant Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Hafner%2C+R">Roland Hafner</a>, <a href="/search/cs?searchtype=author&amp;query=Lampe%2C+T">Thomas Lampe</a>, <a href="/search/cs?searchtype=author&amp;query=Abdolmaleki%2C+A">Abbas Abdolmaleki</a>, <a href="/search/cs?searchtype=author&amp;query=Hertweck%2C+T">Tim Hertweck</a>, <a href="/search/cs?searchtype=author&amp;query=Neunert%2C+M">Michael Neunert</a>, <a href="/search/cs?searchtype=author&amp;query=Tirumala%2C+D">Dhruva Tirumala</a>, <a href="/search/cs?searchtype=author&amp;query=Siegel%2C+N">Noah Siegel</a>, <a href="/search/cs?searchtype=author&amp;query=Heess%2C+N">Nicolas Heess</a>, <a href="/search/cs?searchtype=author&amp;query=Riedmiller%2C+M">Martin Riedmiller</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2007.15588v2-abstract-short" style="display: inline;"> We introduce Hindsight Off-policy Options (HO2), a data-efficient option learning algorithm. Given any trajectory, HO2 infers likely option choices and backpropagates through the dynamic programming inference procedure to robustly train all policy components off-policy and end-to-end. The approach outperforms existing option learning methods on common benchmarks. To better understand the option fr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2007.15588v2-abstract-full').style.display = 'inline'; document.getElementById('2007.15588v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2007.15588v2-abstract-full" style="display: none;"> We introduce Hindsight Off-policy Options (HO2), a data-efficient option learning algorithm. Given any trajectory, HO2 infers likely option choices and backpropagates through the dynamic programming inference procedure to robustly train all policy components off-policy and end-to-end. The approach outperforms existing option learning methods on common benchmarks. To better understand the option framework and disentangle benefits from both temporal and action abstraction, we evaluate ablations with flat policies and mixture policies with comparable optimization. 
The results highlight the importance of both types of abstraction as well as off-policy training and trust-region constraints, particularly in challenging, simulated 3D robot manipulation tasks from raw pixel inputs. Finally, we intuitively adapt the inference step to investigate the effect of increased temporal abstraction on training with pre-trained options and from scratch. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2007.15588v2-abstract-full').style.display = 'none'; document.getElementById('2007.15588v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 July, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published at ICML2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2005.07541">arXiv:2005.07541</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2005.07541">pdf</a>, <a href="https://arxiv.org/format/2005.07541">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Simple Sensor Intentions for Exploration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hertweck%2C+T">Tim Hertweck</a>, <a href="/search/cs?searchtype=author&amp;query=Riedmiller%2C+M">Martin Riedmiller</a>, <a href="/search/cs?searchtype=author&amp;query=Bloesch%2C+M">Michael Bloesch</a>, <a href="/search/cs?searchtype=author&amp;query=Springenberg%2C+J+T">Jost Tobias Springenberg</a>, <a href="/search/cs?searchtype=author&amp;query=Siegel%2C+N">Noah Siegel</a>, <a href="/search/cs?searchtype=author&amp;query=Wulfmeier%2C+M">Markus Wulfmeier</a>, <a href="/search/cs?searchtype=author&amp;query=Hafner%2C+R">Roland Hafner</a>, <a href="/search/cs?searchtype=author&amp;query=Heess%2C+N">Nicolas Heess</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2005.07541v1-abstract-short" style="display: inline;"> Modern reinforcement learning algorithms can learn solutions to increasingly difficult control problems while at the same time reduce the amount of prior knowledge needed for their application. 
One of the remaining challenges is the definition of reward schemes that appropriately facilitate exploration without biasing the solution in undesirable ways, and that can be implemented on real robotic sy&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2005.07541v1-abstract-full').style.display = 'inline'; document.getElementById('2005.07541v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2005.07541v1-abstract-full" style="display: none;"> Modern reinforcement learning algorithms can learn solutions to increasingly difficult control problems while at the same time reduce the amount of prior knowledge needed for their application. One of the remaining challenges is the definition of reward schemes that appropriately facilitate exploration without biasing the solution in undesirable ways, and that can be implemented on real robotic systems without expensive instrumentation. In this paper we focus on a setting in which goal tasks are defined via simple sparse rewards, and exploration is facilitated via agent-internal auxiliary tasks. We introduce the idea of simple sensor intentions (SSIs) as a generic way to define auxiliary tasks. SSIs reduce the amount of prior knowledge that is required to define suitable rewards. They can further be computed directly from raw sensor streams and thus do not require expensive and possibly brittle state estimation on real systems. We demonstrate that a learning system based on these rewards can solve complex robotic tasks in simulation and in real world settings. In particular, we show that a real robotic arm can learn to grasp and lift and solve a Ball-in-a-Cup task from scratch, when only raw sensor streams are used for both controller input and in the auxiliary reward definition. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2005.07541v1-abstract-full').style.display = 'none'; document.getElementById('2005.07541v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 May, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2020. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2005.07513">arXiv:2005.07513</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2005.07513">pdf</a>, <a href="https://arxiv.org/format/2005.07513">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> A Distributional View on Multi-Objective Policy Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Abdolmaleki%2C+A">Abbas Abdolmaleki</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+S+H">Sandy H. 
Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Hasenclever%2C+L">Leonard Hasenclever</a>, <a href="/search/cs?searchtype=author&amp;query=Neunert%2C+M">Michael Neunert</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+H+F">H. Francis Song</a>, <a href="/search/cs?searchtype=author&amp;query=Zambelli%2C+M">Martina Zambelli</a>, <a href="/search/cs?searchtype=author&amp;query=Martins%2C+M+F">Murilo F. Martins</a>, <a href="/search/cs?searchtype=author&amp;query=Heess%2C+N">Nicolas Heess</a>, <a href="/search/cs?searchtype=author&amp;query=Hadsell%2C+R">Raia Hadsell</a>, <a href="/search/cs?searchtype=author&amp;query=Riedmiller%2C+M">Martin Riedmiller</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2005.07513v1-abstract-short" style="display: inline;"> Many real-world problems require trading off multiple competing objectives. However, these objectives are often in different units and/or scales, which can make it challenging for practitioners to express numerical preferences over objectives in their native units. In this paper we propose a novel algorithm for multi-objective reinforcement learning that enables setting desired preferences for obj&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2005.07513v1-abstract-full').style.display = 'inline'; document.getElementById('2005.07513v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2005.07513v1-abstract-full" style="display: none;"> Many real-world problems require trading off multiple competing objectives. However, these objectives are often in different units and/or scales, which can make it challenging for practitioners to express numerical preferences over objectives in their native units. In this paper we propose a novel algorithm for multi-objective reinforcement learning that enables setting desired preferences for objectives in a scale-invariant way. We propose to learn an action distribution for each objective, and we use supervised learning to fit a parametric policy to a combination of these distributions. We demonstrate the effectiveness of our approach on challenging high-dimensional real and simulated robotics tasks, and show that setting different preferences in our framework allows us to trace out the space of nondominated solutions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2005.07513v1-abstract-full').style.display = 'none'; document.getElementById('2005.07513v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 May, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2020. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2002.08396">arXiv:2002.08396</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2002.08396">pdf</a>, <a href="https://arxiv.org/format/2002.08396">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Keep Doing What Worked: Behavioral Modelling Priors for Offline Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Siegel%2C+N+Y">Noah Y. Siegel</a>, <a href="/search/cs?searchtype=author&amp;query=Springenberg%2C+J+T">Jost Tobias Springenberg</a>, <a href="/search/cs?searchtype=author&amp;query=Berkenkamp%2C+F">Felix Berkenkamp</a>, <a href="/search/cs?searchtype=author&amp;query=Abdolmaleki%2C+A">Abbas Abdolmaleki</a>, <a href="/search/cs?searchtype=author&amp;query=Neunert%2C+M">Michael Neunert</a>, <a href="/search/cs?searchtype=author&amp;query=Lampe%2C+T">Thomas Lampe</a>, <a href="/search/cs?searchtype=author&amp;query=Hafner%2C+R">Roland Hafner</a>, <a href="/search/cs?searchtype=author&amp;query=Heess%2C+N">Nicolas Heess</a>, <a href="/search/cs?searchtype=author&amp;query=Riedmiller%2C+M">Martin Riedmiller</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2002.08396v3-abstract-short" style="display: inline;"> Off-policy reinforcement learning algorithms promise to be applicable in settings where only a fixed data-set (batch) of environment interactions is available and no new experience can be acquired. This property makes these algorithms appealing for real world problems such as robot control. In practice, however, standard off-policy algorithms fail in the batch setting for continuous control. In th&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2002.08396v3-abstract-full').style.display = 'inline'; document.getElementById('2002.08396v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2002.08396v3-abstract-full" style="display: none;"> Off-policy reinforcement learning algorithms promise to be applicable in settings where only a fixed data-set (batch) of environment interactions is available and no new experience can be acquired. This property makes these algorithms appealing for real world problems such as robot control. In practice, however, standard off-policy algorithms fail in the batch setting for continuous control. In this paper, we propose a simple solution to this problem. It admits the use of data generated by arbitrary behavior policies and uses a learned prior -- the advantage-weighted behavior model (ABM) -- to bias the RL policy towards actions that have previously been executed and are likely to be successful on the new task. Our method can be seen as an extension of recent work on batch-RL that enables stable learning from conflicting data-sources. 
We find improvements on competitive baselines in a variety of RL tasks -- including standard continuous control benchmarks and multi-task learning for simulated and real-world robots. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2002.08396v3-abstract-full').style.display = 'none'; document.getElementById('2002.08396v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 June, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 February, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.6; I.2.9 </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> ICLR 2020 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2001.00449">arXiv:2001.00449</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2001.00449">pdf</a>, <a href="https://arxiv.org/format/2001.00449">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Continuous-Discrete Reinforcement Learning for Hybrid Control in Robotics </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Neunert%2C+M">Michael Neunert</a>, <a href="/search/cs?searchtype=author&amp;query=Abdolmaleki%2C+A">Abbas Abdolmaleki</a>, <a href="/search/cs?searchtype=author&amp;query=Wulfmeier%2C+M">Markus Wulfmeier</a>, <a href="/search/cs?searchtype=author&amp;query=Lampe%2C+T">Thomas Lampe</a>, <a href="/search/cs?searchtype=author&amp;query=Springenberg%2C+J+T">Jost Tobias Springenberg</a>, <a href="/search/cs?searchtype=author&amp;query=Hafner%2C+R">Roland Hafner</a>, <a href="/search/cs?searchtype=author&amp;query=Romano%2C+F">Francesco Romano</a>, <a href="/search/cs?searchtype=author&amp;query=Buchli%2C+J">Jonas Buchli</a>, <a href="/search/cs?searchtype=author&amp;query=Heess%2C+N">Nicolas Heess</a>, <a href="/search/cs?searchtype=author&amp;query=Riedmiller%2C+M">Martin Riedmiller</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2001.00449v1-abstract-short" style="display: inline;"> Many real-world control problems involve both discrete decision variables - such as the choice of control modes, gear switching or digital outputs - as well as continuous decision variables - such as velocity setpoints, control gains or analogue outputs. 
However, when defining the corresponding optimal control or reinforcement learning problem, it is commonly approximated with fully continuous or&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2001.00449v1-abstract-full').style.display = 'inline'; document.getElementById('2001.00449v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2001.00449v1-abstract-full" style="display: none;"> Many real-world control problems involve both discrete decision variables - such as the choice of control modes, gear switching or digital outputs - as well as continuous decision variables - such as velocity setpoints, control gains or analogue outputs. However, when defining the corresponding optimal control or reinforcement learning problem, it is commonly approximated with fully continuous or fully discrete action spaces. These simplifications aim at tailoring the problem to a particular algorithm or solver which may only support one type of action space. Alternatively, expert heuristics are used to remove discrete actions from an otherwise continuous space. In contrast, we propose to treat hybrid problems in their &#39;native&#39; form by solving them with hybrid reinforcement learning, which optimizes for discrete and continuous actions simultaneously. In our experiments, we first demonstrate that the proposed approach efficiently solves such natively hybrid reinforcement learning problems. We then show, both in simulation and on robotic hardware, the benefits of removing possibly imperfect expert-designed heuristics. Lastly, hybrid reinforcement learning encourages us to rethink problem definitions. We propose reformulating control problems, e.g. by adding meta actions, to improve exploration or reduce mechanical wear and tear. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2001.00449v1-abstract-full').style.display = 'none'; document.getElementById('2001.00449v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 January, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Presented at the 3rd Conference on Robot Learning (CoRL 2019), Osaka, Japan. 
Video: https://youtu.be/eUqQDLQXb7I</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1911.01831">arXiv:1911.01831</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1911.01831">pdf</a>, <a href="https://arxiv.org/format/1911.01831">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> Quinoa: a Q-function You Infer Normalized Over Actions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Degrave%2C+J">Jonas Degrave</a>, <a href="/search/cs?searchtype=author&amp;query=Abdolmaleki%2C+A">Abbas Abdolmaleki</a>, <a href="/search/cs?searchtype=author&amp;query=Springenberg%2C+J+T">Jost Tobias Springenberg</a>, <a href="/search/cs?searchtype=author&amp;query=Heess%2C+N">Nicolas Heess</a>, <a href="/search/cs?searchtype=author&amp;query=Riedmiller%2C+M">Martin Riedmiller</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1911.01831v1-abstract-short" style="display: inline;"> We present an algorithm for learning an approximate action-value soft Q-function in the relative entropy regularised reinforcement learning setting, for which an optimal improved policy can be recovered in closed form. We use recent advances in normalising flows for parametrising the policy together with a learned value-function; and show how this combination can be used to implicitly represent Q-&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1911.01831v1-abstract-full').style.display = 'inline'; document.getElementById('1911.01831v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1911.01831v1-abstract-full" style="display: none;"> We present an algorithm for learning an approximate action-value soft Q-function in the relative entropy regularised reinforcement learning setting, for which an optimal improved policy can be recovered in closed form. We use recent advances in normalising flows for parametrising the policy together with a learned value-function; and show how this combination can be used to implicitly represent Q-values of an arbitrary policy in continuous action space. Using simple temporal difference learning on the Q-values then leads to a unified objective for policy and value learning. We show how this approach considerably simplifies standard Actor-Critic off-policy algorithms, removing the need for a policy optimisation step. We perform experiments on a range of established reinforcement learning benchmarks, demonstrating that our approach allows for complex, multimodal policy distributions in continuous action spaces, while keeping the process of sampling from the policy both fast and exact. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1911.01831v1-abstract-full').style.display = 'none'; document.getElementById('1911.01831v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Deep RL Workshop/NeurIPS</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1910.04142">arXiv:1910.04142</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1910.04142">pdf</a>, <a href="https://arxiv.org/format/1910.04142">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> Imagined Value Gradients: Model-Based Policy Optimization with Transferable Latent Dynamics Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Byravan%2C+A">Arunkumar Byravan</a>, <a href="/search/cs?searchtype=author&amp;query=Springenberg%2C+J+T">Jost Tobias Springenberg</a>, <a href="/search/cs?searchtype=author&amp;query=Abdolmaleki%2C+A">Abbas Abdolmaleki</a>, <a href="/search/cs?searchtype=author&amp;query=Hafner%2C+R">Roland Hafner</a>, <a href="/search/cs?searchtype=author&amp;query=Neunert%2C+M">Michael Neunert</a>, <a href="/search/cs?searchtype=author&amp;query=Lampe%2C+T">Thomas Lampe</a>, <a href="/search/cs?searchtype=author&amp;query=Siegel%2C+N">Noah Siegel</a>, <a href="/search/cs?searchtype=author&amp;query=Heess%2C+N">Nicolas Heess</a>, <a href="/search/cs?searchtype=author&amp;query=Riedmiller%2C+M">Martin Riedmiller</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1910.04142v1-abstract-short" style="display: inline;"> Humans are masters at quickly learning many complex tasks, relying on an approximate understanding of the dynamics of their environments. In much the same way, we would like our learning agents to quickly adapt to new tasks. In this paper, we explore how model-based Reinforcement Learning (RL) can facilitate transfer to new tasks. 
We develop an algorithm that learns an action-conditional, predicti&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1910.04142v1-abstract-full').style.display = 'inline'; document.getElementById('1910.04142v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1910.04142v1-abstract-full" style="display: none;"> Humans are masters at quickly learning many complex tasks, relying on an approximate understanding of the dynamics of their environments. In much the same way, we would like our learning agents to quickly adapt to new tasks. In this paper, we explore how model-based Reinforcement Learning (RL) can facilitate transfer to new tasks. We develop an algorithm that learns an action-conditional, predictive model of expected future observations, rewards and values from which a policy can be derived by following the gradient of the estimated value along imagined trajectories. We show how robust policy optimization can be achieved in robot manipulation tasks even with approximate models that are learned directly from vision and proprioception. We evaluate the efficacy of our approach in a transfer learning scenario, re-using previously learned models on tasks with different reward structures and visual distractors, and show a significant improvement in learning speed compared to strong off-policy baselines. Videos with results can be found at https://sites.google.com/view/ivg-corl19 <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1910.04142v1-abstract-full').style.display = 'none'; document.getElementById('1910.04142v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 October, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear at the 3rd annual Conference on Robot Learning, Osaka, Japan (CoRL 2019). 24 pages including appendix (main paper - 8 pages)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1909.12238">arXiv:1909.12238</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1909.12238">pdf</a>, <a href="https://arxiv.org/format/1909.12238">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> V-MPO: On-Policy Maximum a Posteriori Policy Optimization for Discrete and Continuous Control </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Song%2C+H+F">H. Francis Song</a>, <a href="/search/cs?searchtype=author&amp;query=Abdolmaleki%2C+A">Abbas Abdolmaleki</a>, <a href="/search/cs?searchtype=author&amp;query=Springenberg%2C+J+T">Jost Tobias Springenberg</a>, <a href="/search/cs?searchtype=author&amp;query=Clark%2C+A">Aidan Clark</a>, <a href="/search/cs?searchtype=author&amp;query=Soyer%2C+H">Hubert Soyer</a>, <a href="/search/cs?searchtype=author&amp;query=Rae%2C+J+W">Jack W. 
Rae</a>, <a href="/search/cs?searchtype=author&amp;query=Noury%2C+S">Seb Noury</a>, <a href="/search/cs?searchtype=author&amp;query=Ahuja%2C+A">Arun Ahuja</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+S">Siqi Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Tirumala%2C+D">Dhruva Tirumala</a>, <a href="/search/cs?searchtype=author&amp;query=Heess%2C+N">Nicolas Heess</a>, <a href="/search/cs?searchtype=author&amp;query=Belov%2C+D">Dan Belov</a>, <a href="/search/cs?searchtype=author&amp;query=Riedmiller%2C+M">Martin Riedmiller</a>, <a href="/search/cs?searchtype=author&amp;query=Botvinick%2C+M+M">Matthew M. Botvinick</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1909.12238v1-abstract-short" style="display: inline;"> Some of the most successful applications of deep reinforcement learning to challenging domains in discrete and continuous control have used policy gradient methods in the on-policy setting. However, policy gradients can suffer from large variance that may limit performance, and in practice require carefully tuned entropy regularization to prevent policy collapse. As an alternative to policy gradie&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1909.12238v1-abstract-full').style.display = 'inline'; document.getElementById('1909.12238v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1909.12238v1-abstract-full" style="display: none;"> Some of the most successful applications of deep reinforcement learning to challenging domains in discrete and continuous control have used policy gradient methods in the on-policy setting. However, policy gradients can suffer from large variance that may limit performance, and in practice require carefully tuned entropy regularization to prevent policy collapse. As an alternative to policy gradient algorithms, we introduce V-MPO, an on-policy adaptation of Maximum a Posteriori Policy Optimization (MPO) that performs policy iteration based on a learned state-value function. We show that V-MPO surpasses previously reported scores for both the Atari-57 and DMLab-30 benchmark suites in the multi-task setting, and does so reliably without importance weighting, entropy regularization, or population-based tuning of hyperparameters. On individual DMLab and Atari levels, the proposed algorithm can achieve scores that are substantially higher than has previously been reported. V-MPO is also applicable to problems with high-dimensional, continuous action spaces, which we demonstrate in the context of learning to control simulated humanoids with 22 degrees of freedom from full state observations and 56 degrees of freedom from pixel observations, as well as example OpenAI Gym tasks where V-MPO achieves substantially higher asymptotic scores than previously reported. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1909.12238v1-abstract-full').style.display = 'none'; document.getElementById('1909.12238v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 September, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2019. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">* equal contribution</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1906.11228">arXiv:1906.11228</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1906.11228">pdf</a>, <a href="https://arxiv.org/format/1906.11228">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Compositional Transfer in Hierarchical Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wulfmeier%2C+M">Markus Wulfmeier</a>, <a href="/search/cs?searchtype=author&amp;query=Abdolmaleki%2C+A">Abbas Abdolmaleki</a>, <a href="/search/cs?searchtype=author&amp;query=Hafner%2C+R">Roland Hafner</a>, <a href="/search/cs?searchtype=author&amp;query=Springenberg%2C+J+T">Jost Tobias Springenberg</a>, <a href="/search/cs?searchtype=author&amp;query=Neunert%2C+M">Michael Neunert</a>, <a href="/search/cs?searchtype=author&amp;query=Hertweck%2C+T">Tim Hertweck</a>, <a href="/search/cs?searchtype=author&amp;query=Lampe%2C+T">Thomas Lampe</a>, <a href="/search/cs?searchtype=author&amp;query=Siegel%2C+N">Noah Siegel</a>, <a href="/search/cs?searchtype=author&amp;query=Heess%2C+N">Nicolas Heess</a>, <a href="/search/cs?searchtype=author&amp;query=Riedmiller%2C+M">Martin Riedmiller</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1906.11228v3-abstract-short" style="display: inline;"> The successful application of general reinforcement learning algorithms to real-world robotics applications is often limited by their high data requirements. We introduce Regularized Hierarchical Policy Optimization (RHPO) to improve data-efficiency for domains with multiple dominant tasks and ultimately reduce required platform time. To this end, we employ compositional inductive biases on multip&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1906.11228v3-abstract-full').style.display = 'inline'; document.getElementById('1906.11228v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1906.11228v3-abstract-full" style="display: none;"> The successful application of general reinforcement learning algorithms to real-world robotics applications is often limited by their high data requirements. We introduce Regularized Hierarchical Policy Optimization (RHPO) to improve data-efficiency for domains with multiple dominant tasks and ultimately reduce required platform time. To this end, we employ compositional inductive biases on multiple levels and corresponding mechanisms for sharing off-policy transition data across low-level controllers and tasks as well as scheduling of tasks. 
The presented algorithm enables stable and fast learning for complex, real-world domains in the parallel multitask and sequential transfer case. We show that the investigated types of hierarchy enable positive transfer while partially mitigating negative interference and evaluate the benefits of additional incentives for efficient, compositional task solutions in single task domains. Finally, we demonstrate substantial data-efficiency and final performance gains over competitive baselines in a week-long, physical robot stacking experiment. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1906.11228v3-abstract-full').style.display = 'none'; document.getElementById('1906.11228v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 May, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 June, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Robotics Science and Systems 2020</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1906.07516">arXiv:1906.07516</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1906.07516">pdf</a>, <a href="https://arxiv.org/format/1906.07516">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Robust Reinforcement Learning for Continuous Control with Model Misspecification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Mankowitz%2C+D+J">Daniel J. Mankowitz</a>, <a href="/search/cs?searchtype=author&amp;query=Levine%2C+N">Nir Levine</a>, <a href="/search/cs?searchtype=author&amp;query=Jeong%2C+R">Rae Jeong</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+Y">Yuanyuan Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Kay%2C+J">Jackie Kay</a>, <a href="/search/cs?searchtype=author&amp;query=Abdolmaleki%2C+A">Abbas Abdolmaleki</a>, <a href="/search/cs?searchtype=author&amp;query=Springenberg%2C+J+T">Jost Tobias Springenberg</a>, <a href="/search/cs?searchtype=author&amp;query=Mann%2C+T">Timothy Mann</a>, <a href="/search/cs?searchtype=author&amp;query=Hester%2C+T">Todd Hester</a>, <a href="/search/cs?searchtype=author&amp;query=Riedmiller%2C+M">Martin Riedmiller</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1906.07516v2-abstract-short" style="display: inline;"> We provide a framework for incorporating robustness -- to perturbations in the transition dynamics which we refer to as model misspecification -- into continuous control Reinforcement Learning (RL) algorithms. 
We specifically focus on incorporating robustness into a state-of-the-art continuous control RL algorithm called Maximum a-posteriori Policy Optimization (MPO). We achieve this by learning a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1906.07516v2-abstract-full').style.display = 'inline'; document.getElementById('1906.07516v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1906.07516v2-abstract-full" style="display: none;"> We provide a framework for incorporating robustness -- to perturbations in the transition dynamics which we refer to as model misspecification -- into continuous control Reinforcement Learning (RL) algorithms. We specifically focus on incorporating robustness into a state-of-the-art continuous control RL algorithm called Maximum a-posteriori Policy Optimization (MPO). We achieve this by learning a policy that optimizes for a worst case expected return objective and derive a corresponding robust entropy-regularized Bellman contraction operator. In addition, we introduce a less conservative, soft-robust, entropy-regularized objective with a corresponding Bellman operator. We show that both, robust and soft-robust policies, outperform their non-robust counterparts in nine Mujoco domains with environment perturbations. In addition, we show improved robust performance on a high-dimensional, simulated, dexterous robotic hand. Finally, we present multiple investigative experiments that provide a deeper insight into the robustness framework. This includes an adaptation to another continuous control RL algorithm as well as learning the uncertainty set from offline data. Performance videos can be found online at https://sites.google.com/view/robust-rl. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1906.07516v2-abstract-full').style.display = 'none'; document.getElementById('1906.07516v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 June, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2019. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1902.04706">arXiv:1902.04706</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1902.04706">pdf</a>, <a href="https://arxiv.org/format/1902.04706">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Simultaneously Learning Vision and Feature-based Control Policies for Real-world Ball-in-a-Cup </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Schwab%2C+D">Devin Schwab</a>, <a href="/search/cs?searchtype=author&amp;query=Springenberg%2C+T">Tobias Springenberg</a>, <a href="/search/cs?searchtype=author&amp;query=Martins%2C+M+F">Murilo F. 
Martins</a>, <a href="/search/cs?searchtype=author&amp;query=Lampe%2C+T">Thomas Lampe</a>, <a href="/search/cs?searchtype=author&amp;query=Neunert%2C+M">Michael Neunert</a>, <a href="/search/cs?searchtype=author&amp;query=Abdolmaleki%2C+A">Abbas Abdolmaleki</a>, <a href="/search/cs?searchtype=author&amp;query=Hertweck%2C+T">Tim Hertweck</a>, <a href="/search/cs?searchtype=author&amp;query=Hafner%2C+R">Roland Hafner</a>, <a href="/search/cs?searchtype=author&amp;query=Nori%2C+F">Francesco Nori</a>, <a href="/search/cs?searchtype=author&amp;query=Riedmiller%2C+M">Martin Riedmiller</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1902.04706v2-abstract-short" style="display: inline;"> We present a method for fast training of vision based control policies on real robots. The key idea behind our method is to perform multi-task Reinforcement Learning with auxiliary tasks that differ not only in the reward to be optimized but also in the state-space in which they operate. In particular, we allow auxiliary task policies to utilize task features that are available only at training-ti&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1902.04706v2-abstract-full').style.display = 'inline'; document.getElementById('1902.04706v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1902.04706v2-abstract-full" style="display: none;"> We present a method for fast training of vision based control policies on real robots. The key idea behind our method is to perform multi-task Reinforcement Learning with auxiliary tasks that differ not only in the reward to be optimized but also in the state-space in which they operate. In particular, we allow auxiliary task policies to utilize task features that are available only at training-time. This allows for fast learning of auxiliary policies, which subsequently generate good data for training the main, vision-based control policies. This method can be seen as an extension of the Scheduled Auxiliary Control (SAC-X) framework. We demonstrate the efficacy of our method by using both a simulated and real-world Ball-in-a-Cup game controlled by a robot arm. In simulation, our approach leads to significant learning speed-ups when compared to standard SAC-X. On the real robot we show that the task can be learned from-scratch, i.e., with no transfer from simulation and no imitation learning. Videos of our learned policies running on the real robot can be found at https://sites.google.com/view/rss-2019-sawyer-bic/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1902.04706v2-abstract-full').style.display = 'none'; document.getElementById('1902.04706v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 February, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2019. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Videos can be found at https://sites.google.com/view/rss-2019-sawyer-bic/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1901.00943">arXiv:1901.00943</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1901.00943">pdf</a>, <a href="https://arxiv.org/format/1901.00943">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Self-supervised Learning of Image Embedding for Continuous Control </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Florensa%2C+C">Carlos Florensa</a>, <a href="/search/cs?searchtype=author&amp;query=Degrave%2C+J">Jonas Degrave</a>, <a href="/search/cs?searchtype=author&amp;query=Heess%2C+N">Nicolas Heess</a>, <a href="/search/cs?searchtype=author&amp;query=Springenberg%2C+J+T">Jost Tobias Springenberg</a>, <a href="/search/cs?searchtype=author&amp;query=Riedmiller%2C+M">Martin Riedmiller</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1901.00943v1-abstract-short" style="display: inline;"> Operating directly from raw high dimensional sensory inputs like images is still a challenge for robotic control. Recently, Reinforcement Learning methods have been proposed to solve specific tasks end-to-end, from pixels to torques. However, these approaches assume the access to a specified reward which may require specialized instrumentation of the environment. Furthermore, the obtained policy a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1901.00943v1-abstract-full').style.display = 'inline'; document.getElementById('1901.00943v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1901.00943v1-abstract-full" style="display: none;"> Operating directly from raw high dimensional sensory inputs like images is still a challenge for robotic control. Recently, Reinforcement Learning methods have been proposed to solve specific tasks end-to-end, from pixels to torques. However, these approaches assume the access to a specified reward which may require specialized instrumentation of the environment. Furthermore, the obtained policy and representations tend to be task specific and may not transfer well. In this work we investigate completely self-supervised learning of a general image embedding and control primitives, based on finding the shortest time to reach any state. We also introduce a new structure for the state-action value function that builds a connection between model-free and model-based methods, and improves the performance of the learning algorithm. We experimentally demonstrate these findings in three simulated robotic tasks. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1901.00943v1-abstract-full').style.display = 'none'; document.getElementById('1901.00943v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 January, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Contributed talk at Inference to Control workshop at NeurIPS2018</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1812.02256">arXiv:1812.02256</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1812.02256">pdf</a>, <a href="https://arxiv.org/format/1812.02256">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Relative Entropy Regularized Policy Iteration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Abdolmaleki%2C+A">Abbas Abdolmaleki</a>, <a href="/search/cs?searchtype=author&amp;query=Springenberg%2C+J+T">Jost Tobias Springenberg</a>, <a href="/search/cs?searchtype=author&amp;query=Degrave%2C+J">Jonas Degrave</a>, <a href="/search/cs?searchtype=author&amp;query=Bohez%2C+S">Steven Bohez</a>, <a href="/search/cs?searchtype=author&amp;query=Tassa%2C+Y">Yuval Tassa</a>, <a href="/search/cs?searchtype=author&amp;query=Belov%2C+D">Dan Belov</a>, <a href="/search/cs?searchtype=author&amp;query=Heess%2C+N">Nicolas Heess</a>, <a href="/search/cs?searchtype=author&amp;query=Riedmiller%2C+M">Martin Riedmiller</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1812.02256v1-abstract-short" style="display: inline;"> We present an off-policy actor-critic algorithm for Reinforcement Learning (RL) that combines ideas from gradient-free optimization via stochastic search with learned action-value function. The result is a simple procedure consisting of three steps: i) policy evaluation by estimating a parametric action-value function; ii) policy improvement via the estimation of a local non-parametric policy; and&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1812.02256v1-abstract-full').style.display = 'inline'; document.getElementById('1812.02256v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1812.02256v1-abstract-full" style="display: none;"> We present an off-policy actor-critic algorithm for Reinforcement Learning (RL) that combines ideas from gradient-free optimization via stochastic search with learned action-value function. The result is a simple procedure consisting of three steps: i) policy evaluation by estimating a parametric action-value function; ii) policy improvement via the estimation of a local non-parametric policy; and iii) generalization by fitting a parametric policy. 
Each step can be implemented in different ways, giving rise to several algorithm variants. Our algorithm draws on connections to existing literature on black-box optimization and &#39;RL as an inference&#39; and it can be seen either as an extension of the Maximum a Posteriori Policy Optimisation algorithm (MPO) [Abdolmaleki et al., 2018a], or as an extension of Trust Region Covariance Matrix Adaptation Evolutionary Strategy (CMA-ES) [Abdolmaleki et al., 2017b; Hansen et al., 1997] to a policy iteration scheme. Our comparison on 31 continuous control tasks from parkour suite [Heess et al., 2017], DeepMind control suite [Tassa et al., 2018] and OpenAI Gym [Brockman et al., 2016] with diverse properties, limited amount of compute and a single set of hyperparameters, demonstrate the effectiveness of our method and the state of art results. Videos, summarizing results, can be found at goo.gl/HtvJKR . <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1812.02256v1-abstract-full').style.display = 'none'; document.getElementById('1812.02256v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 December, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2018. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1806.06920">arXiv:1806.06920</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1806.06920">pdf</a>, <a href="https://arxiv.org/format/1806.06920">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Maximum a Posteriori Policy Optimisation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Abdolmaleki%2C+A">Abbas Abdolmaleki</a>, <a href="/search/cs?searchtype=author&amp;query=Springenberg%2C+J+T">Jost Tobias Springenberg</a>, <a href="/search/cs?searchtype=author&amp;query=Tassa%2C+Y">Yuval Tassa</a>, <a href="/search/cs?searchtype=author&amp;query=Munos%2C+R">Remi Munos</a>, <a href="/search/cs?searchtype=author&amp;query=Heess%2C+N">Nicolas Heess</a>, <a href="/search/cs?searchtype=author&amp;query=Riedmiller%2C+M">Martin Riedmiller</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1806.06920v1-abstract-short" style="display: inline;"> We introduce a new algorithm for reinforcement learning called Maximum aposteriori Policy Optimisation (MPO) based on coordinate ascent on a relative entropy objective. We show that several existing methods can directly be related to our derivation. We develop two off-policy algorithms and demonstrate that they are competitive with the state-of-the-art in deep reinforcement learning. 
In particular&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1806.06920v1-abstract-full').style.display = 'inline'; document.getElementById('1806.06920v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1806.06920v1-abstract-full" style="display: none;"> We introduce a new algorithm for reinforcement learning called Maximum aposteriori Policy Optimisation (MPO) based on coordinate ascent on a relative entropy objective. We show that several existing methods can directly be related to our derivation. We develop two off-policy algorithms and demonstrate that they are competitive with the state-of-the-art in deep reinforcement learning. In particular, for continuous control, our method outperforms existing methods with respect to sample efficiency, premature convergence and robustness to hyperparameter settings while achieving similar or better final performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1806.06920v1-abstract-full').style.display = 'none'; document.getElementById('1806.06920v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 June, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2018. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1806.01242">arXiv:1806.01242</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1806.01242">pdf</a>, <a href="https://arxiv.org/format/1806.01242">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Graph networks as learnable physics engines for inference and control </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sanchez-Gonzalez%2C+A">Alvaro Sanchez-Gonzalez</a>, <a href="/search/cs?searchtype=author&amp;query=Heess%2C+N">Nicolas Heess</a>, <a href="/search/cs?searchtype=author&amp;query=Springenberg%2C+J+T">Jost Tobias Springenberg</a>, <a href="/search/cs?searchtype=author&amp;query=Merel%2C+J">Josh Merel</a>, <a href="/search/cs?searchtype=author&amp;query=Riedmiller%2C+M">Martin Riedmiller</a>, <a href="/search/cs?searchtype=author&amp;query=Hadsell%2C+R">Raia Hadsell</a>, <a href="/search/cs?searchtype=author&amp;query=Battaglia%2C+P">Peter Battaglia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1806.01242v1-abstract-short" style="display: inline;"> Understanding and interacting with everyday physical scenes requires rich knowledge about the structure of the world, represented either implicitly in a value or policy function, or explicitly in a transition model. 
Here we introduce a new class of learnable models--based on graph networks--which implement an inductive bias for object- and relation-centric representations of complex, dynamical sys&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1806.01242v1-abstract-full').style.display = 'inline'; document.getElementById('1806.01242v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1806.01242v1-abstract-full" style="display: none;"> Understanding and interacting with everyday physical scenes requires rich knowledge about the structure of the world, represented either implicitly in a value or policy function, or explicitly in a transition model. Here we introduce a new class of learnable models--based on graph networks--which implement an inductive bias for object- and relation-centric representations of complex, dynamical systems. Our results show that as a forward model, our approach supports accurate predictions from real and simulated data, and surprisingly strong and efficient generalization, across eight distinct physical systems which we varied parametrically and structurally. We also found that our inference model can perform system identification. Our models are also differentiable, and support online planning via gradient-based trajectory optimization, as well as offline policy optimization. Our framework offers new opportunities for harnessing and exploiting rich knowledge about the world, and takes a key step toward building machines with more human-like representations of the world. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1806.01242v1-abstract-full').style.display = 'none'; document.getElementById('1806.01242v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 June, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2018. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICML 2018</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Riedmiller%2C+M&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Riedmiller%2C+M&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Riedmiller%2C+M&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> </div> </main> </body> </html>
