CINXE.COM
Search | arXiv e-print repository
<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–16 of 16 results for author: <span class="mathjax">Haarnoja, T</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Haarnoja%2C+T">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Haarnoja, T"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Haarnoja%2C+T&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Haarnoja, T"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.02425">arXiv:2405.02425</a> <span> [<a href="https://arxiv.org/pdf/2405.02425">pdf</a>, <a href="https://arxiv.org/format/2405.02425">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Learning Robot Soccer from Egocentric Vision with Deep Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tirumala%2C+D">Dhruva Tirumala</a>, <a href="/search/cs?searchtype=author&query=Wulfmeier%2C+M">Markus Wulfmeier</a>, <a href="/search/cs?searchtype=author&query=Moran%2C+B">Ben Moran</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+S">Sandy Huang</a>, <a href="/search/cs?searchtype=author&query=Humplik%2C+J">Jan Humplik</a>, <a href="/search/cs?searchtype=author&query=Lever%2C+G">Guy Lever</a>, <a href="/search/cs?searchtype=author&query=Haarnoja%2C+T">Tuomas Haarnoja</a>, <a href="/search/cs?searchtype=author&query=Hasenclever%2C+L">Leonard Hasenclever</a>, <a href="/search/cs?searchtype=author&query=Byravan%2C+A">Arunkumar Byravan</a>, <a href="/search/cs?searchtype=author&query=Batchelor%2C+N">Nathan Batchelor</a>, <a href="/search/cs?searchtype=author&query=Sreendra%2C+N">Neil Sreendra</a>, <a href="/search/cs?searchtype=author&query=Patel%2C+K">Kushal Patel</a>, <a href="/search/cs?searchtype=author&query=Gwira%2C+M">Marlon Gwira</a>, <a href="/search/cs?searchtype=author&query=Nori%2C+F">Francesco Nori</a>, <a href="/search/cs?searchtype=author&query=Riedmiller%2C+M">Martin Riedmiller</a>, <a href="/search/cs?searchtype=author&query=Heess%2C+N">Nicolas Heess</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.02425v1-abstract-short" style="display: inline;"> We apply multi-agent deep reinforcement learning (RL) to train end-to-end robot soccer policies with fully onboard computation and sensing via egocentric RGB vision. This setting reflects many challenges of real-world robotics, including active perception, agile full-body control, and long-horizon planning in a dynamic, partially-observable, multi-agent domain. We rely on large-scale, simulation-b… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.02425v1-abstract-full').style.display = 'inline'; document.getElementById('2405.02425v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.02425v1-abstract-full" style="display: none;"> We apply multi-agent deep reinforcement learning (RL) to train end-to-end robot soccer policies with fully onboard computation and sensing via egocentric RGB vision. This setting reflects many challenges of real-world robotics, including active perception, agile full-body control, and long-horizon planning in a dynamic, partially-observable, multi-agent domain. We rely on large-scale, simulation-based data generation to obtain complex behaviors from egocentric vision which can be successfully transferred to physical robots using low-cost sensors. To achieve adequate visual realism, our simulation combines rigid-body physics with learned, realistic rendering via multiple Neural Radiance Fields (NeRFs). We combine teacher-based multi-agent RL and cross-experiment data reuse to enable the discovery of sophisticated soccer strategies. We analyze active-perception behaviors including object tracking and ball seeking that emerge when simply optimizing perception-agnostic soccer play. The agents display equivalent levels of performance and agility as policies with access to privileged, ground-truth state. To our knowledge, this paper constitutes a first demonstration of end-to-end training for multi-agent robot soccer, mapping raw pixel observations to joint-level actions, that can be deployed in the real world. Videos of the game-play and analyses can be seen on our website https://sites.google.com/view/vision-soccer . <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.02425v1-abstract-full').style.display = 'none'; document.getElementById('2405.02425v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.15951">arXiv:2311.15951</a> <span> [<a href="https://arxiv.org/pdf/2311.15951">pdf</a>, <a href="https://arxiv.org/format/2311.15951">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Replay across Experiments: A Natural Extension of Off-Policy RL </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tirumala%2C+D">Dhruva Tirumala</a>, <a href="/search/cs?searchtype=author&query=Lampe%2C+T">Thomas Lampe</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J+E">Jose Enrique Chen</a>, <a href="/search/cs?searchtype=author&query=Haarnoja%2C+T">Tuomas Haarnoja</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+S">Sandy Huang</a>, <a href="/search/cs?searchtype=author&query=Lever%2C+G">Guy Lever</a>, <a href="/search/cs?searchtype=author&query=Moran%2C+B">Ben Moran</a>, <a href="/search/cs?searchtype=author&query=Hertweck%2C+T">Tim Hertweck</a>, <a href="/search/cs?searchtype=author&query=Hasenclever%2C+L">Leonard Hasenclever</a>, <a href="/search/cs?searchtype=author&query=Riedmiller%2C+M">Martin Riedmiller</a>, <a href="/search/cs?searchtype=author&query=Heess%2C+N">Nicolas Heess</a>, <a href="/search/cs?searchtype=author&query=Wulfmeier%2C+M">Markus Wulfmeier</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.15951v2-abstract-short" style="display: inline;"> Replaying data is a principal mechanism underlying the stability and data efficiency of off-policy reinforcement learning (RL). We present an effective yet simple framework to extend the use of replays across multiple experiments, minimally adapting the RL workflow for sizeable improvements in controller performance and research iteration times. At its core, Replay Across Experiments (RaE) involve… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.15951v2-abstract-full').style.display = 'inline'; document.getElementById('2311.15951v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.15951v2-abstract-full" style="display: none;"> Replaying data is a principal mechanism underlying the stability and data efficiency of off-policy reinforcement learning (RL). We present an effective yet simple framework to extend the use of replays across multiple experiments, minimally adapting the RL workflow for sizeable improvements in controller performance and research iteration times. At its core, Replay Across Experiments (RaE) involves reusing experience from previous experiments to improve exploration and bootstrap learning while reducing required changes to a minimum in comparison to prior work. We empirically show benefits across a number of RL algorithms and challenging control domains spanning both locomotion and manipulation, including hard exploration tasks from egocentric vision. Through comprehensive ablations, we demonstrate robustness to the quality and amount of data available and various hyperparameter choices. Finally, we discuss how our approach can be applied more broadly across research life cycles and can increase resilience by reloading data across random seeds or hyperparameter variations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.15951v2-abstract-full').style.display = 'none'; document.getElementById('2311.15951v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2304.13653">arXiv:2304.13653</a> <span> [<a href="https://arxiv.org/pdf/2304.13653">pdf</a>, <a href="https://arxiv.org/format/2304.13653">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1126/scirobotics.adi8022">10.1126/scirobotics.adi8022 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Learning Agile Soccer Skills for a Bipedal Robot with Deep Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Haarnoja%2C+T">Tuomas Haarnoja</a>, <a href="/search/cs?searchtype=author&query=Moran%2C+B">Ben Moran</a>, <a href="/search/cs?searchtype=author&query=Lever%2C+G">Guy Lever</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+S+H">Sandy H. Huang</a>, <a href="/search/cs?searchtype=author&query=Tirumala%2C+D">Dhruva Tirumala</a>, <a href="/search/cs?searchtype=author&query=Humplik%2C+J">Jan Humplik</a>, <a href="/search/cs?searchtype=author&query=Wulfmeier%2C+M">Markus Wulfmeier</a>, <a href="/search/cs?searchtype=author&query=Tunyasuvunakool%2C+S">Saran Tunyasuvunakool</a>, <a href="/search/cs?searchtype=author&query=Siegel%2C+N+Y">Noah Y. Siegel</a>, <a href="/search/cs?searchtype=author&query=Hafner%2C+R">Roland Hafner</a>, <a href="/search/cs?searchtype=author&query=Bloesch%2C+M">Michael Bloesch</a>, <a href="/search/cs?searchtype=author&query=Hartikainen%2C+K">Kristian Hartikainen</a>, <a href="/search/cs?searchtype=author&query=Byravan%2C+A">Arunkumar Byravan</a>, <a href="/search/cs?searchtype=author&query=Hasenclever%2C+L">Leonard Hasenclever</a>, <a href="/search/cs?searchtype=author&query=Tassa%2C+Y">Yuval Tassa</a>, <a href="/search/cs?searchtype=author&query=Sadeghi%2C+F">Fereshteh Sadeghi</a>, <a href="/search/cs?searchtype=author&query=Batchelor%2C+N">Nathan Batchelor</a>, <a href="/search/cs?searchtype=author&query=Casarini%2C+F">Federico Casarini</a>, <a href="/search/cs?searchtype=author&query=Saliceti%2C+S">Stefano Saliceti</a>, <a href="/search/cs?searchtype=author&query=Game%2C+C">Charles Game</a>, <a href="/search/cs?searchtype=author&query=Sreendra%2C+N">Neil Sreendra</a>, <a href="/search/cs?searchtype=author&query=Patel%2C+K">Kushal Patel</a>, <a href="/search/cs?searchtype=author&query=Gwira%2C+M">Marlon Gwira</a>, <a href="/search/cs?searchtype=author&query=Huber%2C+A">Andrea Huber</a>, <a href="/search/cs?searchtype=author&query=Hurley%2C+N">Nicole Hurley</a> , et al. (3 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2304.13653v2-abstract-short" style="display: inline;"> We investigate whether Deep Reinforcement Learning (Deep RL) is able to synthesize sophisticated and safe movement skills for a low-cost, miniature humanoid robot that can be composed into complex behavioral strategies in dynamic environments. We used Deep RL to train a humanoid robot with 20 actuated joints to play a simplified one-versus-one (1v1) soccer game. The resulting agent exhibits robust… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.13653v2-abstract-full').style.display = 'inline'; document.getElementById('2304.13653v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2304.13653v2-abstract-full" style="display: none;"> We investigate whether Deep Reinforcement Learning (Deep RL) is able to synthesize sophisticated and safe movement skills for a low-cost, miniature humanoid robot that can be composed into complex behavioral strategies in dynamic environments. We used Deep RL to train a humanoid robot with 20 actuated joints to play a simplified one-versus-one (1v1) soccer game. The resulting agent exhibits robust and dynamic movement skills such as rapid fall recovery, walking, turning, kicking and more; and it transitions between them in a smooth, stable, and efficient manner. The agent's locomotion and tactical behavior adapts to specific game contexts in a way that would be impractical to manually design. The agent also developed a basic strategic understanding of the game, and learned, for instance, to anticipate ball movements and to block opponent shots. Our agent was trained in simulation and transferred to real robots zero-shot. We found that a combination of sufficiently high-frequency control, targeted dynamics randomization, and perturbations during training in simulation enabled good-quality transfer. Although the robots are inherently fragile, basic regularization of the behavior during training led the robots to learn safe and effective movements while still performing in a dynamic and agile way -- well beyond what is intuitively expected from the robot. Indeed, in experiments, they walked 181% faster, turned 302% faster, took 63% less time to get up, and kicked a ball 34% faster than a scripted baseline, while efficiently combining the skills to achieve the longer term objectives. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.13653v2-abstract-full').style.display = 'none'; document.getElementById('2304.13653v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project website: https://sites.google.com/view/op3-soccer</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.13743">arXiv:2211.13743</a> <span> [<a href="https://arxiv.org/pdf/2211.13743">pdf</a>, <a href="https://arxiv.org/format/2211.13743">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> SkillS: Adaptive Skill Sequencing for Efficient Temporally-Extended Exploration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Vezzani%2C+G">Giulia Vezzani</a>, <a href="/search/cs?searchtype=author&query=Tirumala%2C+D">Dhruva Tirumala</a>, <a href="/search/cs?searchtype=author&query=Wulfmeier%2C+M">Markus Wulfmeier</a>, <a href="/search/cs?searchtype=author&query=Rao%2C+D">Dushyant Rao</a>, <a href="/search/cs?searchtype=author&query=Abdolmaleki%2C+A">Abbas Abdolmaleki</a>, <a href="/search/cs?searchtype=author&query=Moran%2C+B">Ben Moran</a>, <a href="/search/cs?searchtype=author&query=Haarnoja%2C+T">Tuomas Haarnoja</a>, <a href="/search/cs?searchtype=author&query=Humplik%2C+J">Jan Humplik</a>, <a href="/search/cs?searchtype=author&query=Hafner%2C+R">Roland Hafner</a>, <a href="/search/cs?searchtype=author&query=Neunert%2C+M">Michael Neunert</a>, <a href="/search/cs?searchtype=author&query=Fantacci%2C+C">Claudio Fantacci</a>, <a href="/search/cs?searchtype=author&query=Hertweck%2C+T">Tim Hertweck</a>, <a href="/search/cs?searchtype=author&query=Lampe%2C+T">Thomas Lampe</a>, <a href="/search/cs?searchtype=author&query=Sadeghi%2C+F">Fereshteh Sadeghi</a>, <a href="/search/cs?searchtype=author&query=Heess%2C+N">Nicolas Heess</a>, <a href="/search/cs?searchtype=author&query=Riedmiller%2C+M">Martin Riedmiller</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.13743v3-abstract-short" style="display: inline;"> The ability to effectively reuse prior knowledge is a key requirement when building general and flexible Reinforcement Learning (RL) agents. Skill reuse is one of the most common approaches, but current methods have considerable limitations.For example, fine-tuning an existing policy frequently fails, as the policy can degrade rapidly early in training. In a similar vein, distillation of expert be… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.13743v3-abstract-full').style.display = 'inline'; document.getElementById('2211.13743v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.13743v3-abstract-full" style="display: none;"> The ability to effectively reuse prior knowledge is a key requirement when building general and flexible Reinforcement Learning (RL) agents. Skill reuse is one of the most common approaches, but current methods have considerable limitations.For example, fine-tuning an existing policy frequently fails, as the policy can degrade rapidly early in training. In a similar vein, distillation of expert behavior can lead to poor results when given sub-optimal experts. We compare several common approaches for skill transfer on multiple domains including changes in task and system dynamics. We identify how existing methods can fail and introduce an alternative approach to mitigate these problems. Our approach learns to sequence existing temporally-extended skills for exploration but learns the final policy directly from the raw experience. This conceptual split enables rapid adaptation and thus efficient data collection but without constraining the final solution.It significantly outperforms many classical methods across a suite of evaluation tasks and we use a broad set of ablations to highlight the importance of differentc omponents of our method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.13743v3-abstract-full').style.display = 'none'; document.getElementById('2211.13743v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 January, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2210.04932">arXiv:2210.04932</a> <span> [<a href="https://arxiv.org/pdf/2210.04932">pdf</a>, <a href="https://arxiv.org/format/2210.04932">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> NeRF2Real: Sim2real Transfer of Vision-guided Bipedal Motion Skills using Neural Radiance Fields </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Byravan%2C+A">Arunkumar Byravan</a>, <a href="/search/cs?searchtype=author&query=Humplik%2C+J">Jan Humplik</a>, <a href="/search/cs?searchtype=author&query=Hasenclever%2C+L">Leonard Hasenclever</a>, <a href="/search/cs?searchtype=author&query=Brussee%2C+A">Arthur Brussee</a>, <a href="/search/cs?searchtype=author&query=Nori%2C+F">Francesco Nori</a>, <a href="/search/cs?searchtype=author&query=Haarnoja%2C+T">Tuomas Haarnoja</a>, <a href="/search/cs?searchtype=author&query=Moran%2C+B">Ben Moran</a>, <a href="/search/cs?searchtype=author&query=Bohez%2C+S">Steven Bohez</a>, <a href="/search/cs?searchtype=author&query=Sadeghi%2C+F">Fereshteh Sadeghi</a>, <a href="/search/cs?searchtype=author&query=Vujatovic%2C+B">Bojan Vujatovic</a>, <a href="/search/cs?searchtype=author&query=Heess%2C+N">Nicolas Heess</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2210.04932v1-abstract-short" style="display: inline;"> We present a system for applying sim2real approaches to "in the wild" scenes with realistic visuals, and to policies which rely on active perception using RGB cameras. Given a short video of a static scene collected using a generic phone, we learn the scene's contact geometry and a function for novel view synthesis using a Neural Radiance Field (NeRF). We augment the NeRF rendering of the static s… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.04932v1-abstract-full').style.display = 'inline'; document.getElementById('2210.04932v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2210.04932v1-abstract-full" style="display: none;"> We present a system for applying sim2real approaches to "in the wild" scenes with realistic visuals, and to policies which rely on active perception using RGB cameras. Given a short video of a static scene collected using a generic phone, we learn the scene's contact geometry and a function for novel view synthesis using a Neural Radiance Field (NeRF). We augment the NeRF rendering of the static scene by overlaying the rendering of other dynamic objects (e.g. the robot's own body, a ball). A simulation is then created using the rendering engine in a physics simulator which computes contact dynamics from the static scene geometry (estimated from the NeRF volume density) and the dynamic objects' geometry and physical properties (assumed known). We demonstrate that we can use this simulation to learn vision-based whole body navigation and ball pushing policies for a 20 degrees of freedom humanoid robot with an actuated head-mounted RGB camera, and we successfully transfer these policies to a real robot. Project video is available at https://sites.google.com/view/nerf2real/home <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.04932v1-abstract-full').style.display = 'none'; document.getElementById('2210.04932v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2204.05893">arXiv:2204.05893</a> <span> [<a href="https://arxiv.org/pdf/2204.05893">pdf</a>, <a href="https://arxiv.org/format/2204.05893">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Forgetting and Imbalance in Robot Lifelong Learning with Off-policy Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+W">Wenxuan Zhou</a>, <a href="/search/cs?searchtype=author&query=Bohez%2C+S">Steven Bohez</a>, <a href="/search/cs?searchtype=author&query=Humplik%2C+J">Jan Humplik</a>, <a href="/search/cs?searchtype=author&query=Abdolmaleki%2C+A">Abbas Abdolmaleki</a>, <a href="/search/cs?searchtype=author&query=Rao%2C+D">Dushyant Rao</a>, <a href="/search/cs?searchtype=author&query=Wulfmeier%2C+M">Markus Wulfmeier</a>, <a href="/search/cs?searchtype=author&query=Haarnoja%2C+T">Tuomas Haarnoja</a>, <a href="/search/cs?searchtype=author&query=Heess%2C+N">Nicolas Heess</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2204.05893v2-abstract-short" style="display: inline;"> Robots will experience non-stationary environment dynamics throughout their lifetime: the robot dynamics can change due to wear and tear, or its surroundings may change over time. Eventually, the robots should perform well in all of the environment variations it has encountered. At the same time, it should still be able to learn fast in a new environment. We identify two challenges in Reinforcemen… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.05893v2-abstract-full').style.display = 'inline'; document.getElementById('2204.05893v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2204.05893v2-abstract-full" style="display: none;"> Robots will experience non-stationary environment dynamics throughout their lifetime: the robot dynamics can change due to wear and tear, or its surroundings may change over time. Eventually, the robots should perform well in all of the environment variations it has encountered. At the same time, it should still be able to learn fast in a new environment. We identify two challenges in Reinforcement Learning (RL) under such a lifelong learning setting with off-policy data: first, existing off-policy algorithms struggle with the trade-off between being conservative to maintain good performance in the old environment and learning efficiently in the new environment, despite keeping all the data in the replay buffer. We propose the Offline Distillation Pipeline to break this trade-off by separating the training procedure into an online interaction phase and an offline distillation phase.Second, we find that training with the imbalanced off-policy data from multiple environments across the lifetime creates a significant performance drop. We identify that this performance drop is caused by the combination of the imbalanced quality and size among the datasets which exacerbate the extrapolation error of the Q-function. During the distillation phase, we apply a simple fix to the issue by keeping the policy closer to the behavior policy that generated the data. In the experiments, we demonstrate these two challenges and the proposed solutions with a simulated bipedal robot walk-ing task across various environment changes. We show that the Offline Distillation Pipeline achieves better performance across all the encountered environments without affecting data collection. We also provide a comprehensive empirical study to support our hypothesis on the data imbalance issue. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.05893v2-abstract-full').style.display = 'none'; document.getElementById('2204.05893v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 August, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 April, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published at 1st Conference on Lifelong Learning Agents, 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2203.17138">arXiv:2203.17138</a> <span> [<a href="https://arxiv.org/pdf/2203.17138">pdf</a>, <a href="https://arxiv.org/format/2203.17138">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Imitate and Repurpose: Learning Reusable Robot Movement Skills From Human and Animal Behaviors </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bohez%2C+S">Steven Bohez</a>, <a href="/search/cs?searchtype=author&query=Tunyasuvunakool%2C+S">Saran Tunyasuvunakool</a>, <a href="/search/cs?searchtype=author&query=Brakel%2C+P">Philemon Brakel</a>, <a href="/search/cs?searchtype=author&query=Sadeghi%2C+F">Fereshteh Sadeghi</a>, <a href="/search/cs?searchtype=author&query=Hasenclever%2C+L">Leonard Hasenclever</a>, <a href="/search/cs?searchtype=author&query=Tassa%2C+Y">Yuval Tassa</a>, <a href="/search/cs?searchtype=author&query=Parisotto%2C+E">Emilio Parisotto</a>, <a href="/search/cs?searchtype=author&query=Humplik%2C+J">Jan Humplik</a>, <a href="/search/cs?searchtype=author&query=Haarnoja%2C+T">Tuomas Haarnoja</a>, <a href="/search/cs?searchtype=author&query=Hafner%2C+R">Roland Hafner</a>, <a href="/search/cs?searchtype=author&query=Wulfmeier%2C+M">Markus Wulfmeier</a>, <a href="/search/cs?searchtype=author&query=Neunert%2C+M">Michael Neunert</a>, <a href="/search/cs?searchtype=author&query=Moran%2C+B">Ben Moran</a>, <a href="/search/cs?searchtype=author&query=Siegel%2C+N">Noah Siegel</a>, <a href="/search/cs?searchtype=author&query=Huber%2C+A">Andrea Huber</a>, <a href="/search/cs?searchtype=author&query=Romano%2C+F">Francesco Romano</a>, <a href="/search/cs?searchtype=author&query=Batchelor%2C+N">Nathan Batchelor</a>, <a href="/search/cs?searchtype=author&query=Casarini%2C+F">Federico Casarini</a>, <a href="/search/cs?searchtype=author&query=Merel%2C+J">Josh Merel</a>, <a href="/search/cs?searchtype=author&query=Hadsell%2C+R">Raia Hadsell</a>, <a href="/search/cs?searchtype=author&query=Heess%2C+N">Nicolas Heess</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2203.17138v1-abstract-short" style="display: inline;"> We investigate the use of prior knowledge of human and animal movement to learn reusable locomotion skills for real legged robots. Our approach builds upon previous work on imitating human or dog Motion Capture (MoCap) data to learn a movement skill module. Once learned, this skill module can be reused for complex downstream tasks. Importantly, due to the prior imposed by the MoCap data, our appro… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.17138v1-abstract-full').style.display = 'inline'; document.getElementById('2203.17138v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2203.17138v1-abstract-full" style="display: none;"> We investigate the use of prior knowledge of human and animal movement to learn reusable locomotion skills for real legged robots. Our approach builds upon previous work on imitating human or dog Motion Capture (MoCap) data to learn a movement skill module. Once learned, this skill module can be reused for complex downstream tasks. Importantly, due to the prior imposed by the MoCap data, our approach does not require extensive reward engineering to produce sensible and natural looking behavior at the time of reuse. This makes it easy to create well-regularized, task-oriented controllers that are suitable for deployment on real robots. We demonstrate how our skill module can be used for imitation, and train controllable walking and ball dribbling policies for both the ANYmal quadruped and OP3 humanoid. These policies are then deployed on hardware via zero-shot simulation-to-reality transfer. Accompanying videos are available at https://bit.ly/robot-npmp. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.17138v1-abstract-full').style.display = 'none'; document.getElementById('2203.17138v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">30 pages, 9 figures, 8 tables, 14 videos at https://bit.ly/robot-npmp , submitted to Science Robotics</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2105.12196">arXiv:2105.12196</a> <span> [<a href="https://arxiv.org/pdf/2105.12196">pdf</a>, <a href="https://arxiv.org/format/2105.12196">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> From Motor Control to Team Play in Simulated Humanoid Football </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+S">Siqi Liu</a>, <a href="/search/cs?searchtype=author&query=Lever%2C+G">Guy Lever</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhe Wang</a>, <a href="/search/cs?searchtype=author&query=Merel%2C+J">Josh Merel</a>, <a href="/search/cs?searchtype=author&query=Eslami%2C+S+M+A">S. M. Ali Eslami</a>, <a href="/search/cs?searchtype=author&query=Hennes%2C+D">Daniel Hennes</a>, <a href="/search/cs?searchtype=author&query=Czarnecki%2C+W+M">Wojciech M. Czarnecki</a>, <a href="/search/cs?searchtype=author&query=Tassa%2C+Y">Yuval Tassa</a>, <a href="/search/cs?searchtype=author&query=Omidshafiei%2C+S">Shayegan Omidshafiei</a>, <a href="/search/cs?searchtype=author&query=Abdolmaleki%2C+A">Abbas Abdolmaleki</a>, <a href="/search/cs?searchtype=author&query=Siegel%2C+N+Y">Noah Y. Siegel</a>, <a href="/search/cs?searchtype=author&query=Hasenclever%2C+L">Leonard Hasenclever</a>, <a href="/search/cs?searchtype=author&query=Marris%2C+L">Luke Marris</a>, <a href="/search/cs?searchtype=author&query=Tunyasuvunakool%2C+S">Saran Tunyasuvunakool</a>, <a href="/search/cs?searchtype=author&query=Song%2C+H+F">H. Francis Song</a>, <a href="/search/cs?searchtype=author&query=Wulfmeier%2C+M">Markus Wulfmeier</a>, <a href="/search/cs?searchtype=author&query=Muller%2C+P">Paul Muller</a>, <a href="/search/cs?searchtype=author&query=Haarnoja%2C+T">Tuomas Haarnoja</a>, <a href="/search/cs?searchtype=author&query=Tracey%2C+B+D">Brendan D. Tracey</a>, <a href="/search/cs?searchtype=author&query=Tuyls%2C+K">Karl Tuyls</a>, <a href="/search/cs?searchtype=author&query=Graepel%2C+T">Thore Graepel</a>, <a href="/search/cs?searchtype=author&query=Heess%2C+N">Nicolas Heess</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2105.12196v1-abstract-short" style="display: inline;"> Intelligent behaviour in the physical world exhibits structure at multiple spatial and temporal scales. Although movements are ultimately executed at the level of instantaneous muscle tensions or joint torques, they must be selected to serve goals defined on much longer timescales, and in terms of relations that extend far beyond the body itself, ultimately involving coordination with other agents… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2105.12196v1-abstract-full').style.display = 'inline'; document.getElementById('2105.12196v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2105.12196v1-abstract-full" style="display: none;"> Intelligent behaviour in the physical world exhibits structure at multiple spatial and temporal scales. Although movements are ultimately executed at the level of instantaneous muscle tensions or joint torques, they must be selected to serve goals defined on much longer timescales, and in terms of relations that extend far beyond the body itself, ultimately involving coordination with other agents. Recent research in artificial intelligence has shown the promise of learning-based approaches to the respective problems of complex movement, longer-term planning and multi-agent coordination. However, there is limited research aimed at their integration. We study this problem by training teams of physically simulated humanoid avatars to play football in a realistic virtual environment. We develop a method that combines imitation learning, single- and multi-agent reinforcement learning and population-based training, and makes use of transferable representations of behaviour for decision making at different levels of abstraction. In a sequence of stages, players first learn to control a fully articulated body to perform realistic, human-like movements such as running and turning; they then acquire mid-level football skills such as dribbling and shooting; finally, they develop awareness of others and play as a team, bridging the gap between low-level motor control at a timescale of milliseconds, and coordinated goal-directed behaviour as a team at the timescale of tens of seconds. We investigate the emergence of behaviours at different levels of abstraction, as well as the representations that underlie these behaviours using several analysis techniques, including statistics from real-world sports analytics. Our work constitutes a complete demonstration of integrated decision-making at multiple scales in a physically embodied multi-agent setting. See project video at https://youtu.be/KHMwq9pv7mg. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2105.12196v1-abstract-full').style.display = 'none'; document.getElementById('2105.12196v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 May, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1907.08225">arXiv:1907.08225</a> <span> [<a href="https://arxiv.org/pdf/1907.08225">pdf</a>, <a href="https://arxiv.org/format/1907.08225">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Dynamical Distance Learning for Semi-Supervised and Unsupervised Skill Discovery </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hartikainen%2C+K">Kristian Hartikainen</a>, <a href="/search/cs?searchtype=author&query=Geng%2C+X">Xinyang Geng</a>, <a href="/search/cs?searchtype=author&query=Haarnoja%2C+T">Tuomas Haarnoja</a>, <a href="/search/cs?searchtype=author&query=Levine%2C+S">Sergey Levine</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1907.08225v4-abstract-short" style="display: inline;"> Reinforcement learning requires manual specification of a reward function to learn a task. While in principle this reward function only needs to specify the task goal, in practice reinforcement learning can be very time-consuming or even infeasible unless the reward function is shaped so as to provide a smooth gradient towards a successful outcome. This shaping is difficult to specify by hand, par… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1907.08225v4-abstract-full').style.display = 'inline'; document.getElementById('1907.08225v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1907.08225v4-abstract-full" style="display: none;"> Reinforcement learning requires manual specification of a reward function to learn a task. While in principle this reward function only needs to specify the task goal, in practice reinforcement learning can be very time-consuming or even infeasible unless the reward function is shaped so as to provide a smooth gradient towards a successful outcome. This shaping is difficult to specify by hand, particularly when the task is learned from raw observations, such as images. In this paper, we study how we can automatically learn dynamical distances: a measure of the expected number of time steps to reach a given goal state from any other state. These dynamical distances can be used to provide well-shaped reward functions for reaching new goals, making it possible to learn complex tasks efficiently. We show that dynamical distances can be used in a semi-supervised regime, where unsupervised interaction with the environment is used to learn the dynamical distances, while a small amount of preference supervision is used to determine the task goal, without any manually engineered reward function or goal examples. We evaluate our method both on a real-world robot and in simulation. We show that our method can learn to turn a valve with a real-world 9-DoF hand, using raw image observations and just ten preference labels, without any other supervision. Videos of the learned skills can be found on the project website: https://sites.google.com/view/dynamical-distance-learning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1907.08225v4-abstract-full').style.display = 'none'; document.getElementById('1907.08225v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 February, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 July, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11+6 pages, 6+2 figures, last two authors (Tuomas Haarnoja, Sergey Levine) advised equally</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1812.11103">arXiv:1812.11103</a> <span> [<a href="https://arxiv.org/pdf/1812.11103">pdf</a>, <a href="https://arxiv.org/format/1812.11103">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Learning to Walk via Deep Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Haarnoja%2C+T">Tuomas Haarnoja</a>, <a href="/search/cs?searchtype=author&query=Ha%2C+S">Sehoon Ha</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+A">Aurick Zhou</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+J">Jie Tan</a>, <a href="/search/cs?searchtype=author&query=Tucker%2C+G">George Tucker</a>, <a href="/search/cs?searchtype=author&query=Levine%2C+S">Sergey Levine</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1812.11103v3-abstract-short" style="display: inline;"> Deep reinforcement learning (deep RL) holds the promise of automating the acquisition of complex controllers that can map sensory inputs directly to low-level actions. In the domain of robotic locomotion, deep RL could enable learning locomotion skills with minimal engineering and without an explicit model of the robot dynamics. Unfortunately, applying deep RL to real-world robotic tasks is except… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1812.11103v3-abstract-full').style.display = 'inline'; document.getElementById('1812.11103v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1812.11103v3-abstract-full" style="display: none;"> Deep reinforcement learning (deep RL) holds the promise of automating the acquisition of complex controllers that can map sensory inputs directly to low-level actions. In the domain of robotic locomotion, deep RL could enable learning locomotion skills with minimal engineering and without an explicit model of the robot dynamics. Unfortunately, applying deep RL to real-world robotic tasks is exceptionally difficult, primarily due to poor sample complexity and sensitivity to hyperparameters. While hyperparameters can be easily tuned in simulated domains, tuning may be prohibitively expensive on physical systems, such as legged robots, that can be damaged through extensive trial-and-error learning. In this paper, we propose a sample-efficient deep RL algorithm based on maximum entropy RL that requires minimal per-task tuning and only a modest number of trials to learn neural network policies. We apply this method to learning walking gaits on a real-world Minitaur robot. Our method can acquire a stable gait from scratch directly in the real world in about two hours, without relying on any model or simulation, and the resulting policy is robust to moderate variations in the environment. We further show that our algorithm achieves state-of-the-art performance on simulated benchmarks with a single set of hyperparameters. Videos of training and the learned policy can be found on the project website. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1812.11103v3-abstract-full').style.display = 'none'; document.getElementById('1812.11103v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 June, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 December, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2018. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">RSS 2019, https://sites.google.com/view/minitaur-locomotion/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1812.05905">arXiv:1812.05905</a> <span> [<a href="https://arxiv.org/pdf/1812.05905">pdf</a>, <a href="https://arxiv.org/format/1812.05905">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Soft Actor-Critic Algorithms and Applications </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Haarnoja%2C+T">Tuomas Haarnoja</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+A">Aurick Zhou</a>, <a href="/search/cs?searchtype=author&query=Hartikainen%2C+K">Kristian Hartikainen</a>, <a href="/search/cs?searchtype=author&query=Tucker%2C+G">George Tucker</a>, <a href="/search/cs?searchtype=author&query=Ha%2C+S">Sehoon Ha</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+J">Jie Tan</a>, <a href="/search/cs?searchtype=author&query=Kumar%2C+V">Vikash Kumar</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+H">Henry Zhu</a>, <a href="/search/cs?searchtype=author&query=Gupta%2C+A">Abhishek Gupta</a>, <a href="/search/cs?searchtype=author&query=Abbeel%2C+P">Pieter Abbeel</a>, <a href="/search/cs?searchtype=author&query=Levine%2C+S">Sergey Levine</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1812.05905v2-abstract-short" style="display: inline;"> Model-free deep reinforcement learning (RL) algorithms have been successfully applied to a range of challenging sequential decision making and control tasks. However, these methods typically suffer from two major challenges: high sample complexity and brittleness to hyperparameters. Both of these challenges limit the applicability of such methods to real-world domains. In this paper, we describe S… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1812.05905v2-abstract-full').style.display = 'inline'; document.getElementById('1812.05905v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1812.05905v2-abstract-full" style="display: none;"> Model-free deep reinforcement learning (RL) algorithms have been successfully applied to a range of challenging sequential decision making and control tasks. However, these methods typically suffer from two major challenges: high sample complexity and brittleness to hyperparameters. Both of these challenges limit the applicability of such methods to real-world domains. In this paper, we describe Soft Actor-Critic (SAC), our recently introduced off-policy actor-critic algorithm based on the maximum entropy RL framework. In this framework, the actor aims to simultaneously maximize expected return and entropy. That is, to succeed at the task while acting as randomly as possible. We extend SAC to incorporate a number of modifications that accelerate training and improve stability with respect to the hyperparameters, including a constrained formulation that automatically tunes the temperature hyperparameter. We systematically evaluate SAC on a range of benchmark tasks, as well as real-world challenging tasks such as locomotion for a quadrupedal robot and robotic manipulation with a dexterous hand. With these improvements, SAC achieves state-of-the-art performance, outperforming prior on-policy and off-policy methods in sample-efficiency and asymptotic performance. Furthermore, we demonstrate that, in contrast to other off-policy algorithms, our approach is very stable, achieving similar performance across different random seeds. These results suggest that SAC is a promising candidate for learning in real-world robotics tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1812.05905v2-abstract-full').style.display = 'none'; document.getElementById('1812.05905v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 January, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 December, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2018. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: substantial text overlap with arXiv:1801.01290</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1804.02808">arXiv:1804.02808</a> <span> [<a href="https://arxiv.org/pdf/1804.02808">pdf</a>, <a href="https://arxiv.org/format/1804.02808">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Latent Space Policies for Hierarchical Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Haarnoja%2C+T">Tuomas Haarnoja</a>, <a href="/search/cs?searchtype=author&query=Hartikainen%2C+K">Kristian Hartikainen</a>, <a href="/search/cs?searchtype=author&query=Abbeel%2C+P">Pieter Abbeel</a>, <a href="/search/cs?searchtype=author&query=Levine%2C+S">Sergey Levine</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1804.02808v2-abstract-short" style="display: inline;"> We address the problem of learning hierarchical deep neural network policies for reinforcement learning. In contrast to methods that explicitly restrict or cripple lower layers of a hierarchy to force them to use higher-level modulating signals, each layer in our framework is trained to directly solve the task, but acquires a range of diverse strategies via a maximum entropy reinforcement learning… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1804.02808v2-abstract-full').style.display = 'inline'; document.getElementById('1804.02808v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1804.02808v2-abstract-full" style="display: none;"> We address the problem of learning hierarchical deep neural network policies for reinforcement learning. In contrast to methods that explicitly restrict or cripple lower layers of a hierarchy to force them to use higher-level modulating signals, each layer in our framework is trained to directly solve the task, but acquires a range of diverse strategies via a maximum entropy reinforcement learning objective. Each layer is also augmented with latent random variables, which are sampled from a prior distribution during the training of that layer. The maximum entropy objective causes these latent variables to be incorporated into the layer's policy, and the higher level layer can directly control the behavior of the lower layer through this latent space. Furthermore, by constraining the mapping from latent variables to actions to be invertible, higher layers retain full expressivity: neither the higher layers nor the lower layers are constrained in their behavior. Our experimental evaluation demonstrates that we can improve on the performance of single-layer policies on standard benchmark tasks simply by adding additional layers, and that our method can solve more complex sparse-reward tasks by learning higher-level policies on top of high-entropy skills optimized for simple low-level objectives. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1804.02808v2-abstract-full').style.display = 'none'; document.getElementById('1804.02808v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 September, 2018; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 April, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2018. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICML 2018; Videos: https://sites.google.com/view/latent-space-deep-rl Code: https://github.com/haarnoja/sac</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1803.06773">arXiv:1803.06773</a> <span> [<a href="https://arxiv.org/pdf/1803.06773">pdf</a>, <a href="https://arxiv.org/format/1803.06773">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Composable Deep Reinforcement Learning for Robotic Manipulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Haarnoja%2C+T">Tuomas Haarnoja</a>, <a href="/search/cs?searchtype=author&query=Pong%2C+V">Vitchyr Pong</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+A">Aurick Zhou</a>, <a href="/search/cs?searchtype=author&query=Dalal%2C+M">Murtaza Dalal</a>, <a href="/search/cs?searchtype=author&query=Abbeel%2C+P">Pieter Abbeel</a>, <a href="/search/cs?searchtype=author&query=Levine%2C+S">Sergey Levine</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1803.06773v1-abstract-short" style="display: inline;"> Model-free deep reinforcement learning has been shown to exhibit good performance in domains ranging from video games to simulated robotic manipulation and locomotion. However, model-free methods are known to perform poorly when the interaction time with the environment is limited, as is the case for most real-world robotic tasks. In this paper, we study how maximum entropy policies trained using… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1803.06773v1-abstract-full').style.display = 'inline'; document.getElementById('1803.06773v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1803.06773v1-abstract-full" style="display: none;"> Model-free deep reinforcement learning has been shown to exhibit good performance in domains ranging from video games to simulated robotic manipulation and locomotion. However, model-free methods are known to perform poorly when the interaction time with the environment is limited, as is the case for most real-world robotic tasks. In this paper, we study how maximum entropy policies trained using soft Q-learning can be applied to real-world robotic manipulation. The application of this method to real-world manipulation is facilitated by two important features of soft Q-learning. First, soft Q-learning can learn multimodal exploration strategies by learning policies represented by expressive energy-based models. Second, we show that policies learned with soft Q-learning can be composed to create new policies, and that the optimality of the resulting policy can be bounded in terms of the divergence between the composed policies. This compositionality provides an especially valuable tool for real-world manipulation, where constructing new policies by composing existing skills can provide a large gain in efficiency over training from scratch. Our experimental evaluation demonstrates that soft Q-learning is substantially more sample efficient than prior model-free deep reinforcement learning methods, and that compositionality can be performed for both simulated and real-world tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1803.06773v1-abstract-full').style.display = 'none'; document.getElementById('1803.06773v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 March, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2018. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Videos: https://sites.google.com/view/composing-real-world-policies/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1801.01290">arXiv:1801.01290</a> <span> [<a href="https://arxiv.org/pdf/1801.01290">pdf</a>, <a href="https://arxiv.org/format/1801.01290">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Haarnoja%2C+T">Tuomas Haarnoja</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+A">Aurick Zhou</a>, <a href="/search/cs?searchtype=author&query=Abbeel%2C+P">Pieter Abbeel</a>, <a href="/search/cs?searchtype=author&query=Levine%2C+S">Sergey Levine</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1801.01290v2-abstract-short" style="display: inline;"> Model-free deep reinforcement learning (RL) algorithms have been demonstrated on a range of challenging decision making and control tasks. However, these methods typically suffer from two major challenges: very high sample complexity and brittle convergence properties, which necessitate meticulous hyperparameter tuning. Both of these challenges severely limit the applicability of such methods to c… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1801.01290v2-abstract-full').style.display = 'inline'; document.getElementById('1801.01290v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1801.01290v2-abstract-full" style="display: none;"> Model-free deep reinforcement learning (RL) algorithms have been demonstrated on a range of challenging decision making and control tasks. However, these methods typically suffer from two major challenges: very high sample complexity and brittle convergence properties, which necessitate meticulous hyperparameter tuning. Both of these challenges severely limit the applicability of such methods to complex, real-world domains. In this paper, we propose soft actor-critic, an off-policy actor-critic deep RL algorithm based on the maximum entropy reinforcement learning framework. In this framework, the actor aims to maximize expected reward while also maximizing entropy. That is, to succeed at the task while acting as randomly as possible. Prior deep RL methods based on this framework have been formulated as Q-learning methods. By combining off-policy updates with a stable stochastic actor-critic formulation, our method achieves state-of-the-art performance on a range of continuous control benchmark tasks, outperforming prior on-policy and off-policy methods. Furthermore, we demonstrate that, in contrast to other off-policy algorithms, our approach is very stable, achieving very similar performance across different random seeds. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1801.01290v2-abstract-full').style.display = 'none'; document.getElementById('1801.01290v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 August, 2018; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 January, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2018. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICML 2018 Videos: sites.google.com/view/soft-actor-critic Code: github.com/haarnoja/sac</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1702.08165">arXiv:1702.08165</a> <span> [<a href="https://arxiv.org/pdf/1702.08165">pdf</a>, <a href="https://arxiv.org/format/1702.08165">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Reinforcement Learning with Deep Energy-Based Policies </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Haarnoja%2C+T">Tuomas Haarnoja</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+H">Haoran Tang</a>, <a href="/search/cs?searchtype=author&query=Abbeel%2C+P">Pieter Abbeel</a>, <a href="/search/cs?searchtype=author&query=Levine%2C+S">Sergey Levine</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1702.08165v2-abstract-short" style="display: inline;"> We propose a method for learning expressive energy-based policies for continuous states and actions, which has been feasible only in tabular domains before. We apply our method to learning maximum entropy policies, resulting into a new algorithm, called soft Q-learning, that expresses the optimal policy via a Boltzmann distribution. We use the recently proposed amortized Stein variational gradient… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1702.08165v2-abstract-full').style.display = 'inline'; document.getElementById('1702.08165v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1702.08165v2-abstract-full" style="display: none;"> We propose a method for learning expressive energy-based policies for continuous states and actions, which has been feasible only in tabular domains before. We apply our method to learning maximum entropy policies, resulting into a new algorithm, called soft Q-learning, that expresses the optimal policy via a Boltzmann distribution. We use the recently proposed amortized Stein variational gradient descent to learn a stochastic sampling network that approximates samples from this distribution. The benefits of the proposed algorithm include improved exploration and compositionality that allows transferring skills between tasks, which we confirm in simulated experiments with swimming and walking robots. We also draw a connection to actor-critic methods, which can be viewed performing approximate inference on the corresponding energy-based model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1702.08165v2-abstract-full').style.display = 'none'; document.getElementById('1702.08165v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 July, 2017; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 February, 2017; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2017. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1605.07148">arXiv:1605.07148</a> <span> [<a href="https://arxiv.org/pdf/1605.07148">pdf</a>, <a href="https://arxiv.org/format/1605.07148">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Backprop KF: Learning Discriminative Deterministic State Estimators </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Haarnoja%2C+T">Tuomas Haarnoja</a>, <a href="/search/cs?searchtype=author&query=Ajay%2C+A">Anurag Ajay</a>, <a href="/search/cs?searchtype=author&query=Levine%2C+S">Sergey Levine</a>, <a href="/search/cs?searchtype=author&query=Abbeel%2C+P">Pieter Abbeel</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1605.07148v4-abstract-short" style="display: inline;"> Generative state estimators based on probabilistic filters and smoothers are one of the most popular classes of state estimators for robots and autonomous vehicles. However, generative models have limited capacity to handle rich sensory observations, such as camera images, since they must model the entire distribution over sensor readings. Discriminative models do not suffer from this limitation,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1605.07148v4-abstract-full').style.display = 'inline'; document.getElementById('1605.07148v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1605.07148v4-abstract-full" style="display: none;"> Generative state estimators based on probabilistic filters and smoothers are one of the most popular classes of state estimators for robots and autonomous vehicles. However, generative models have limited capacity to handle rich sensory observations, such as camera images, since they must model the entire distribution over sensor readings. Discriminative models do not suffer from this limitation, but are typically more complex to train as latent variable models for state estimation. We present an alternative approach where the parameters of the latent state distribution are directly optimized as a deterministic computation graph, resulting in a simple and effective gradient descent algorithm for training discriminative state estimators. We show that this procedure can be used to train state estimators that use complex input, such as raw camera images, which must be processed using expressive nonlinear function approximators such as convolutional neural networks. Our model can be viewed as a type of recurrent neural network, and the connection to probabilistic filtering allows us to design a network architecture that is particularly well suited for state estimation. We evaluate our approach on synthetic tracking task with raw image inputs and on the visual odometry task in the KITTI dataset. The results show significant improvement over both standard generative approaches and regular recurrent neural networks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1605.07148v4-abstract-full').style.display = 'none'; document.getElementById('1605.07148v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 September, 2017; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 May, 2016; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2016. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NIPS 2016</span> </p> </li> </ol> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>