Search | arXiv e-print repository

Showing 1–50 of 59 results for author: Lanctot, M

Searching in archive cs. Results are sorted by announcement date (newest first), 50 results per page (page 1 of 2).

</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Lanctot%2C+M&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Lanctot%2C+M&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Lanctot%2C+M&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11645">arXiv:2502.11645</a> <span> [<a href="https://arxiv.org/pdf/2502.11645">pdf</a>, <a href="https://arxiv.org/format/2502.11645">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Other Statistics">stat.OT</span> </div> </div> <p class="title is-5 mathjax"> Deviation Ratings: A General, Clone-Invariant Rating Method </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Marris%2C+L">Luke Marris</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Siqi Liu</a>, <a href="/search/cs?searchtype=author&query=Gemp%2C+I">Ian Gemp</a>, <a href="/search/cs?searchtype=author&query=Piliouras%2C+G">Georgios Piliouras</a>, <a href="/search/cs?searchtype=author&query=Lanctot%2C+M">Marc Lanctot</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11645v1-abstract-short" style="display: inline;"> Many real-world multi-agent or multi-task evaluation scenarios can be naturally modelled as normal-form games due to inherent strategic (adversarial, cooperative, and mixed motive) interactions. These strategic interactions may be agentic (e.g. players trying to win), fundamental (e.g. cost vs quality), or complementary (e.g. niche finding and specialization). 
In such a formulation, it is the stra… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11645v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11645v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11645v1-abstract-full" style="display: none;"> Many real-world multi-agent or multi-task evaluation scenarios can be naturally modelled as normal-form games due to inherent strategic (adversarial, cooperative, and mixed motive) interactions. These strategic interactions may be agentic (e.g. players trying to win), fundamental (e.g. cost vs quality), or complementary (e.g. niche finding and specialization). In such a formulation, it is the strategies (actions, policies, agents, models, tasks, prompts, etc.) that are rated. However, the rating problem is complicated by redundancy and complexity of N-player strategic interactions. Repeated or similar strategies can distort ratings for those that counter or complement them. Previous work proposed ``clone invariant'' ratings to handle such redundancies, but this was limited to two-player zero-sum (i.e. strictly competitive) interactions. This work introduces the first N-player general-sum clone invariant rating, called deviation ratings, based on coarse correlated equilibria. The rating is explored on several domains including LLMs evaluation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11645v1-abstract-full').style.display = 'none'; document.getElementById('2502.11645v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.19266">arXiv:2501.19266</a> <span> [<a href="https://arxiv.org/pdf/2501.19266">pdf</a>, <a href="https://arxiv.org/format/2501.19266">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Theoretical Economics">econ.TH</span> </div> </div> <p class="title is-5 mathjax"> Jackpot! Alignment as a Maximal Lottery </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Maura-Rivero%2C+R">Roberto-Rafael Maura-Rivero</a>, <a href="/search/cs?searchtype=author&query=Lanctot%2C+M">Marc Lanctot</a>, <a href="/search/cs?searchtype=author&query=Visin%2C+F">Francesco Visin</a>, <a href="/search/cs?searchtype=author&query=Larson%2C+K">Kate Larson</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.19266v1-abstract-short" style="display: inline;"> Reinforcement Learning from Human Feedback (RLHF), the standard for aligning Large Language Models (LLMs) with human values, is known to fail to satisfy properties that are intuitively desirable, such as respecting the preferences of the majority \cite{ge2024axioms}. 
To overcome these issues, we propose the use of a probabilistic Social Choice rule called \emph{maximal lotteries} as a replacement… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.19266v1-abstract-full').style.display = 'inline'; document.getElementById('2501.19266v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.19266v1-abstract-full" style="display: none;"> Reinforcement Learning from Human Feedback (RLHF), the standard for aligning Large Language Models (LLMs) with human values, is known to fail to satisfy properties that are intuitively desirable, such as respecting the preferences of the majority \cite{ge2024axioms}. To overcome these issues, we propose the use of a probabilistic Social Choice rule called \emph{maximal lotteries} as a replacement for RLHF. We show that a family of alignment techniques, namely Nash Learning from Human Feedback (NLHF) \cite{munos2023nash} and variants, approximate maximal lottery outcomes and thus inherit its beneficial properties. We confirm experimentally that our proposed methodology handles situations that arise when working with preferences more robustly than standard RLHF, including supporting the preferences of the majority, providing principled ways of handling non-transitivities in the preference data, and robustness to irrelevant alternatives. This results in systems that better incorporate human values and respect human intentions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.19266v1-abstract-full').style.display = 'none'; document.getElementById('2501.19266v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
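As a concrete illustration of the maximal-lottery rule referenced above: it is a maximin strategy of the zero-sum game whose payoffs are pairwise preference margins, which can be found with a small linear program. The sketch below is illustrative only and is not code from the paper; the margin matrix and the function name are invented for the example.

```python
# Illustrative sketch (not the paper's code): a maximal lottery is a maximin
# strategy of the skew-symmetric game built from pairwise preference margins.
import numpy as np
from scipy.optimize import linprog

def maximal_lottery(margins: np.ndarray) -> np.ndarray:
    """margins[i, j] = (#voters preferring i over j) - (#voters preferring j over i)."""
    n = margins.shape[0]
    # Variables: p_1..p_n (lottery probabilities) and v (game value).
    # Maximize v subject to (p^T M)_j >= v for all j, sum(p) = 1, p >= 0.
    c = np.zeros(n + 1)
    c[-1] = -1.0                                      # linprog minimizes, so minimize -v
    A_ub = np.hstack([-margins.T, np.ones((n, 1))])   # v - (M^T p)_j <= 0 for each column j
    b_ub = np.zeros(n)
    A_eq = np.hstack([np.ones((1, n)), np.zeros((1, 1))])
    b_eq = np.array([1.0])
    bounds = [(0, None)] * n + [(None, None)]
    res = linprog(c, A_ub=A_ub, b_ub=b_ub, A_eq=A_eq, b_eq=b_eq, bounds=bounds)
    return res.x[:n]

# A Condorcet cycle a > b > c > a: the maximal lottery is uniform over {a, b, c}.
margins = np.array([[0, 1, -1], [-1, 0, 1], [1, -1, 0]])
print(maximal_lottery(margins))                       # approximately [1/3, 1/3, 1/3]
```
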
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.12119">arXiv:2412.12119</a> <span> [<a href="https://arxiv.org/pdf/2412.12119">pdf</a>, <a href="https://arxiv.org/format/2412.12119">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Mastering Board Games by External and Internal Planning with Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Schultz%2C+J">John Schultz</a>, <a href="/search/cs?searchtype=author&query=Adamek%2C+J">Jakub Adamek</a>, <a href="/search/cs?searchtype=author&query=Jusup%2C+M">Matej Jusup</a>, <a href="/search/cs?searchtype=author&query=Lanctot%2C+M">Marc Lanctot</a>, <a href="/search/cs?searchtype=author&query=Kaisers%2C+M">Michael Kaisers</a>, <a href="/search/cs?searchtype=author&query=Perrin%2C+S">Sarah Perrin</a>, <a href="/search/cs?searchtype=author&query=Hennes%2C+D">Daniel Hennes</a>, <a href="/search/cs?searchtype=author&query=Shar%2C+J">Jeremy Shar</a>, <a href="/search/cs?searchtype=author&query=Lewis%2C+C">Cannada Lewis</a>, <a href="/search/cs?searchtype=author&query=Ruoss%2C+A">Anian Ruoss</a>, <a href="/search/cs?searchtype=author&query=Zahavy%2C+T">Tom Zahavy</a>, <a href="/search/cs?searchtype=author&query=Veli%C4%8Dkovi%C4%87%2C+P">Petar Veli膷kovi膰</a>, <a href="/search/cs?searchtype=author&query=Prince%2C+L">Laurel Prince</a>, <a href="/search/cs?searchtype=author&query=Singh%2C+S">Satinder Singh</a>, <a href="/search/cs?searchtype=author&query=Malmi%2C+E">Eric Malmi</a>, <a href="/search/cs?searchtype=author&query=Toma%C5%A1ev%2C+N">Nenad Toma拧ev</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.12119v1-abstract-short" style="display: inline;"> While large language models perform well on a range of complex tasks (e.g., text generation, question answering, summarization), robust multi-step planning and reasoning remains a considerable challenge for them. In this paper we show that search-based planning can significantly improve LLMs' playing strength across several board games (Chess, Fischer Random / Chess960, Connect Four, and Hex). We… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.12119v1-abstract-full').style.display = 'inline'; document.getElementById('2412.12119v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.12119v1-abstract-full" style="display: none;"> While large language models perform well on a range of complex tasks (e.g., text generation, question answering, summarization), robust multi-step planning and reasoning remains a considerable challenge for them. In this paper we show that search-based planning can significantly improve LLMs' playing strength across several board games (Chess, Fischer Random / Chess960, Connect Four, and Hex). 
We introduce, compare and contrast two major approaches: In external search, the model guides Monte Carlo Tree Search (MCTS) rollouts and evaluations without calls to an external engine, and in internal search, the model directly generates in-context a linearized tree of potential futures and a resulting final choice. Both build on a language model pre-trained on relevant domain knowledge, capturing the transition and value functions across these games. We find that our pre-training method minimizes hallucinations, as our model is highly accurate regarding state prediction and legal moves. Additionally, both internal and external search indeed improve win-rates against state-of-the-art bots, even reaching Grandmaster-level performance in chess while operating on a similar move count search budget per decision as human Grandmasters. The way we combine search with domain knowledge is not specific to board games, suggesting direct extensions into more general language model inference and training techniques. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.12119v1-abstract-full').style.display = 'none'; document.getElementById('2412.12119v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00119">arXiv:2411.00119</a> <span> [<a href="https://arxiv.org/pdf/2411.00119">pdf</a>, <a href="https://arxiv.org/format/2411.00119">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Soft Condorcet Optimization for Ranking of General Agents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lanctot%2C+M">Marc Lanctot</a>, <a href="/search/cs?searchtype=author&query=Larson%2C+K">Kate Larson</a>, <a href="/search/cs?searchtype=author&query=Kaisers%2C+M">Michael Kaisers</a>, <a href="/search/cs?searchtype=author&query=Berthet%2C+Q">Quentin Berthet</a>, <a href="/search/cs?searchtype=author&query=Gemp%2C+I">Ian Gemp</a>, <a href="/search/cs?searchtype=author&query=Diaz%2C+M">Manfred Diaz</a>, <a href="/search/cs?searchtype=author&query=Maura-Rivero%2C+R">Roberto-Rafael Maura-Rivero</a>, <a href="/search/cs?searchtype=author&query=Bachrach%2C+Y">Yoram Bachrach</a>, <a href="/search/cs?searchtype=author&query=Koop%2C+A">Anna Koop</a>, <a href="/search/cs?searchtype=author&query=Precup%2C+D">Doina Precup</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00119v2-abstract-short" style="display: inline;"> A common way to drive progress of AI models and agents is to compare their performance on standardized benchmarks. Comparing the performance of general agents requires aggregating their individual performances across a potentially wide variety of different tasks. 
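For readers unfamiliar with how a learned model can steer external search, the snippet below shows a generic PUCT-style selection rule, the usual way a policy prior and value estimate guide MCTS. It is a sketch of that generic formulation, not the paper's implementation, and the move names and statistics are invented.

```python
# Generic PUCT-style child selection: the common way a learned policy prior and
# value estimate steer MCTS. A sketch only, not the paper's external search.
import math

def puct_score(q_value: float, prior: float, parent_visits: int,
               child_visits: int, c_puct: float = 1.5) -> float:
    """Exploitation (q) plus a prior-weighted exploration bonus; higher is better."""
    exploration = c_puct * prior * math.sqrt(parent_visits) / (1 + child_visits)
    return q_value + exploration

def select_move(children: dict) -> str:
    """children maps move -> {'q': value estimate, 'prior': model prior, 'visits': count}."""
    parent_visits = sum(child["visits"] for child in children.values()) + 1
    return max(children, key=lambda move: puct_score(children[move]["q"],
                                                     children[move]["prior"],
                                                     parent_visits,
                                                     children[move]["visits"]))

# Invented statistics for two candidate chess moves:
children = {"e2e4": {"q": 0.12, "prior": 0.45, "visits": 30},
            "d2d4": {"q": 0.10, "prior": 0.40, "visits": 12}}
print(select_move(children))
```
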
4. arXiv:2411.00119 [pdf, other] (cs.MA, cs.LG)
Soft Condorcet Optimization for Ranking of General Agents
Authors: Marc Lanctot, Kate Larson, Michael Kaisers, Quentin Berthet, Ian Gemp, Manfred Diaz, Roberto-Rafael Maura-Rivero, Yoram Bachrach, Anna Koop, Doina Precup
Abstract: A common way to drive progress of AI models and agents is to compare their performance on standardized benchmarks. Comparing the performance of general agents requires aggregating their individual performances across a potentially wide variety of different tasks. In this paper, we describe a novel ranking scheme inspired by social choice frameworks, called Soft Condorcet Optimization (SCO), to compute the optimal ranking of agents: the one that makes the fewest mistakes in predicting the agent comparisons in the evaluation data. This optimal ranking is the maximum likelihood estimate when evaluation data (which we view as votes) are interpreted as noisy samples from a ground truth ranking, a solution to Condorcet's original voting system criteria. SCO ratings are maximal for Condorcet winners when they exist, which we show is not necessarily true for the classical rating system Elo. We propose three optimization algorithms to compute SCO ratings and evaluate their empirical performance. When serving as an approximation to the Kemeny-Young voting method, SCO rankings are on average 0 to 0.043 away from the optimal ranking in normalized Kendall-tau distance across 865 preference profiles from the PrefLib open ranking archive. In a simulated noisy tournament setting, SCO achieves accurate approximations to the ground truth ranking and the best among several baselines when 59% or more of the preference data is missing. Finally, SCO ranking provides the best approximation to the optimal ranking, measured on held-out test sets, in a problem containing 52,958 human players across 31,049 games of the classic seven-player game of Diplomacy.
Submitted 4 November, 2024; v1 submitted 31 October, 2024; originally announced November 2024.

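The general idea behind fitting scalar ratings to ordinal evaluation data can be sketched with a smooth penalty on mis-ordered pairs, as below. This is a toy illustration only; the exact SCO objective and its three optimizers are defined in the paper, and the votes here are invented.

```python
# Toy sketch of fitting scalar ratings to ordinal votes with a smooth penalty
# on mis-ordered pairs; not the paper's exact SCO objective or optimizers.
import numpy as np

def soft_misorder_loss_grad(ratings, votes, temperature=1.0):
    """votes: list of rankings, each a list of agent indices, best first."""
    grad = np.zeros_like(ratings)
    loss = 0.0
    for vote in votes:
        for pos, a in enumerate(vote):
            for b in vote[pos + 1:]:                 # agent a is ranked above agent b
                diff = (ratings[b] - ratings[a]) / temperature
                p_wrong = 1.0 / (1.0 + np.exp(-diff))  # soft indicator of a mis-ordered pair
                loss += p_wrong
                grad[b] += p_wrong * (1.0 - p_wrong) / temperature
                grad[a] -= p_wrong * (1.0 - p_wrong) / temperature
    return loss, grad

# Invented example: 4 agents and three ordinal votes.
votes = [[0, 1, 2, 3], [0, 2, 1, 3], [1, 0, 2, 3]]
ratings = np.zeros(4)
for _ in range(500):                                 # plain gradient descent
    _, grad = soft_misorder_loss_grad(ratings, votes)
    ratings -= 0.1 * grad
print(np.argsort(-ratings))                          # ranking induced by the ratings, best first
```
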
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.03875">arXiv:2409.03875</a> <span> [<a href="https://arxiv.org/pdf/2409.03875">pdf</a>, <a href="https://arxiv.org/format/2409.03875">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> </div> </div> <p class="title is-5 mathjax"> Learning in Games with Progressive Hiding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Heymann%2C+B">Benjamin Heymann</a>, <a href="/search/cs?searchtype=author&query=Lanctot%2C+M">Marc Lanctot</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.03875v2-abstract-short" style="display: inline;"> When learning to play an imperfect information game, it is often easier to first start with the basic mechanics of the game rules. For example, one can play several example rounds with private cards revealed to all players to better understand the basic actions and their effects. Building on this intuition, this paper introduces {\it progressive hiding}, an algorithm that learns to play imperfect… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.03875v2-abstract-full').style.display = 'inline'; document.getElementById('2409.03875v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.03875v2-abstract-full" style="display: none;"> When learning to play an imperfect information game, it is often easier to first start with the basic mechanics of the game rules. For example, one can play several example rounds with private cards revealed to all players to better understand the basic actions and their effects. Building on this intuition, this paper introduces {\it progressive hiding}, an algorithm that learns to play imperfect information games by first learning the basic mechanics and then progressively adding information constraints over time. Progressive hiding is inspired by methods from stochastic multistage optimization such as scenario decomposition and progressive hedging. We prove that it enables the adaptation of counterfactual regret minimization to games where perfect recall is not satisfied. Numerical experiments illustrate that progressive hiding can achieve optimal payoff in a benchmark of emergent communication trading game. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.03875v2-abstract-full').style.display = 'none'; document.getElementById('2409.03875v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.11835">arXiv:2402.11835</a> <span> [<a href="https://arxiv.org/pdf/2402.11835">pdf</a>, <a href="https://arxiv.org/format/2402.11835">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> Easy as ABCs: Unifying Boltzmann Q-Learning and Counterfactual Regret Minimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=D%27Amico-Wong%2C+L">Luca D'Amico-Wong</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hugh Zhang</a>, <a href="/search/cs?searchtype=author&query=Lanctot%2C+M">Marc Lanctot</a>, <a href="/search/cs?searchtype=author&query=Parkes%2C+D+C">David C. Parkes</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.11835v1-abstract-short" style="display: inline;"> We propose ABCs (Adaptive Branching through Child stationarity), a best-of-both-worlds algorithm combining Boltzmann Q-learning (BQL), a classic reinforcement learning algorithm for single-agent domains, and counterfactual regret minimization (CFR), a central algorithm for learning in multi-agent domains. ABCs adaptively chooses what fraction of the environment to explore each iteration by measuri… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.11835v1-abstract-full').style.display = 'inline'; document.getElementById('2402.11835v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.11835v1-abstract-full" style="display: none;"> We propose ABCs (Adaptive Branching through Child stationarity), a best-of-both-worlds algorithm combining Boltzmann Q-learning (BQL), a classic reinforcement learning algorithm for single-agent domains, and counterfactual regret minimization (CFR), a central algorithm for learning in multi-agent domains. ABCs adaptively chooses what fraction of the environment to explore each iteration by measuring the stationarity of the environment's reward and transition dynamics. In Markov decision processes, ABCs converges to the optimal policy with at most an O(A) factor slowdown compared to BQL, where A is the number of actions in the environment. In two-player zero-sum games, ABCs is guaranteed to converge to a Nash equilibrium (assuming access to a perfect oracle for detecting stationarity), while BQL has no such guarantees. Empirically, ABCs demonstrates strong performance when benchmarked across environments drawn from the OpenSpiel game library and OpenAI Gym and exceeds all prior methods in environments which are neither fully stationary nor fully nonstationary. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.11835v1-abstract-full').style.display = 'none'; document.getElementById('2402.11835v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.03928">arXiv:2402.03928</a> <span> [<a href="https://arxiv.org/pdf/2402.03928">pdf</a>, <a href="https://arxiv.org/format/2402.03928">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> Approximating the Core via Iterative Coalition Sampling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gemp%2C+I">Ian Gemp</a>, <a href="/search/cs?searchtype=author&query=Lanctot%2C+M">Marc Lanctot</a>, <a href="/search/cs?searchtype=author&query=Marris%2C+L">Luke Marris</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+Y">Yiran Mao</a>, <a href="/search/cs?searchtype=author&query=Du%C3%A9%C3%B1ez-Guzm%C3%A1n%2C+E">Edgar Du茅帽ez-Guzm谩n</a>, <a href="/search/cs?searchtype=author&query=Perrin%2C+S">Sarah Perrin</a>, <a href="/search/cs?searchtype=author&query=Gyorgy%2C+A">Andras Gyorgy</a>, <a href="/search/cs?searchtype=author&query=Elie%2C+R">Romuald Elie</a>, <a href="/search/cs?searchtype=author&query=Piliouras%2C+G">Georgios Piliouras</a>, <a href="/search/cs?searchtype=author&query=Kaisers%2C+M">Michael Kaisers</a>, <a href="/search/cs?searchtype=author&query=Hennes%2C+D">Daniel Hennes</a>, <a href="/search/cs?searchtype=author&query=Bullard%2C+K">Kalesha Bullard</a>, <a href="/search/cs?searchtype=author&query=Larson%2C+K">Kate Larson</a>, <a href="/search/cs?searchtype=author&query=Bachrach%2C+Y">Yoram Bachrach</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.03928v1-abstract-short" style="display: inline;"> The core is a central solution concept in cooperative game theory, defined as the set of feasible allocations or payments such that no subset of agents has incentive to break away and form their own subgroup or coalition. However, it has long been known that the core (and approximations, such as the least-core) are hard to compute. This limits our ability to analyze cooperative games in general, a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.03928v1-abstract-full').style.display = 'inline'; document.getElementById('2402.03928v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.03928v1-abstract-full" style="display: none;"> The core is a central solution concept in cooperative game theory, defined as the set of feasible allocations or payments such that no subset of agents has incentive to break away and form their own subgroup or coalition. 
However, it has long been known that the core (and approximations, such as the least-core) are hard to compute. This limits our ability to analyze cooperative games in general, and to fully embrace cooperative game theory contributions in domains such as explainable AI (XAI), where the core can complement the Shapley values to identify influential features or instances supporting predictions by black-box models. We propose novel iterative algorithms for computing variants of the core, which avoid the computational bottleneck of many other approaches; namely solving large linear programs. As such, they scale better to very large problems as we demonstrate across different classes of cooperative games, including weighted voting games, induced subgraph games, and marginal contribution networks. We also explore our algorithms in the context of XAI, providing further evidence of the power of the core for such applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.03928v1-abstract-full').style.display = 'none'; document.getElementById('2402.03928v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published in AAMAS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.01704">arXiv:2402.01704</a> <span> [<a href="https://arxiv.org/pdf/2402.01704">pdf</a>, <a href="https://arxiv.org/format/2402.01704">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> </div> </div> <p class="title is-5 mathjax"> Steering Language Models with Game-Theoretic Solvers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gemp%2C+I">Ian Gemp</a>, <a href="/search/cs?searchtype=author&query=Patel%2C+R">Roma Patel</a>, <a href="/search/cs?searchtype=author&query=Bachrach%2C+Y">Yoram Bachrach</a>, <a href="/search/cs?searchtype=author&query=Lanctot%2C+M">Marc Lanctot</a>, <a href="/search/cs?searchtype=author&query=Dasagi%2C+V">Vibhavari Dasagi</a>, <a href="/search/cs?searchtype=author&query=Marris%2C+L">Luke Marris</a>, <a href="/search/cs?searchtype=author&query=Piliouras%2C+G">Georgios Piliouras</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Siqi Liu</a>, <a href="/search/cs?searchtype=author&query=Tuyls%2C+K">Karl Tuyls</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.01704v3-abstract-short" style="display: inline;"> Mathematical models of interactions among rational agents have long been studied in game theory. However these interactions are often over a small set of discrete game actions which is very different from how humans communicate in natural language. 
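The constraint defining the core is easy to state in code: no coalition may be worth more than its members are collectively paid. The sketch below checks this by random coalition sampling on a made-up weighted voting game; it illustrates the constraint only and is not the paper's iterative algorithm.

```python
# Illustrative sketch of the core constraint (not the paper's algorithm): an
# allocation is in the core only if no coalition is worth more than it is paid.
# Random coalition sampling gives a cheap necessary check.
import random

def weighted_voting_value(coalition, weights, quota):
    """Toy characteristic function: a coalition is worth 1 iff it meets the quota."""
    return 1.0 if sum(weights[i] for i in coalition) >= quota else 0.0

def sampled_core_violation(allocation, weights, quota, num_samples=1000, seed=0):
    """Largest sampled excess v(C) - payment(C); a positive value certifies 'not in the core'."""
    rng = random.Random(seed)
    n = len(allocation)
    worst = 0.0
    for _ in range(num_samples):
        coalition = [i for i in range(n) if rng.random() < 0.5]
        if not coalition:
            continue
        excess = weighted_voting_value(coalition, weights, quota) - sum(allocation[i] for i in coalition)
        worst = max(worst, excess)
    return worst

# Invented 4-player weighted voting game: weights (4, 3, 2, 1), quota 6.
print(sampled_core_violation([0.5, 0.3, 0.15, 0.05], weights=[4, 3, 2, 1], quota=6))
```
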
8. arXiv:2402.01704 [pdf, other] (cs.CL, cs.AI, cs.GT)
Steering Language Models with Game-Theoretic Solvers
Authors: Ian Gemp, Roma Patel, Yoram Bachrach, Marc Lanctot, Vibhavari Dasagi, Luke Marris, Georgios Piliouras, Siqi Liu, Karl Tuyls
Abstract: Mathematical models of interactions among rational agents have long been studied in game theory. However these interactions are often over a small set of discrete game actions which is very different from how humans communicate in natural language. To bridge this gap, we introduce a framework that allows equilibrium solvers to work over the space of natural language dialogue generated by large language models (LLMs). Specifically, by modelling the players, strategies and payoffs in a "game" of dialogue, we create a binding from natural language interactions to the conventional symbolic logic of game theory. Given this binding, we can ask existing game-theoretic algorithms to provide us with strategic solutions (e.g., what string an LLM should generate to maximize payoff in the face of strategic partners or opponents), giving us predictors of stable, rational conversational strategies. We focus on three domains that require different negotiation strategies: scheduling meetings, trading fruit and debate, and evaluate an LLM's generated language when guided by solvers. We see that LLMs that follow game-theory solvers result in dialogue generations that are less exploitable than the control (no guidance from solvers), and the language generated results in higher rewards, in all negotiation domains. We discuss future implications of this work, and how game-theoretic solvers that can leverage the expressivity of natural language can open up a new avenue of guiding language research.
Submitted 16 December, 2024; v1 submitted 24 January, 2024; originally announced February 2024.
Comments: Code available @ https://github.com/google-deepmind/open_spiel/blob/master/open_spiel/python/games/chat_game.py

9. arXiv:2401.05133 [pdf, other] (cs.AI, cs.MA)
Neural Population Learning beyond Symmetric Zero-sum Games
Authors: Siqi Liu, Luke Marris, Marc Lanctot, Georgios Piliouras, Joel Z. Leibo, Nicolas Heess
Abstract: We study computationally efficient methods for finding equilibria in n-player general-sum games, specifically ones that afford complex visuomotor skills. We show how existing methods would struggle in this setting, either computationally or in theory. We then introduce NeuPL-JPSRO, a neural population learning algorithm that benefits from transfer learning of skills and converges to a Coarse Correlated Equilibrium (CCE) of the game. We show empirical convergence in a suite of OpenSpiel games, validated rigorously by exact game solvers. We then deploy NeuPL-JPSRO to complex domains, where our approach enables adaptive coordination in a MuJoCo control domain and skill transfer in capture-the-flag. Our work shows that equilibrium convergent population learning can be implemented at scale and in generality, paving the way towards solving real-world games between heterogeneous players with mixed motives.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.05133v1-abstract-full').style.display = 'none'; document.getElementById('2401.05133v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.03121">arXiv:2312.03121</a> <span> [<a href="https://arxiv.org/pdf/2312.03121">pdf</a>, <a href="https://arxiv.org/format/2312.03121">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> Evaluating Agents using Social Choice Theory </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lanctot%2C+M">Marc Lanctot</a>, <a href="/search/cs?searchtype=author&query=Larson%2C+K">Kate Larson</a>, <a href="/search/cs?searchtype=author&query=Bachrach%2C+Y">Yoram Bachrach</a>, <a href="/search/cs?searchtype=author&query=Marris%2C+L">Luke Marris</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zun Li</a>, <a href="/search/cs?searchtype=author&query=Bhoopchand%2C+A">Avishkar Bhoopchand</a>, <a href="/search/cs?searchtype=author&query=Anthony%2C+T">Thomas Anthony</a>, <a href="/search/cs?searchtype=author&query=Tanner%2C+B">Brian Tanner</a>, <a href="/search/cs?searchtype=author&query=Koop%2C+A">Anna Koop</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.03121v3-abstract-short" style="display: inline;"> We argue that many general evaluation problems can be viewed through the lens of voting theory. Each task is interpreted as a separate voter, which requires only ordinal rankings or pairwise comparisons of agents to produce an overall evaluation. By viewing the aggregator as a social welfare function, we are able to leverage centuries of research in social choice theory to derive principled evalua… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.03121v3-abstract-full').style.display = 'inline'; document.getElementById('2312.03121v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.03121v3-abstract-full" style="display: none;"> We argue that many general evaluation problems can be viewed through the lens of voting theory. Each task is interpreted as a separate voter, which requires only ordinal rankings or pairwise comparisons of agents to produce an overall evaluation. By viewing the aggregator as a social welfare function, we are able to leverage centuries of research in social choice theory to derive principled evaluation frameworks with axiomatic foundations. These evaluations are interpretable and flexible, while avoiding many of the problems currently facing cross-task evaluation. 
We apply this Voting-as-Evaluation (VasE) framework across multiple settings, including reinforcement learning, large language models, and humans. In practice, we observe that VasE can be more robust than popular evaluation frameworks (Elo and Nash averaging), discovers properties in the evaluation data not evident from scores alone, and can predict outcomes better than Elo in a complex seven-player game. We identify one particular approach, maximal lotteries, that satisfies important consistency properties relevant to evaluation, is computationally efficient (polynomial in the size of the evaluation data), and identifies game-theoretic cycles. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.03121v3-abstract-full').style.display = 'none'; document.getElementById('2312.03121v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.03196">arXiv:2303.03196</a> <span> [<a href="https://arxiv.org/pdf/2303.03196">pdf</a>, <a href="https://arxiv.org/format/2303.03196">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> Population-based Evaluation in Repeated Rock-Paper-Scissors as a Benchmark for Multiagent Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lanctot%2C+M">Marc Lanctot</a>, <a href="/search/cs?searchtype=author&query=Schultz%2C+J">John Schultz</a>, <a href="/search/cs?searchtype=author&query=Burch%2C+N">Neil Burch</a>, <a href="/search/cs?searchtype=author&query=Smith%2C+M+O">Max Olan Smith</a>, <a href="/search/cs?searchtype=author&query=Hennes%2C+D">Daniel Hennes</a>, <a href="/search/cs?searchtype=author&query=Anthony%2C+T">Thomas Anthony</a>, <a href="/search/cs?searchtype=author&query=Perolat%2C+J">Julien Perolat</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.03196v2-abstract-short" style="display: inline;"> Progress in fields of machine learning and adversarial planning has benefited significantly from benchmark domains, from checkers and the classic UCI data sets to Go and Diplomacy. In sequential decision-making, agent evaluation has largely been restricted to few interactions against experts, with the aim to reach some desired level of performance (e.g. beating a human professional player). 
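One voting-theoretic primitive this framing leans on is the Condorcet winner: an agent that beats every other agent head-to-head. A minimal check from a matrix of pairwise win counts (with invented tallies) looks like this; it is a generic illustration, not code from the paper.

```python
# Minimal Condorcet-winner check from pairwise win counts between agents;
# a generic illustration, not code from the paper.
import numpy as np

def condorcet_winner(wins: np.ndarray):
    """wins[i, j] = number of pairwise comparisons agent i won against agent j.
    Returns the agent that beats every other agent head-to-head, or None."""
    n = wins.shape[0]
    for i in range(n):
        if all(wins[i, j] > wins[j, i] for j in range(n) if j != i):
            return i
    return None                                # a cycle or ties: no Condorcet winner

# Invented tallies for three agents; agent 0 beats both others head-to-head.
wins = np.array([[0, 6, 7],
                 [4, 0, 5],
                 [3, 5, 0]])
print(condorcet_winner(wins))                  # 0
```
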
11. arXiv:2303.03196 [pdf, other] (cs.GT, cs.AI, cs.LG, cs.MA)
Population-based Evaluation in Repeated Rock-Paper-Scissors as a Benchmark for Multiagent Reinforcement Learning
Authors: Marc Lanctot, John Schultz, Neil Burch, Max Olan Smith, Daniel Hennes, Thomas Anthony, Julien Perolat
Abstract: Progress in fields of machine learning and adversarial planning has benefited significantly from benchmark domains, from checkers and the classic UCI data sets to Go and Diplomacy. In sequential decision-making, agent evaluation has largely been restricted to few interactions against experts, with the aim to reach some desired level of performance (e.g. beating a human professional player). We propose a benchmark for multiagent learning based on repeated play of the simple game Rock, Paper, Scissors along with a population of forty-three tournament entries, some of which are intentionally sub-optimal. We describe metrics to measure the quality of agents based both on average returns and exploitability. We then show that several RL, online learning, and language model approaches can learn good counter-strategies and generalize well, but ultimately lose to the top-performing bots, creating an opportunity for research in multiagent learning.
Submitted 31 October, 2023; v1 submitted 2 March, 2023; originally announced March 2023.
Comments: 25 pages, 8 figures, Accepted at TMLR October 2023

arXiv:2302.00797 [cs.AI, cs.GT, cs.LG, cs.MA]
Combining Tree-Search, Generative Models, and Nash Bargaining Concepts in Game-Theoretic Reinforcement Learning
Authors: Zun Li, Marc Lanctot, Kevin R. McKee, Luke Marris, Ian Gemp, Daniel Hennes, Paul Muller, Kate Larson, Yoram Bachrach, Michael P. Wellman
Abstract: Multiagent reinforcement learning (MARL) has benefited significantly from population-based and game-theoretic training regimes. One approach, Policy-Space Response Oracles (PSRO), employs standard reinforcement learning to compute response policies via approximate best responses and combines them via meta-strategy selection. We augment PSRO by adding a novel search procedure with generative sampling of world states, and introduce two new meta-strategy solvers based on the Nash bargaining solution. We evaluate PSRO's ability to compute approximate Nash equilibrium, and its performance in two negotiation games: Colored Trails, and Deal or No Deal. We conduct behavioral studies where human participants negotiate with our agents ($N = 346$). We find that search with generative modeling finds stronger policies during both training time and test time, enables online Bayesian co-player prediction, and can produce agents that achieve comparable social welfare negotiating with humans as humans trading among themselves.
Submitted 1 February, 2023; originally announced February 2023.
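The abstract assumes familiarity with the PSRO loop it augments. The schematic below sketches only that outer loop, with placeholder callables (`best_response`, `empirical_payoffs`, `meta_solver`) standing in for the components the paper modifies (a search-and-generative-model best response, Nash-bargaining meta-strategy solvers). It illustrates the general pattern, not the paper's implementation.

```python
def psro(initial_policy, num_iterations, best_response, empirical_payoffs, meta_solver):
    """Schematic Policy-Space Response Oracles loop.

    best_response(meta_strategy, population): trains an approximate best response
        to the current mixture over the population (e.g. with RL, optionally
        guided by search over generatively sampled world states).
    empirical_payoffs(population): estimates the meta-game payoff table by
        simulating all pairings of policies in the population.
    meta_solver(payoffs): returns a distribution over the population
        (e.g. Nash, or a Nash-bargaining-based solver).
    """
    population = [initial_policy]
    meta_strategy = [1.0]
    for _ in range(num_iterations):
        # 1) Expand the population with an approximate best response.
        population.append(best_response(meta_strategy, population))
        # 2) Re-evaluate the empirical meta-game.
        payoffs = empirical_payoffs(population)
        # 3) Re-solve the meta-game for the next meta-strategy.
        meta_strategy = meta_solver(payoffs)
    return population, meta_strategy
```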
arXiv:2210.02205 [cs.GT, cs.LG, cs.MA]
Game Theoretic Rating in N-player general-sum games with Equilibria
Authors: Luke Marris, Marc Lanctot, Ian Gemp, Shayegan Omidshafiei, Stephen McAleer, Jerome Connor, Karl Tuyls, Thore Graepel
Abstract: Rating strategies in a game is an important area of research in game theory and artificial intelligence, and can be applied to any real-world competitive or cooperative setting. Traditionally, only transitive dependencies between strategies have been used to rate strategies (e.g. Elo), however recent work has expanded ratings to utilize game theoretic solutions to better rate strategies in non-transitive games. This work generalizes these ideas and proposes novel algorithms suitable for N-player, general-sum rating of strategies in normal-form games according to the payoff rating system. This enables well-established solution concepts, such as equilibria, to be leveraged to efficiently rate strategies in games with complex strategic interactions, which arise in multiagent training and real-world interactions between many agents. We empirically validate our methods on real world normal-form data (Premier League) and multiagent reinforcement learning agent evaluation.
Submitted 5 October, 2022; originally announced October 2022.
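For contrast with the equilibrium-based ratings proposed above, this is the purely transitive Elo scheme the abstract cites as the traditional baseline (the standard logistic form with a 400-point scale and a K-factor update); by construction it cannot capture the non-transitive interactions the paper targets.

```python
def elo_expected_score(rating_a, rating_b):
    """Probability that A beats B under the Elo (logistic) model."""
    return 1.0 / (1.0 + 10.0 ** ((rating_b - rating_a) / 400.0))

def elo_update(rating_a, rating_b, score_a, k=32.0):
    """Update both ratings after one game; score_a is 1 (win), 0.5 (draw) or 0 (loss)."""
    delta = k * (score_a - elo_expected_score(rating_a, rating_b))
    return rating_a + delta, rating_b - delta

# A 1600-rated player upsetting an 1800-rated player gains most of the K budget.
print(elo_update(1600.0, 1800.0, score_a=1.0))   # approx (1624.3, 1775.7)
```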
arXiv:2209.10958 [cs.MA, cs.AI]
Developing, Evaluating and Scaling Learning Agents in Multi-Agent Environments
Authors: Ian Gemp, Thomas Anthony, Yoram Bachrach, Avishkar Bhoopchand, Kalesha Bullard, Jerome Connor, Vibhavari Dasagi, Bart De Vylder, Edgar Duenez-Guzman, Romuald Elie, Richard Everett, Daniel Hennes, Edward Hughes, Mina Khan, Marc Lanctot, Kate Larson, Guy Lever, Siqi Liu, Luke Marris, Kevin R. McKee, Paul Muller, Julien Perolat, Florian Strub, Andrea Tacchetti, Eugene Tarassov, et al. (2 additional authors not shown)
Abstract: The Game Theory & Multi-Agent team at DeepMind studies several aspects of multi-agent learning ranging from computing approximations to fundamental concepts in game theory to simulating social dilemmas in rich spatial environments and training 3-d humanoids in difficult team coordination tasks. A signature aim of our group is to use the resources and expertise made available to us at DeepMind in deep reinforcement learning to explore multi-agent systems in complex environments and use these benchmarks to advance our understanding. Here, we summarise the recent work of our team and present a taxonomy that we feel highlights many important open challenges in multi-agent research.
Submitted 22 September, 2022; originally announced September 2022.
Comments: Published in AI Communications 2022

arXiv:2206.15378 [cs.AI, cs.GT, cs.MA]
DOI: 10.1126/science.add4679
Mastering the Game of Stratego with Model-Free Multiagent Reinforcement Learning
Authors: Julien Perolat, Bart de Vylder, Daniel Hennes, Eugene Tarassov, Florian Strub, Vincent de Boer, Paul Muller, Jerome T. Connor, Neil Burch, Thomas Anthony, Stephen McAleer, Romuald Elie, Sarah H. Cen, Zhe Wang, Audrunas Gruslys, Aleksandra Malysheva, Mina Khan, Sherjil Ozair, Finbarr Timbers, Toby Pohlen, Tom Eccles, Mark Rowland, Marc Lanctot, Jean-Baptiste Lespiau, Bilal Piot, et al. (9 additional authors not shown)
Abstract: We introduce DeepNash, an autonomous agent capable of learning to play the imperfect information game Stratego from scratch, up to a human expert level. Stratego is one of the few iconic board games that Artificial Intelligence (AI) has not yet mastered. This popular game has an enormous game tree on the order of $10^{535}$ nodes, i.e., $10^{175}$ times larger than that of Go. It has the additional complexity of requiring decision-making under imperfect information, similar to Texas hold'em poker, which has a significantly smaller game tree (on the order of $10^{164}$ nodes). Decisions in Stratego are made over a large number of discrete actions with no obvious link between action and outcome. Episodes are long, with often hundreds of moves before a player wins, and situations in Stratego can not easily be broken down into manageably-sized sub-problems as in poker. For these reasons, Stratego has been a grand challenge for the field of AI for decades, and existing AI methods barely reach an amateur level of play. DeepNash uses a game-theoretic, model-free deep reinforcement learning method, without search, that learns to master Stratego via self-play. The Regularised Nash Dynamics (R-NaD) algorithm, a key component of DeepNash, converges to an approximate Nash equilibrium, instead of 'cycling' around it, by directly modifying the underlying multi-agent learning dynamics. DeepNash beats existing state-of-the-art AI methods in Stratego and achieved a yearly (2022) and all-time top-3 rank on the Gravon games platform, competing with human expert players.
Submitted 30 June, 2022; originally announced June 2022.

arXiv:2206.05825 [cs.LG, cs.AI, cs.GT]
A Unified Approach to Reinforcement Learning, Quantal Response Equilibria, and Two-Player Zero-Sum Games
Authors: Samuel Sokota, Ryan D'Orazio, J. Zico Kolter, Nicolas Loizou, Marc Lanctot, Ioannis Mitliagkas, Noam Brown, Christian Kroer
Abstract: This work studies an algorithm, which we call magnetic mirror descent, that is inspired by mirror descent and the non-Euclidean proximal gradient algorithm. Our contribution is demonstrating the virtues of magnetic mirror descent as both an equilibrium solver and as an approach to reinforcement learning in two-player zero-sum games. These virtues include: 1) Being the first quantal response equilibria solver to achieve linear convergence for extensive-form games with first order feedback; 2) Being the first standard reinforcement learning algorithm to achieve empirically competitive results with CFR in tabular settings; 3) Achieving favorable performance in 3x3 Dark Hex and Phantom Tic-Tac-Toe as a self-play deep reinforcement learning algorithm.
Submitted 11 April, 2023; v1 submitted 12 June, 2022; originally announced June 2022.
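The abstract does not spell out the update itself. As a rough sketch only (our assumption, not the paper's exact formulation), a magnetic-mirror-descent-style step on the probability simplex with a KL mirror map adds a "magnet" regularizer toward a fixed reference distribution on top of the usual mirror-descent step:

```python
import numpy as np

def mmd_step(policy, q_values, magnet, lr=0.1, alpha=0.05):
    """One magnetic-mirror-descent-style update on the simplex (sketch).

    Solves  argmax_pi  lr*<pi, q> - lr*alpha*KL(pi, magnet) - KL(pi, policy)
    in closed form: a geometric mixture of the current policy, the magnet,
    and the exponentiated action values. `magnet`, `lr`, and `alpha` are
    illustrative choices, not the paper's settings.
    """
    logits = np.log(policy) + lr * alpha * np.log(magnet) + lr * q_values
    logits /= 1.0 + lr * alpha
    logits -= logits.max()              # subtract max for numerical stability
    new_policy = np.exp(logits)
    return new_policy / new_policy.sum()
```

The magnet term is what distinguishes this from plain mirror descent; regularizing toward a fixed reference distribution is also what connects the regularized fixed point to quantal-response-style equilibria rather than exact Nash.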
arXiv:2206.04122 [cs.GT, cs.AI, cs.LG, stat.ML]
ESCHER: Eschewing Importance Sampling in Games by Computing a History Value Function to Estimate Regret
Authors: Stephen McAleer, Gabriele Farina, Marc Lanctot, Tuomas Sandholm
Abstract: Recent techniques for approximating Nash equilibria in very large games leverage neural networks to learn approximately optimal policies (strategies). One promising line of research uses neural networks to approximate counterfactual regret minimization (CFR) or its modern variants. DREAM, the only current CFR-based neural method that is model free and therefore scalable to very large games, trains a neural network on an estimated regret target that can have extremely high variance due to an importance sampling term inherited from Monte Carlo CFR (MCCFR). In this paper we propose an unbiased model-free method that does not require any importance sampling. Our method, ESCHER, is principled and is guaranteed to converge to an approximate Nash equilibrium with high probability. We show that the variance of the estimated regret of ESCHER is orders of magnitude lower than DREAM and other baselines. We then show that ESCHER outperforms the prior state of the art -- DREAM and neural fictitious self play (NFSP) -- on a number of games and the difference becomes dramatic as game size increases. In the very large game of dark chess, ESCHER is able to beat DREAM and NFSP in a head-to-head competition over $90\%$ of the time.
Submitted 11 October, 2022; v1 submitted 8 June, 2022; originally announced June 2022.
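To make the variance argument concrete, here is a deliberately toy comparison (our illustration, not the paper's estimator): an importance-weighted target that corrects for rarely sampling a history with a 1/p weight is unbiased but spiky, whereas a hypothetically accurate learned history value returns the same expectation with no sampling noise.

```python
import numpy as np

rng = np.random.default_rng(0)

p = 0.01                                   # chance the sampling policy reaches the history
reached = rng.random(100_000) < p
payoff_when_reached = 1.0

# Outcome-sampling-style target: unbiased thanks to the 1/p correction,
# but most samples are 0 and the rest are 100, so the variance is roughly 1/p.
importance_weighted = np.where(reached, payoff_when_reached / p, 0.0)

# A learned history value function (assumed accurate here) predicts the value directly.
value_prediction = np.full(reached.shape, payoff_when_reached)

print(importance_weighted.mean(), importance_weighted.var())   # ~1.0, ~99
print(value_prediction.mean(), value_prediction.var())         # 1.0, 0.0
```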
arXiv:2205.15879 [cs.AI, cs.GT, cs.LG]
Simplex Neural Population Learning: Any-Mixture Bayes-Optimality in Symmetric Zero-sum Games
Authors: Siqi Liu, Marc Lanctot, Luke Marris, Nicolas Heess
Abstract: Learning to play optimally against any mixture over a diverse set of strategies is of important practical interest in competitive games. In this paper, we propose simplex-NeuPL that satisfies two desiderata simultaneously: i) learning a population of strategically diverse basis policies, represented by a single conditional network; ii) using the same network, learn best-responses to any mixture over the simplex of basis policies. We show that the resulting conditional policies incorporate prior information about their opponents effectively, enabling near optimal returns against arbitrary mixture policies in a game with tractable best-responses. We verify that such policies behave Bayes-optimally under uncertainty and offer insights in using this flexibility at test time. Finally, we offer evidence that learning best-responses to any mixture policies is an effective auxiliary task for strategic exploration, which, by itself, can lead to more performant populations.
Submitted 23 December, 2022; v1 submitted 31 May, 2022; originally announced May 2022.
Journal ref: Proceedings of the 39th International Conference on Machine Learning (ICML 2022)

arXiv:2205.12031 [cs.GT, cs.AI]
Efficient Deviation Types and Learning for Hindsight Rationality in Extensive-Form Games: Corrections
Authors: Dustin Morrill, Ryan D'Orazio, Marc Lanctot, James R. Wright, Michael Bowling, Amy R. Greenwald
Abstract: Hindsight rationality is an approach to playing general-sum games that prescribes no-regret learning dynamics for individual agents with respect to a set of deviations, and further describes jointly rational behavior among multiple agents with mediated equilibria. To develop hindsight rational learning in sequential decision-making settings, we formalize behavioral deviations as a general class of deviations that respect the structure of extensive-form games. Integrating the idea of time selection into counterfactual regret minimization (CFR), we introduce the extensive-form regret minimization (EFR) algorithm that achieves hindsight rationality for any given set of behavioral deviations with computation that scales closely with the complexity of the set. We identify behavioral deviation subsets, the partial sequence deviation types, that subsume previously studied types and lead to efficient EFR instances in games with moderate lengths. In addition, we present a thorough empirical analysis of EFR instantiated with different deviation types in benchmark games, where we find that stronger types typically induce better performance.
Submitted 1 June, 2022; v1 submitted 24 May, 2022; originally announced May 2022.
Comments: Please see version 4 of arXiv:2102.06973 (arXiv:2102.06973v4). This submission was a version of that paper with highlighted corrections. After submitting, I figured out that it would be better to submit this report as another version of arXiv:2102.06973
arXiv:2201.07700 [cs.GT, cs.LG, cs.MA]
Anytime PSRO for Two-Player Zero-Sum Games
Authors: Stephen McAleer, Kevin Wang, John Lanier, Marc Lanctot, Pierre Baldi, Tuomas Sandholm, Roy Fox
Abstract: Policy space response oracles (PSRO) is a multi-agent reinforcement learning algorithm that has achieved state-of-the-art performance in very large two-player zero-sum games. PSRO is based on the tabular double oracle (DO) method, an algorithm that is guaranteed to converge to a Nash equilibrium, but may increase exploitability from one iteration to the next. We propose anytime double oracle (ADO), a tabular double oracle algorithm for 2-player zero-sum games that is guaranteed to converge to a Nash equilibrium while decreasing exploitability from one iteration to the next. Unlike DO, in which the restricted distribution is based on the restricted game formed by each player's strategy sets, ADO finds the restricted distribution for each player that minimizes its exploitability against any policy in the full, unrestricted game. We also propose a method of finding this restricted distribution via a no-regret algorithm updated against best responses, called RM-BR DO. Finally, we propose anytime PSRO (APSRO), a version of ADO that calculates best responses via reinforcement learning. In experiments on Leduc poker and random normal form games, we show that our methods achieve far lower exploitability than DO and PSRO and decrease exploitability monotonically.
Submitted 28 January, 2022; v1 submitted 19 January, 2022; originally announced January 2022.
Comments: Published in AAAI Reinforcement Learning in Games Workshop
arXiv:2112.03178 [cs.AI, cs.GT, cs.LG]
DOI: 10.1126/sciadv.adg3256
Student of Games: A unified learning algorithm for both perfect and imperfect information games
Authors: Martin Schmid, Matej Moravcik, Neil Burch, Rudolf Kadlec, Josh Davidson, Kevin Waugh, Nolan Bard, Finbarr Timbers, Marc Lanctot, G. Zacharias Holland, Elnaz Davoodi, Alden Christianson, Michael Bowling
Abstract: Games have a long history as benchmarks for progress in artificial intelligence. Approaches using search and learning produced strong performance across many perfect information games, and approaches using game-theoretic reasoning and learning demonstrated strong performance for specific imperfect information poker variants. We introduce Student of Games, a general-purpose algorithm that unifies previous approaches, combining guided search, self-play learning, and game-theoretic reasoning. Student of Games achieves strong empirical performance in large perfect and imperfect information games -- an important step towards truly general algorithms for arbitrary environments. We prove that Student of Games is sound, converging to perfect play as available computation and approximation capacity increases. Student of Games reaches strong performance in chess and Go, beats the strongest openly available agent in heads-up no-limit Texas hold'em poker, and defeats the state-of-the-art agent in Scotland Yard, an imperfect information game that illustrates the value of guided search, learning, and game-theoretic reasoning.
Submitted 15 November, 2023; v1 submitted 6 December, 2021; originally announced December 2021.
Comments: Published in Science Advances
Journal ref: Science Advances 9, eadg3256 (2023)

arXiv:2110.14241 [cs.LG, cs.AI, cs.CL, cs.MA, stat.ML]
Dynamic population-based meta-learning for multi-agent communication with natural language
Authors: Abhinav Gupta, Marc Lanctot, Angeliki Lazaridou
Abstract: In this work, our goal is to train agents that can coordinate with seen, unseen as well as human partners in a multi-agent communication environment involving natural language. Previous work using a single set of agents has shown great progress in generalizing to known partners, however it struggles when coordinating with unfamiliar agents. To mitigate that, recent work explored the use of population-based approaches, where multiple agents interact with each other with the goal of learning more generic protocols. These methods, while able to result in good coordination between unseen partners, still only achieve so in cases of simple languages, thus failing to adapt to human partners using natural language. We attribute this to the use of static populations and instead propose a dynamic population-based meta-learning approach that builds such a population in an iterative manner. We perform a holistic evaluation of our method on two different referential games, and show that our agents outperform all prior work when communicating with seen partners and humans. Furthermore, we analyze the natural language generation skills of our agents, where we find that our agents also outperform strong baselines. Finally, we test the robustness of our agents when communicating with out-of-population agents and carefully test the importance of each component of our method through ablation studies.
Submitted 27 October, 2021; originally announced October 2021.
Comments: Accepted at NeurIPS 2021

arXiv:2106.09435 [cs.MA, cs.AI, cs.GT, cs.LG]
Multi-Agent Training beyond Zero-Sum with Correlated Equilibrium Meta-Solvers
Authors: Luke Marris, Paul Muller, Marc Lanctot, Karl Tuyls, Thore Graepel
Abstract: Two-player, constant-sum games are well studied in the literature, but there has been limited progress outside of this setting. We propose Joint Policy-Space Response Oracles (JPSRO), an algorithm for training agents in n-player, general-sum extensive form games, which provably converges to an equilibrium. We further suggest correlated equilibria (CE) as promising meta-solvers, and propose a novel solution concept Maximum Gini Correlated Equilibrium (MGCE), a principled and computationally efficient family of solutions for solving the correlated equilibrium selection problem. We conduct several experiments using CE meta-solvers for JPSRO and demonstrate convergence on n-player, general-sum games.
Submitted 18 April, 2024; v1 submitted 17 June, 2021; originally announced June 2021.
Comments: ICML 2021, 9 pages, coded implementation available in https://github.com/deepmind/open_spiel/ (jpsro.py in examples)
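Correlated equilibria serve as the meta-solvers here. As background, the sketch below checks the defining incentive constraints for a candidate joint distribution in a two-player normal-form game (our illustration; the paper's MGCE additionally selects, among all such distributions, one maximizing a Gini criterion, which is not shown).

```python
import numpy as np

def is_correlated_equilibrium(mu, u_row, u_col, tol=1e-9):
    """Check the CE incentive constraints in a 2-player normal-form game.

    mu[a, b] is a joint distribution over action profiles; u_row and u_col are
    the players' payoff matrices. Conditioned on its recommendation, no player
    may gain in expectation by swapping the recommended action for another.
    """
    for a in range(mu.shape[0]):          # row player's recommendations
        for b in range(mu.shape[0]):
            if mu[a, :] @ (u_row[b, :] - u_row[a, :]) > tol:
                return False
    for a in range(mu.shape[1]):          # column player's recommendations
        for b in range(mu.shape[1]):
            if mu[:, a] @ (u_col[:, b] - u_col[:, a]) > tol:
                return False
    return True

# The classic Chicken example: 1/3 each on (D,C), (C,D), (C,C) passes the check.
u_row = np.array([[0.0, 7.0], [2.0, 6.0]])   # actions ordered (Dare, Chicken)
u_col = u_row.T
mu = np.array([[0.0, 1/3], [1/3, 1/3]])
print(is_correlated_equilibrium(mu, u_row, u_col))   # True
```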
arXiv:2106.01285 [cs.GT]
Sample-based Approximation of Nash in Large Many-Player Games via Gradient Descent
Authors: Ian Gemp, Rahul Savani, Marc Lanctot, Yoram Bachrach, Thomas Anthony, Richard Everett, Andrea Tacchetti, Tom Eccles, János Kramár
Abstract: Nash equilibrium is a central concept in game theory. Several Nash solvers exist, yet none scale to normal-form games with many actions and many players, especially those with payoff tensors too big to be stored in memory. In this work, we propose an approach that iteratively improves an approximation to a Nash equilibrium through joint play. It accomplishes this by tracing a previously established homotopy that defines a continuum of equilibria for the game regularized with decaying levels of entropy. This continuum asymptotically approaches the limiting logit equilibrium, proven by McKelvey and Palfrey (1995) to be unique in almost all games, thereby partially circumventing the well-known equilibrium selection problem of many-player games. To encourage iterates to remain near this path, we efficiently minimize average deviation incentive via stochastic gradient descent, intelligently sampling entries in the payoff tensor as needed. Monte Carlo estimates of the stochastic gradient from joint play are biased due to the appearance of a nonlinear max operator in the objective, so we introduce additional innovations to the algorithm to alleviate gradient bias. The descent process can also be viewed as repeatedly constructing and reacting to a polymatrix approximation to the game. In these ways, our proposed approach, average deviation incentive descent with adaptive sampling (ADIDAS), is most similar to three classical approaches, namely homotopy-type, Lyapunov, and iterative polymatrix solvers. The lack of local convergence guarantees for biased gradient descent prevents guaranteed convergence to Nash, however, we demonstrate through extensive experiments the ability of this approach to approximate a unique Nash in normal-form games with as many as seven players and twenty one actions (several billion outcomes) that are orders of magnitude larger than those possible with prior algorithms.
Submitted 4 February, 2022; v1 submitted 2 June, 2021; originally announced June 2021.
Comments: Published in AAMAS 2022 (code available as part of open_spiel on github -- search ADIDAS in repo)
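The objective named in the abstract, average deviation incentive, has a simple exact form for small games. The sketch below computes it directly from full payoff tensors (our illustration of the quantity being minimized; ADIDAS itself avoids ever materializing these tensors by sampling entries and adding entropy regularization).

```python
import numpy as np

def average_deviation_incentive(payoff_tensors, strategies):
    """Average over players of (best unilateral deviation payoff - current payoff).

    payoff_tensors[i] has one axis per player and holds player i's payoffs;
    strategies[i] is player i's mixed strategy. The quantity is zero exactly
    at a Nash equilibrium of the normal-form game.
    """
    num_players = len(payoff_tensors)
    total = 0.0
    for i, u in enumerate(payoff_tensors):
        # Contract every opponent's strategy (highest axis first so indices
        # of the remaining axes stay valid), leaving player i's axis free.
        dev = u
        for j in reversed(range(num_players)):
            if j != i:
                dev = np.tensordot(dev, strategies[j], axes=([j], [0]))
        total += dev.max() - strategies[i] @ dev
    return total / num_players

# At the uniform profile of Rock, Paper, Scissors the incentive to deviate is zero.
RPS = np.array([[0, -1, 1], [1, 0, -1], [-1, 1, 0]], dtype=float)
uniform = np.ones(3) / 3.0
print(average_deviation_incentive([RPS, -RPS], [uniform, uniform]))   # 0.0
```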
Wright</a>, <a href="/search/cs?searchtype=author&query=Bowling%2C+M">Michael Bowling</a>, <a href="/search/cs?searchtype=author&query=Greenwald%2C+A">Amy Greenwald</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2102.06973v7-abstract-short" style="display: inline;"> Hindsight rationality is an approach to playing general-sum games that prescribes no-regret learning dynamics for individual agents with respect to a set of deviations, and further describes jointly rational behavior among multiple agents with mediated equilibria. To develop hindsight rational learning in sequential decision-making settings, we formalize behavioral deviations as a general class of… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.06973v7-abstract-full').style.display = 'inline'; document.getElementById('2102.06973v7-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2102.06973v7-abstract-full" style="display: none;"> Hindsight rationality is an approach to playing general-sum games that prescribes no-regret learning dynamics for individual agents with respect to a set of deviations, and further describes jointly rational behavior among multiple agents with mediated equilibria. To develop hindsight rational learning in sequential decision-making settings, we formalize behavioral deviations as a general class of deviations that respect the structure of extensive-form games. Integrating the idea of time selection into counterfactual regret minimization (CFR), we introduce the extensive-form regret minimization (EFR) algorithm that achieves hindsight rationality for any given set of behavioral deviations with computation that scales closely with the complexity of the set. We identify behavioral deviation subsets, the partial sequence deviation types, that subsume previously studied types and lead to efficient EFR instances in games with moderate lengths. In addition, we present a thorough empirical analysis of EFR instantiated with different deviation types in benchmark games, where we find that stronger types typically induce better performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.06973v7-abstract-full').style.display = 'none'; document.getElementById('2102.06973v7-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 June, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 February, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Corrected technical report for the paper with the same title in the proceedings of the thirty-eighth International Conference on Machine Learning (ICML 2021), virtual. Compared to v5, this version removes the version indicator from an arXiv reference. 
43 pages and 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2101.04237">arXiv:2101.04237</a> <span> [<a href="https://arxiv.org/pdf/2101.04237">pdf</a>, <a href="https://arxiv.org/format/2101.04237">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Solving Common-Payoff Games with Approximate Policy Iteration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sokota%2C+S">Samuel Sokota</a>, <a href="/search/cs?searchtype=author&query=Lockhart%2C+E">Edward Lockhart</a>, <a href="/search/cs?searchtype=author&query=Timbers%2C+F">Finbarr Timbers</a>, <a href="/search/cs?searchtype=author&query=Davoodi%2C+E">Elnaz Davoodi</a>, <a href="/search/cs?searchtype=author&query=D%27Orazio%2C+R">Ryan D'Orazio</a>, <a href="/search/cs?searchtype=author&query=Burch%2C+N">Neil Burch</a>, <a href="/search/cs?searchtype=author&query=Schmid%2C+M">Martin Schmid</a>, <a href="/search/cs?searchtype=author&query=Bowling%2C+M">Michael Bowling</a>, <a href="/search/cs?searchtype=author&query=Lanctot%2C+M">Marc Lanctot</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2101.04237v1-abstract-short" style="display: inline;"> For artificially intelligent learning systems to have widespread applicability in real-world settings, it is important that they be able to operate decentrally. Unfortunately, decentralized control is difficult -- computing even an epsilon-optimal joint policy is a NEXP complete problem. Nevertheless, a recently rediscovered insight -- that a team of agents can coordinate via common knowledge -- h… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2101.04237v1-abstract-full').style.display = 'inline'; document.getElementById('2101.04237v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2101.04237v1-abstract-full" style="display: none;"> For artificially intelligent learning systems to have widespread applicability in real-world settings, it is important that they be able to operate decentrally. Unfortunately, decentralized control is difficult -- computing even an epsilon-optimal joint policy is a NEXP complete problem. Nevertheless, a recently rediscovered insight -- that a team of agents can coordinate via common knowledge -- has given rise to algorithms capable of finding optimal joint policies in small common-payoff games. The Bayesian action decoder (BAD) leverages this insight and deep reinforcement learning to scale to games as large as two-player Hanabi. However, the approximations it uses to do so prevent it from discovering optimal joint policies even in games small enough to brute force optimal solutions. This work proposes CAPI, a novel algorithm which, like BAD, combines common knowledge with deep reinforcement learning. However, unlike BAD, CAPI prioritizes the propensity to discover optimal joint policies over scalability. 
While this choice precludes CAPI from scaling to games as large as Hanabi, empirical results demonstrate that, on the games to which CAPI does scale, it is capable of discovering optimal joint policies even when other modern multi-agent reinforcement learning algorithms are unable to do so. Code is available at https://github.com/ssokota/capi . <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2101.04237v1-abstract-full').style.display = 'none'; document.getElementById('2101.04237v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 January, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">AAAI 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2012.05874">arXiv:2012.05874</a> <span> [<a href="https://arxiv.org/pdf/2012.05874">pdf</a>, <a href="https://arxiv.org/format/2012.05874">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Hindsight and Sequential Rationality of Correlated Play </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Morrill%2C+D">Dustin Morrill</a>, <a href="/search/cs?searchtype=author&query=D%27Orazio%2C+R">Ryan D'Orazio</a>, <a href="/search/cs?searchtype=author&query=Sarfati%2C+R">Reca Sarfati</a>, <a href="/search/cs?searchtype=author&query=Lanctot%2C+M">Marc Lanctot</a>, <a href="/search/cs?searchtype=author&query=Wright%2C+J+R">James R. Wright</a>, <a href="/search/cs?searchtype=author&query=Greenwald%2C+A">Amy Greenwald</a>, <a href="/search/cs?searchtype=author&query=Bowling%2C+M">Michael Bowling</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2012.05874v7-abstract-short" style="display: inline;"> Driven by recent successes in two-player, zero-sum game solving and playing, artificial intelligence work on games has increasingly focused on algorithms that produce equilibrium-based strategies. However, this approach has been less effective at producing competent players in general-sum games or those with more than two players than in two-player, zero-sum games. An appealing alternative is to c… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2012.05874v7-abstract-full').style.display = 'inline'; document.getElementById('2012.05874v7-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2012.05874v7-abstract-full" style="display: none;"> Driven by recent successes in two-player, zero-sum game solving and playing, artificial intelligence work on games has increasingly focused on algorithms that produce equilibrium-based strategies. 
However, this approach has been less effective at producing competent players in general-sum games or those with more than two players than in two-player, zero-sum games. An appealing alternative is to consider adaptive algorithms that ensure strong performance in hindsight relative to what could have been achieved with modified behavior. This approach also leads to a game-theoretic analysis, but in the correlated play that arises from joint learning dynamics rather than factored agent behavior at equilibrium. We develop and advocate for this hindsight rationality framing of learning in general sequential decision-making settings. To this end, we re-examine mediated equilibrium and deviation types in extensive-form games, thereby gaining a more complete understanding and resolving past misconceptions. We present a set of examples illustrating the distinct strengths and weaknesses of each type of equilibrium in the literature, and prove that no tractable concept subsumes all others. This line of inquiry culminates in the definition of the deviation and equilibrium classes that correspond to algorithms in the counterfactual regret minimization (CFR) family, relating them to all others in the literature. Examining CFR in greater detail further leads to a new recursive definition of rationality in correlated play that extends sequential rationality in a way that naturally applies to hindsight evaluation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2012.05874v7-abstract-full').style.display = 'none'; document.getElementById('2012.05874v7-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 June, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 December, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Corrected technical report for the paper with the same title in the proceedings of the thirty-fifth AAAI Conference on Artificial Intelligence (AAAI-21), February 2-9, 2021, Virtual. Compared to v5, this version fixes the realized terminal history indicators in the diagram describing MacQueen's counterexample. 
27 pages and 16 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2010.10380">arXiv:2010.10380</a> <span> [<a href="https://arxiv.org/pdf/2010.10380">pdf</a>, <a href="https://arxiv.org/format/2010.10380">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> Negotiating Team Formation Using Deep Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bachrach%2C+Y">Yoram Bachrach</a>, <a href="/search/cs?searchtype=author&query=Everett%2C+R">Richard Everett</a>, <a href="/search/cs?searchtype=author&query=Hughes%2C+E">Edward Hughes</a>, <a href="/search/cs?searchtype=author&query=Lazaridou%2C+A">Angeliki Lazaridou</a>, <a href="/search/cs?searchtype=author&query=Leibo%2C+J+Z">Joel Z. Leibo</a>, <a href="/search/cs?searchtype=author&query=Lanctot%2C+M">Marc Lanctot</a>, <a href="/search/cs?searchtype=author&query=Johanson%2C+M">Michael Johanson</a>, <a href="/search/cs?searchtype=author&query=Czarnecki%2C+W+M">Wojciech M. Czarnecki</a>, <a href="/search/cs?searchtype=author&query=Graepel%2C+T">Thore Graepel</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2010.10380v1-abstract-short" style="display: inline;"> When autonomous agents interact in the same environment, they must often cooperate to achieve their goals. One way for agents to cooperate effectively is to form a team, make a binding agreement on a joint plan, and execute it. However, when agents are self-interested, the gains from team formation must be allocated appropriately to incentivize agreement. Various approaches for multi-agent negotia… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2010.10380v1-abstract-full').style.display = 'inline'; document.getElementById('2010.10380v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2010.10380v1-abstract-full" style="display: none;"> When autonomous agents interact in the same environment, they must often cooperate to achieve their goals. One way for agents to cooperate effectively is to form a team, make a binding agreement on a joint plan, and execute it. However, when agents are self-interested, the gains from team formation must be allocated appropriately to incentivize agreement. Various approaches for multi-agent negotiation have been proposed, but typically only work for particular negotiation protocols. More general methods usually require human input or domain-specific data, and so do not scale. To address this, we propose a framework for training agents to negotiate and form teams using deep reinforcement learning. Importantly, our method makes no assumptions about the specific negotiation protocol, and is instead completely experience driven. 
We evaluate our approach on both non-spatial and spatially extended team-formation negotiation environments, demonstrating that our agents beat hand-crafted bots and reach negotiation outcomes consistent with fair solutions predicted by cooperative game theory. Additionally, we investigate how the physical location of agents influences negotiation outcomes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2010.10380v1-abstract-full').style.display = 'none'; document.getElementById('2010.10380v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 October, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.6 </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Artificial Intelligence 288 (2020): 103356 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2008.12234">arXiv:2008.12234</a> <span> [<a href="https://arxiv.org/pdf/2008.12234">pdf</a>, <a href="https://arxiv.org/format/2008.12234">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> The Advantage Regret-Matching Actor-Critic </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gruslys%2C+A">Audrūnas Gruslys</a>, <a href="/search/cs?searchtype=author&query=Lanctot%2C+M">Marc Lanctot</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">Rémi Munos</a>, <a href="/search/cs?searchtype=author&query=Timbers%2C+F">Finbarr Timbers</a>, <a href="/search/cs?searchtype=author&query=Schmid%2C+M">Martin Schmid</a>, <a href="/search/cs?searchtype=author&query=Perolat%2C+J">Julien Perolat</a>, <a href="/search/cs?searchtype=author&query=Morrill%2C+D">Dustin Morrill</a>, <a href="/search/cs?searchtype=author&query=Zambaldi%2C+V">Vinicius Zambaldi</a>, <a href="/search/cs?searchtype=author&query=Lespiau%2C+J">Jean-Baptiste Lespiau</a>, <a href="/search/cs?searchtype=author&query=Schultz%2C+J">John Schultz</a>, <a href="/search/cs?searchtype=author&query=Azar%2C+M+G">Mohammad Gheshlaghi Azar</a>, <a href="/search/cs?searchtype=author&query=Bowling%2C+M">Michael Bowling</a>, <a href="/search/cs?searchtype=author&query=Tuyls%2C+K">Karl Tuyls</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2008.12234v1-abstract-short" style="display: inline;"> Regret minimization has played a key role in online learning, equilibrium computation in games, and reinforcement learning (RL). In this paper, we describe a general model-free RL method for no-regret learning based on repeated reconsideration of past behavior.
We propose a model-free RL algorithm, the Advantage Regret-Matching Actor-Critic (ARMAC): rather than saving past state-action data, ARMAC… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2008.12234v1-abstract-full').style.display = 'inline'; document.getElementById('2008.12234v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2008.12234v1-abstract-full" style="display: none;"> Regret minimization has played a key role in online learning, equilibrium computation in games, and reinforcement learning (RL). In this paper, we describe a general model-free RL method for no-regret learning based on repeated reconsideration of past behavior. We propose a model-free RL algorithm, the Advantage Regret-Matching Actor-Critic (ARMAC): rather than saving past state-action data, ARMAC saves a buffer of past policies, replaying through them to reconstruct hindsight assessments of past behavior. These retrospective value estimates are used to predict conditional advantages which, combined with regret matching, produce a new policy. In particular, ARMAC learns from sampled trajectories in a centralized training setting, without requiring the application of importance sampling commonly used in Monte Carlo counterfactual regret (CFR) minimization; hence, it does not suffer from excessive variance in large environments. In the single-agent setting, ARMAC shows an interesting form of exploration by keeping past policies intact. In the multiagent setting, ARMAC in self-play approaches Nash equilibria on some partially-observable zero-sum benchmarks. We provide exploitability estimates in the significantly larger game of betting-abstracted no-limit Texas Hold'em. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2008.12234v1-abstract-full').style.display = 'none'; document.getElementById('2008.12234v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 August, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2020. </p> </li>
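<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Illustration:</span> <span class="has-text-grey-dark">The ARMAC abstract above turns predicted conditional advantages into a new policy via regret matching. The sketch below shows only that single step in a stripped-down tabular form; the function name and example numbers are invented for illustration, and this is not the paper's actor-critic implementation.</span> </p> <pre>
# Rough sketch (hypothetical helper, not code from the ARMAC paper): regret
# matching applied to per-action advantage estimates at one decision point.
import numpy as np

def regret_matching_policy(advantages):
    """Normalize the positive parts of advantage estimates into a policy.

    The positive parts play the role of cumulative regrets; if no action has
    a positive advantage, fall back to the uniform policy.
    """
    positive = np.maximum(advantages, 0.0)
    total = positive.sum()
    if total == 0.0:
        return np.full_like(advantages, 1.0 / len(advantages))
    return positive / total

# Example with made-up advantage estimates for three actions:
# the resulting policy puts probability 0.8, 0.0, 0.2 on them.
print(regret_matching_policy(np.array([0.4, -0.1, 0.1])))
</pre>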
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2006.08740">arXiv:2006.08740</a> <span> [<a href="https://arxiv.org/pdf/2006.08740">pdf</a>, <a href="https://arxiv.org/format/2006.08740">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> </div> </div> <p class="title is-5 mathjax"> Sound Algorithms in Imperfect Information Games </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=%C5%A0ustr%2C+M">Michal Šustr</a>, <a href="/search/cs?searchtype=author&query=Schmid%2C+M">Martin Schmid</a>, <a href="/search/cs?searchtype=author&query=Morav%C4%8D%C3%ADk%2C+M">Matej Moravčík</a>, <a href="/search/cs?searchtype=author&query=Burch%2C+N">Neil Burch</a>, <a href="/search/cs?searchtype=author&query=Lanctot%2C+M">Marc Lanctot</a>, <a href="/search/cs?searchtype=author&query=Bowling%2C+M">Michael Bowling</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2006.08740v2-abstract-short" style="display: inline;"> Search has played a fundamental role in computer game research since the very beginning. And while online search has been commonly used in perfect information games such as Chess and Go, online search methods for imperfect information games have only been introduced relatively recently. This paper addresses the question of what a sound online algorithm is in an imperfect information setting of two… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2006.08740v2-abstract-full').style.display = 'inline'; document.getElementById('2006.08740v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2006.08740v2-abstract-full" style="display: none;"> Search has played a fundamental role in computer game research since the very beginning. And while online search has been commonly used in perfect information games such as Chess and Go, online search methods for imperfect information games have only been introduced relatively recently. This paper addresses the question of what a sound online algorithm is in an imperfect information setting of two-player zero-sum games. We argue that the fixed-strategy definitions of exploitability and $ε$-Nash equilibria are ill-suited to measure an online algorithm's worst-case performance. We thus formalize $ε$-soundness, a concept that connects the worst-case performance of an online algorithm to the performance of an $ε$-Nash equilibrium. As $ε$-soundness can be difficult to compute in general, we introduce a consistency framework -- a hierarchy that connects an online algorithm's behavior to a Nash equilibrium. These multiple levels of consistency describe in what sense an online algorithm plays "just like a fixed Nash equilibrium". These notions further illustrate the difference between perfect and imperfect information settings, as the same consistency guarantees have different worst-case online performance in perfect and imperfect information games. The definitions of soundness and the consistency hierarchy finally provide appropriate tools to analyze online algorithms in repeated imperfect information games.
We thus inspect some of the previous online algorithms in a new light, bringing new insights into their worst-case performance guarantees. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2006.08740v2-abstract-full').style.display = 'none'; document.getElementById('2006.08740v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 March, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 June, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to AAMAS 2021 as extended abstract (Ref. numbers not available yet)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2006.04635">arXiv:2006.04635</a> <span> [<a href="https://arxiv.org/pdf/2006.04635">pdf</a>, <a href="https://arxiv.org/format/2006.04635">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Learning to Play No-Press Diplomacy with Best Response Policy Iteration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Anthony%2C+T">Thomas Anthony</a>, <a href="/search/cs?searchtype=author&query=Eccles%2C+T">Tom Eccles</a>, <a href="/search/cs?searchtype=author&query=Tacchetti%2C+A">Andrea Tacchetti</a>, <a href="/search/cs?searchtype=author&query=Kram%C3%A1r%2C+J">János Kramár</a>, <a href="/search/cs?searchtype=author&query=Gemp%2C+I">Ian Gemp</a>, <a href="/search/cs?searchtype=author&query=Hudson%2C+T+C">Thomas C. Hudson</a>, <a href="/search/cs?searchtype=author&query=Porcel%2C+N">Nicolas Porcel</a>, <a href="/search/cs?searchtype=author&query=Lanctot%2C+M">Marc Lanctot</a>, <a href="/search/cs?searchtype=author&query=P%C3%A9rolat%2C+J">Julien Pérolat</a>, <a href="/search/cs?searchtype=author&query=Everett%2C+R">Richard Everett</a>, <a href="/search/cs?searchtype=author&query=Werpachowski%2C+R">Roman Werpachowski</a>, <a href="/search/cs?searchtype=author&query=Singh%2C+S">Satinder Singh</a>, <a href="/search/cs?searchtype=author&query=Graepel%2C+T">Thore Graepel</a>, <a href="/search/cs?searchtype=author&query=Bachrach%2C+Y">Yoram Bachrach</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2006.04635v4-abstract-short" style="display: inline;"> Recent advances in deep reinforcement learning (RL) have led to considerable progress in many 2-player zero-sum games, such as Go, Poker and Starcraft.
The purely adversarial nature of such games allows for conceptually simple and principled application of RL methods. However real-world settings are many-agent, and agent interactions are complex mixtures of common-interest and competitive aspects.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2006.04635v4-abstract-full').style.display = 'inline'; document.getElementById('2006.04635v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2006.04635v4-abstract-full" style="display: none;"> Recent advances in deep reinforcement learning (RL) have led to considerable progress in many 2-player zero-sum games, such as Go, Poker and Starcraft. The purely adversarial nature of such games allows for conceptually simple and principled application of RL methods. However real-world settings are many-agent, and agent interactions are complex mixtures of common-interest and competitive aspects. We consider Diplomacy, a 7-player board game designed to accentuate dilemmas resulting from many-agent interactions. It also features a large combinatorial action space and simultaneous moves, which are challenging for RL algorithms. We propose a simple yet effective approximate best response operator, designed to handle large combinatorial action spaces and simultaneous moves. We also introduce a family of policy iteration methods that approximate fictitious play. With these methods, we successfully apply RL to Diplomacy: we show that our agents convincingly outperform the previous state-of-the-art, and game theoretic equilibrium analysis shows that the new process yields consistent improvements. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2006.04635v4-abstract-full').style.display = 'none'; document.getElementById('2006.04635v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 January, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 June, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2020. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2004.09677">arXiv:2004.09677</a> <span> [<a href="https://arxiv.org/pdf/2004.09677">pdf</a>, <a href="https://arxiv.org/format/2004.09677">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Approximate exploitability: Learning a best response in large games </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Timbers%2C+F">Finbarr Timbers</a>, <a href="/search/cs?searchtype=author&query=Bard%2C+N">Nolan Bard</a>, <a href="/search/cs?searchtype=author&query=Lockhart%2C+E">Edward Lockhart</a>, <a href="/search/cs?searchtype=author&query=Lanctot%2C+M">Marc Lanctot</a>, <a href="/search/cs?searchtype=author&query=Schmid%2C+M">Martin Schmid</a>, <a href="/search/cs?searchtype=author&query=Burch%2C+N">Neil Burch</a>, <a href="/search/cs?searchtype=author&query=Schrittwieser%2C+J">Julian Schrittwieser</a>, <a href="/search/cs?searchtype=author&query=Hubert%2C+T">Thomas Hubert</a>, <a href="/search/cs?searchtype=author&query=Bowling%2C+M">Michael Bowling</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2004.09677v5-abstract-short" style="display: inline;"> Researchers have demonstrated that neural networks are vulnerable to adversarial examples and subtle environment changes, both of which one can view as a form of distribution shift. To humans, the resulting errors can look like blunders, eroding trust in these agents. In prior games research, agent evaluation often focused on the in-practice game outcomes. While valuable, such evaluation typically… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2004.09677v5-abstract-full').style.display = 'inline'; document.getElementById('2004.09677v5-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2004.09677v5-abstract-full" style="display: none;"> Researchers have demonstrated that neural networks are vulnerable to adversarial examples and subtle environment changes, both of which one can view as a form of distribution shift. To humans, the resulting errors can look like blunders, eroding trust in these agents. In prior games research, agent evaluation often focused on the in-practice game outcomes. While valuable, such evaluation typically fails to evaluate robustness to worst-case outcomes. Prior research in computer poker has examined how to assess such worst-case performance, both exactly and approximately. Unfortunately, exact computation is infeasible with larger domains, and existing approximations rely on poker-specific knowledge. We introduce ISMCTS-BR, a scalable search-based deep reinforcement learning algorithm for learning a best response to an agent, thereby approximating worst-case performance. We demonstrate the technique in several two-player zero-sum games against a variety of agents, including several AlphaZero-based agents. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2004.09677v5-abstract-full').style.display = 'none'; document.getElementById('2004.09677v5-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 April, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2020. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2002.08456">arXiv:2002.08456</a> <span> [<a href="https://arxiv.org/pdf/2002.08456">pdf</a>, <a href="https://arxiv.org/format/2002.08456">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> From Poincaré Recurrence to Convergence in Imperfect Information Games: Finding Equilibrium via Regularization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Perolat%2C+J">Julien Perolat</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">Remi Munos</a>, <a href="/search/cs?searchtype=author&query=Lespiau%2C+J">Jean-Baptiste Lespiau</a>, <a href="/search/cs?searchtype=author&query=Omidshafiei%2C+S">Shayegan Omidshafiei</a>, <a href="/search/cs?searchtype=author&query=Rowland%2C+M">Mark Rowland</a>, <a href="/search/cs?searchtype=author&query=Ortega%2C+P">Pedro Ortega</a>, <a href="/search/cs?searchtype=author&query=Burch%2C+N">Neil Burch</a>, <a href="/search/cs?searchtype=author&query=Anthony%2C+T">Thomas Anthony</a>, <a href="/search/cs?searchtype=author&query=Balduzzi%2C+D">David Balduzzi</a>, <a href="/search/cs?searchtype=author&query=De+Vylder%2C+B">Bart De Vylder</a>, <a href="/search/cs?searchtype=author&query=Piliouras%2C+G">Georgios Piliouras</a>, <a href="/search/cs?searchtype=author&query=Lanctot%2C+M">Marc Lanctot</a>, <a href="/search/cs?searchtype=author&query=Tuyls%2C+K">Karl Tuyls</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2002.08456v1-abstract-short" style="display: inline;"> In this paper we investigate the Follow the Regularized Leader dynamics in sequential imperfect information games (IIG). We generalize existing results of Poincaré recurrence from normal-form games to zero-sum two-player imperfect information games and other sequential game settings. We then investigate how adapting the reward (by adding a regularization term) of the game can give strong convergen… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2002.08456v1-abstract-full').style.display = 'inline'; document.getElementById('2002.08456v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2002.08456v1-abstract-full" style="display: none;"> In this paper we investigate the Follow the Regularized Leader dynamics in sequential imperfect information games (IIG).
We generalize existing results of Poincaré recurrence from normal-form games to zero-sum two-player imperfect information games and other sequential game settings. We then investigate how adapting the reward (by adding a regularization term) of the game can give strong convergence guarantees in monotone games. We continue by showing how this reward adaptation technique can be leveraged to build algorithms that converge exactly to the Nash equilibrium. Finally, we show how these insights can be directly used to build state-of-the-art model-free algorithms for zero-sum two-player Imperfect Information Games (IIG). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2002.08456v1-abstract-full').style.display = 'none'; document.getElementById('2002.08456v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 February, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">43 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1909.12823">arXiv:1909.12823</a> <span> [<a href="https://arxiv.org/pdf/1909.12823">pdf</a>, <a href="https://arxiv.org/format/1909.12823">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> A Generalized Training Approach for Multiagent Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Muller%2C+P">Paul Muller</a>, <a href="/search/cs?searchtype=author&query=Omidshafiei%2C+S">Shayegan Omidshafiei</a>, <a href="/search/cs?searchtype=author&query=Rowland%2C+M">Mark Rowland</a>, <a href="/search/cs?searchtype=author&query=Tuyls%2C+K">Karl Tuyls</a>, <a href="/search/cs?searchtype=author&query=Perolat%2C+J">Julien Perolat</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Siqi Liu</a>, <a href="/search/cs?searchtype=author&query=Hennes%2C+D">Daniel Hennes</a>, <a href="/search/cs?searchtype=author&query=Marris%2C+L">Luke Marris</a>, <a href="/search/cs?searchtype=author&query=Lanctot%2C+M">Marc Lanctot</a>, <a href="/search/cs?searchtype=author&query=Hughes%2C+E">Edward Hughes</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhe Wang</a>, <a href="/search/cs?searchtype=author&query=Lever%2C+G">Guy Lever</a>, <a href="/search/cs?searchtype=author&query=Heess%2C+N">Nicolas Heess</a>, <a href="/search/cs?searchtype=author&query=Graepel%2C+T">Thore Graepel</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">Remi Munos</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1909.12823v2-abstract-short" style="display: inline;"> This paper investigates a population-based training regime based on game-theoretic principles called Policy-Space Response Oracles (PSRO).
PSRO is general in the sense that it (1) encompasses well-known algorithms such as fictitious play and double oracle as special cases, and (2) in principle applies to general-sum, many-player games. Despite this, prior studies of PSRO have been focused on two-… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1909.12823v2-abstract-full').style.display = 'inline'; document.getElementById('1909.12823v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1909.12823v2-abstract-full" style="display: none;"> This paper investigates a population-based training regime based on game-theoretic principles called Policy-Space Response Oracles (PSRO). PSRO is general in the sense that it (1) encompasses well-known algorithms such as fictitious play and double oracle as special cases, and (2) in principle applies to general-sum, many-player games. Despite this, prior studies of PSRO have been focused on two-player zero-sum games, a regime wherein Nash equilibria are tractably computable. In moving from two-player zero-sum games to more general settings, computation of Nash equilibria quickly becomes infeasible. Here, we extend the theoretical underpinnings of PSRO by considering an alternative solution concept, $α$-Rank, which is unique (thus faces no equilibrium selection issues, unlike Nash) and applies readily to general-sum, many-player settings. We establish convergence guarantees in several game classes, and identify links between Nash equilibria and $α$-Rank. We demonstrate the competitive performance of $α$-Rank-based PSRO against an exact Nash solver-based PSRO in 2-player Kuhn and Leduc Poker. We then go beyond the reach of prior PSRO applications by considering 3- to 5-player poker games, yielding instances where $α$-Rank achieves faster convergence than approximate Nash solvers, thus establishing it as a favorable general games solver. We also carry out an initial empirical validation in MuJoCo soccer, illustrating the feasibility of the proposed approach in another complex domain. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1909.12823v2-abstract-full').style.display = 'none'; document.getElementById('1909.12823v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 February, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 September, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2019. </p> </li>
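<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Illustration:</span> <span class="has-text-grey-dark">The PSRO entry above uses $α$-Rank as its meta-solver; $α$-Rank (also listed further down this page) ranks strategies by the stationary distribution of a Markov chain built from an empirical payoff table. The snippet below is only a rough, self-contained single-population sketch of that construction, with made-up payoffs and modest parameter values; it is not the authors' implementation.</span> </p> <pre>
# Rough single-population sketch of an alpha-Rank-style ranking (illustrative
# only; the payoffs and parameter values are invented, not from the paper).
import numpy as np

def alpha_rank_stationary(payoffs, alpha=1.0, pop_size=20):
    """Stationary distribution of the strategy-switching Markov chain.

    payoffs[i, j] is the empirical payoff of strategy i against strategy j
    in a symmetric two-player game; larger stationary mass means a higher
    rank. Larger alpha sharpens selection, at the cost of numerical range
    in the exponentials.
    """
    n = payoffs.shape[0]
    transition = np.zeros((n, n))
    for resident in range(n):
        for mutant in range(n):
            if mutant == resident:
                continue
            gain = payoffs[mutant, resident] - payoffs[resident, mutant]
            if np.isclose(gain, 0.0):
                fixation = 1.0 / pop_size          # neutral mutant
            else:
                fixation = (1.0 - np.exp(-alpha * gain)) / (
                    1.0 - np.exp(-alpha * pop_size * gain))
            transition[resident, mutant] = fixation / (n - 1)
        transition[resident, resident] = 1.0 - transition[resident].sum()
    dist = np.full(n, 1.0 / n)
    for _ in range(20000):                         # power iteration
        dist = dist @ transition
    return dist / dist.sum()

# Toy cyclic 3-strategy empirical game; the ranking comes out roughly uniform.
payoffs = np.array([[0.0, 1.0, -1.0],
                    [-1.0, 0.0, 1.0],
                    [1.0, -1.0, 0.0]])
print(alpha_rank_stationary(payoffs))
</pre>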
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1908.09453">arXiv:1908.09453</a> <span> [<a href="https://arxiv.org/pdf/1908.09453">pdf</a>, <a href="https://arxiv.org/format/1908.09453">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> OpenSpiel: A Framework for Reinforcement Learning in Games </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lanctot%2C+M">Marc Lanctot</a>, <a href="/search/cs?searchtype=author&query=Lockhart%2C+E">Edward Lockhart</a>, <a href="/search/cs?searchtype=author&query=Lespiau%2C+J">Jean-Baptiste Lespiau</a>, <a href="/search/cs?searchtype=author&query=Zambaldi%2C+V">Vinicius Zambaldi</a>, <a href="/search/cs?searchtype=author&query=Upadhyay%2C+S">Satyaki Upadhyay</a>, <a href="/search/cs?searchtype=author&query=P%C3%A9rolat%2C+J">Julien Pérolat</a>, <a href="/search/cs?searchtype=author&query=Srinivasan%2C+S">Sriram Srinivasan</a>, <a href="/search/cs?searchtype=author&query=Timbers%2C+F">Finbarr Timbers</a>, <a href="/search/cs?searchtype=author&query=Tuyls%2C+K">Karl Tuyls</a>, <a href="/search/cs?searchtype=author&query=Omidshafiei%2C+S">Shayegan Omidshafiei</a>, <a href="/search/cs?searchtype=author&query=Hennes%2C+D">Daniel Hennes</a>, <a href="/search/cs?searchtype=author&query=Morrill%2C+D">Dustin Morrill</a>, <a href="/search/cs?searchtype=author&query=Muller%2C+P">Paul Muller</a>, <a href="/search/cs?searchtype=author&query=Ewalds%2C+T">Timo Ewalds</a>, <a href="/search/cs?searchtype=author&query=Faulkner%2C+R">Ryan Faulkner</a>, <a href="/search/cs?searchtype=author&query=Kram%C3%A1r%2C+J">János Kramár</a>, <a href="/search/cs?searchtype=author&query=De+Vylder%2C+B">Bart De Vylder</a>, <a href="/search/cs?searchtype=author&query=Saeta%2C+B">Brennan Saeta</a>, <a href="/search/cs?searchtype=author&query=Bradbury%2C+J">James Bradbury</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+D">David Ding</a>, <a href="/search/cs?searchtype=author&query=Borgeaud%2C+S">Sebastian Borgeaud</a>, <a href="/search/cs?searchtype=author&query=Lai%2C+M">Matthew Lai</a>, <a href="/search/cs?searchtype=author&query=Schrittwieser%2C+J">Julian Schrittwieser</a>, <a href="/search/cs?searchtype=author&query=Anthony%2C+T">Thomas Anthony</a>, <a href="/search/cs?searchtype=author&query=Hughes%2C+E">Edward Hughes</a> , et al. (2 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1908.09453v6-abstract-short" style="display: inline;"> OpenSpiel is a collection of environments and algorithms for research in general reinforcement learning and search/planning in games.
OpenSpiel supports n-player (single- and multi- agent) zero-sum, cooperative and general-sum, one-shot and sequential, strictly turn-taking and simultaneous-move, perfect and imperfect information games, as well as traditional multiagent environments such as (partia… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1908.09453v6-abstract-full').style.display = 'inline'; document.getElementById('1908.09453v6-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1908.09453v6-abstract-full" style="display: none;"> OpenSpiel is a collection of environments and algorithms for research in general reinforcement learning and search/planning in games. OpenSpiel supports n-player (single- and multi- agent) zero-sum, cooperative and general-sum, one-shot and sequential, strictly turn-taking and simultaneous-move, perfect and imperfect information games, as well as traditional multiagent environments such as (partially- and fully- observable) grid worlds and social dilemmas. OpenSpiel also includes tools to analyze learning dynamics and other common evaluation metrics. This document serves both as an overview of the code base and an introduction to the terminology, core concepts, and algorithms across the fields of reinforcement learning, computational game theory, and search. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1908.09453v6-abstract-full').style.display = 'none'; document.getElementById('1908.09453v6-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 September, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 August, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2019. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1906.00190">arXiv:1906.00190</a> <span> [<a href="https://arxiv.org/pdf/1906.00190">pdf</a>, <a href="https://arxiv.org/format/1906.00190">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Neural Replicator Dynamics </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hennes%2C+D">Daniel Hennes</a>, <a href="/search/cs?searchtype=author&query=Morrill%2C+D">Dustin Morrill</a>, <a href="/search/cs?searchtype=author&query=Omidshafiei%2C+S">Shayegan Omidshafiei</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">Remi Munos</a>, <a href="/search/cs?searchtype=author&query=Perolat%2C+J">Julien Perolat</a>, <a href="/search/cs?searchtype=author&query=Lanctot%2C+M">Marc Lanctot</a>, <a href="/search/cs?searchtype=author&query=Gruslys%2C+A">Audrunas Gruslys</a>, <a href="/search/cs?searchtype=author&query=Lespiau%2C+J">Jean-Baptiste Lespiau</a>, <a href="/search/cs?searchtype=author&query=Parmas%2C+P">Paavo Parmas</a>, <a href="/search/cs?searchtype=author&query=Duenez-Guzman%2C+E">Edgar Duenez-Guzman</a>, <a href="/search/cs?searchtype=author&query=Tuyls%2C+K">Karl Tuyls</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1906.00190v5-abstract-short" style="display: inline;"> Policy gradient and actor-critic algorithms form the basis of many commonly used training techniques in deep reinforcement learning. Using these algorithms in multiagent environments poses problems such as nonstationarity and instability. In this paper, we first demonstrate that standard softmax-based policy gradient can be prone to poor performance in the presence of even the most benign nonstati… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1906.00190v5-abstract-full').style.display = 'inline'; document.getElementById('1906.00190v5-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1906.00190v5-abstract-full" style="display: none;"> Policy gradient and actor-critic algorithms form the basis of many commonly used training techniques in deep reinforcement learning. Using these algorithms in multiagent environments poses problems such as nonstationarity and instability. In this paper, we first demonstrate that standard softmax-based policy gradient can be prone to poor performance in the presence of even the most benign nonstationarity. By contrast, it is known that the replicator dynamics, a well-studied model from evolutionary game theory, eliminates dominated strategies and exhibits convergence of the time-averaged trajectories to interior Nash equilibria in zero-sum games. Thus, using the replicator dynamics as a foundation, we derive an elegant one-line change to policy gradient methods that simply bypasses the gradient step through the softmax, yielding a new algorithm titled Neural Replicator Dynamics (NeuRD). 
NeuRD reduces to the exponential weights/Hedge algorithm in the single-state all-actions case. Additionally, NeuRD has formal equivalence to softmax counterfactual regret minimization, which guarantees convergence in the sequential tabular case. Importantly, our algorithm provides a straightforward way of extending the replicator dynamics to the function approximation setting. Empirical results show that NeuRD quickly adapts to nonstationarities, outperforming policy gradient significantly in both tabular and function approximation settings, when evaluated on the standard imperfect information benchmarks of Kuhn Poker, Leduc Poker, and Goofspiel. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1906.00190v5-abstract-full').style.display = 'none'; document.getElementById('1906.00190v5-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 February, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 June, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2019. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1903.05614">arXiv:1903.05614</a> <span> [<a href="https://arxiv.org/pdf/1903.05614">pdf</a>, <a href="https://arxiv.org/format/1903.05614">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Computing Approximate Equilibria in Sequential Adversarial Games by Exploitability Descent </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lockhart%2C+E">Edward Lockhart</a>, <a href="/search/cs?searchtype=author&query=Lanctot%2C+M">Marc Lanctot</a>, <a href="/search/cs?searchtype=author&query=P%C3%A9rolat%2C+J">Julien P茅rolat</a>, <a href="/search/cs?searchtype=author&query=Lespiau%2C+J">Jean-Baptiste Lespiau</a>, <a href="/search/cs?searchtype=author&query=Morrill%2C+D">Dustin Morrill</a>, <a href="/search/cs?searchtype=author&query=Timbers%2C+F">Finbarr Timbers</a>, <a href="/search/cs?searchtype=author&query=Tuyls%2C+K">Karl Tuyls</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1903.05614v4-abstract-short" style="display: inline;"> In this paper, we present exploitability descent, a new algorithm to compute approximate equilibria in two-player zero-sum extensive-form games with imperfect information, by direct policy optimization against worst-case opponents. 
We prove that when following this optimization, the exploitability of a player's strategy converges asymptotically to zero, and hence when both players employ this opti… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1903.05614v4-abstract-full').style.display = 'inline'; document.getElementById('1903.05614v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1903.05614v4-abstract-full" style="display: none;"> In this paper, we present exploitability descent, a new algorithm to compute approximate equilibria in two-player zero-sum extensive-form games with imperfect information, by direct policy optimization against worst-case opponents. We prove that when following this optimization, the exploitability of a player's strategy converges asymptotically to zero, and hence when both players employ this optimization, the joint policies converge to a Nash equilibrium. Unlike fictitious play (XFP) and counterfactual regret minimization (CFR), our convergence result pertains to the policies being optimized rather than the average policies. Our experiments demonstrate convergence rates comparable to XFP and CFR in four benchmark games in the tabular case. Using function approximation, we find that our algorithm outperforms the tabular version in two of the games, which, to the best of our knowledge, is the first such result in imperfect information games among this class of algorithms. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1903.05614v4-abstract-full').style.display = 'none'; document.getElementById('1903.05614v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 June, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 March, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">IJCAI 2019, 11 pages, 1 figure</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1903.01373">arXiv:1903.01373</a> <span> [<a href="https://arxiv.org/pdf/1903.01373">pdf</a>, <a href="https://arxiv.org/format/1903.01373">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> </div> </div> <p class="title is-5 mathjax"> $伪$-Rank: Multi-Agent Evaluation by Evolution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Omidshafiei%2C+S">Shayegan Omidshafiei</a>, <a href="/search/cs?searchtype=author&query=Papadimitriou%2C+C">Christos Papadimitriou</a>, <a href="/search/cs?searchtype=author&query=Piliouras%2C+G">Georgios Piliouras</a>, <a href="/search/cs?searchtype=author&query=Tuyls%2C+K">Karl Tuyls</a>, <a href="/search/cs?searchtype=author&query=Rowland%2C+M">Mark Rowland</a>, <a href="/search/cs?searchtype=author&query=Lespiau%2C+J">Jean-Baptiste Lespiau</a>, <a href="/search/cs?searchtype=author&query=Czarnecki%2C+W+M">Wojciech M. 

arXiv:1903.01373 (https://arxiv.org/abs/1903.01373)
$\alpha$-Rank: Multi-Agent Evaluation by Evolution
Subjects: cs.MA (Multiagent Systems); cs.GT (Computer Science and Game Theory)
Authors: Shayegan Omidshafiei, Christos Papadimitriou, Georgios Piliouras, Karl Tuyls, Mark Rowland, Jean-Baptiste Lespiau, Wojciech M. Czarnecki, Marc Lanctot, Julien Perolat, Remi Munos
Abstract: We introduce $\alpha$-Rank, a principled evolutionary dynamics methodology for the evaluation and ranking of agents in large-scale multi-agent interactions, grounded in a novel dynamical game-theoretic solution concept called Markov-Conley chains (MCCs). The approach leverages continuous- and discrete-time evolutionary dynamical systems applied to empirical games, and scales tractably in the number of agents, the type of interactions, and the type of empirical games (symmetric and asymmetric). Current models are fundamentally limited in one or more of these dimensions and are not guaranteed to converge to the desired game-theoretic solution concept (typically the Nash equilibrium). $\alpha$-Rank provides a ranking over the set of agents under evaluation and provides insights into their strengths, weaknesses, and long-term dynamics. This is a consequence of the links we establish to the MCC solution concept when the underlying evolutionary model's ranking-intensity parameter, $\alpha$, is chosen to be large, which exactly forms the basis of $\alpha$-Rank. In contrast to the Nash equilibrium, which is a static concept based on fixed points, MCCs are a dynamical solution concept based on the Markov chain formalism, Conley's Fundamental Theorem of Dynamical Systems, and the core ingredients of dynamical systems: fixed points, recurrent sets, periodic orbits, and limit cycles. $\alpha$-Rank runs in polynomial time with respect to the total number of pure strategy profiles, whereas computing a Nash equilibrium for a general-sum game is known to be intractable. We introduce proofs that not only provide a unifying perspective of existing continuous- and discrete-time evolutionary evaluation models, but also reveal the formal underpinnings of the $\alpha$-Rank methodology. We empirically validate the method in several domains including AlphaGo, AlphaZero, MuJoCo Soccer, and Poker.
Submitted 4 October, 2019; v1 submitted 4 March, 2019; originally announced March 2019.
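
The ranking idea (build a Markov chain over strategy profiles whose transitions depend on payoff differences scaled by $\alpha$, then rank by stationary mass) can be caricatured in a few lines. The sketch below is a heavily simplified single-population version with an assumed logistic switching rule; the actual method is defined through fixation probabilities of an evolutionary process over joint profiles and its connection to MCCs, so treat this only as an illustration. The payoff table and $\alpha$ value are assumptions.

```python
import numpy as np

def alpha_rank_single_population(payoff, alpha=10.0):
    """Heavily simplified, single-population ranking in the spirit of alpha-Rank.

    payoff[i, j]: payoff of strategy i against strategy j (one population).
    Builds a Markov chain over pure strategies where a jump i -> j is more likely
    when the mutant j fares better against the incumbent i than the incumbent does
    against itself, with selection intensity alpha, then ranks strategies by the
    chain's stationary distribution.
    """
    n = payoff.shape[0]
    transition = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            if i == j:
                continue
            advantage = payoff[j, i] - payoff[i, i]   # assumed mutant-advantage measure
            transition[i, j] = 1.0 / ((n - 1) * (1.0 + np.exp(-alpha * advantage)))
        transition[i, i] = 1.0 - transition[i].sum()
    # Stationary distribution: eigenvector of the transposed transition matrix for eigenvalue 1.
    w, v = np.linalg.eig(transition.T)
    stat = np.real(v[:, np.argmin(np.abs(w - 1.0))])
    return np.abs(stat) / np.abs(stat).sum()

# Toy usage: one ranking weight per strategy (higher mass = stronger under these dynamics).
M = np.array([[0.0, 1.0, -1.0], [-1.0, 0.0, 2.0], [1.0, -2.0, 0.0]])
print(alpha_rank_single_population(M))
```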
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1903.01373v4-abstract-full').style.display = 'none'; document.getElementById('1903.01373v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 October, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 March, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2019. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1903.00742">arXiv:1903.00742</a> <span> [<a href="https://arxiv.org/pdf/1903.00742">pdf</a>, <a href="https://arxiv.org/format/1903.00742">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neurons and Cognition">q-bio.NC</span> </div> </div> <p class="title is-5 mathjax"> Autocurricula and the Emergence of Innovation from Social Interaction: A Manifesto for Multi-Agent Intelligence Research </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Leibo%2C+J+Z">Joel Z. Leibo</a>, <a href="/search/cs?searchtype=author&query=Hughes%2C+E">Edward Hughes</a>, <a href="/search/cs?searchtype=author&query=Lanctot%2C+M">Marc Lanctot</a>, <a href="/search/cs?searchtype=author&query=Graepel%2C+T">Thore Graepel</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1903.00742v2-abstract-short" style="display: inline;"> Evolution has produced a multi-scale mosaic of interacting adaptive units. Innovations arise when perturbations push parts of the system away from stable equilibria into new regimes where previously well-adapted solutions no longer work. Here we explore the hypothesis that multi-agent systems sometimes display intrinsic dynamics arising from competition and cooperation that provide a naturally eme… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1903.00742v2-abstract-full').style.display = 'inline'; document.getElementById('1903.00742v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1903.00742v2-abstract-full" style="display: none;"> Evolution has produced a multi-scale mosaic of interacting adaptive units. Innovations arise when perturbations push parts of the system away from stable equilibria into new regimes where previously well-adapted solutions no longer work. Here we explore the hypothesis that multi-agent systems sometimes display intrinsic dynamics arising from competition and cooperation that provide a naturally emergent curriculum, which we term an autocurriculum. The solution of one social task often begets new social tasks, continually generating novel challenges, and thereby promoting innovation. 

arXiv:1902.00506 (https://arxiv.org/abs/1902.00506)
The Hanabi Challenge: A New Frontier for AI Research
DOI: 10.1016/j.artint.2019.103216 (https://doi.org/10.1016/j.artint.2019.103216)
Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence); stat.ML (Machine Learning)
Authors: Nolan Bard, Jakob N. Foerster, Sarath Chandar, Neil Burch, Marc Lanctot, H. Francis Song, Emilio Parisotto, Vincent Dumoulin, Subhodeep Moitra, Edward Hughes, Iain Dunning, Shibl Mourad, Hugo Larochelle, Marc G. Bellemare, Michael Bowling
Abstract: From the early days of computing, games have been important testbeds for studying how well machines can do sophisticated decision making. In recent years, machine learning has made dramatic advances with artificial agents reaching superhuman performance in challenge domains like Go, Atari, and some variants of poker. As with their predecessors of chess, checkers, and backgammon, these game domains have driven research by providing sophisticated yet well-defined challenges for artificial intelligence practitioners. We continue this tradition by proposing the game of Hanabi as a new challenge domain with novel problems that arise from its combination of purely cooperative gameplay with two to five players and imperfect information. In particular, we argue that Hanabi elevates reasoning about the beliefs and intentions of other agents to the foreground. We believe developing novel techniques for such theory of mind reasoning will not only be crucial for success in Hanabi, but also in broader collaborative efforts, especially those with human partners. To facilitate future research, we introduce the open-source Hanabi Learning Environment, propose an experimental framework for the research community to evaluate algorithmic advances, and assess the performance of current state-of-the-art techniques.
Submitted 6 December, 2019; v1 submitted 1 February, 2019; originally announced February 2019.
Comments: 32 pages, 5 figures, In Press (Artificial Intelligence)
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">32 pages, 5 figures, In Press (Artificial Intelligence)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1810.09026">arXiv:1810.09026</a> <span> [<a href="https://arxiv.org/pdf/1810.09026">pdf</a>, <a href="https://arxiv.org/format/1810.09026">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Actor-Critic Policy Optimization in Partially Observable Multiagent Environments </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Srinivasan%2C+S">Sriram Srinivasan</a>, <a href="/search/cs?searchtype=author&query=Lanctot%2C+M">Marc Lanctot</a>, <a href="/search/cs?searchtype=author&query=Zambaldi%2C+V">Vinicius Zambaldi</a>, <a href="/search/cs?searchtype=author&query=Perolat%2C+J">Julien Perolat</a>, <a href="/search/cs?searchtype=author&query=Tuyls%2C+K">Karl Tuyls</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">Remi Munos</a>, <a href="/search/cs?searchtype=author&query=Bowling%2C+M">Michael Bowling</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1810.09026v5-abstract-short" style="display: inline;"> Optimization of parameterized policies for reinforcement learning (RL) is an important and challenging problem in artificial intelligence. Among the most common approaches are algorithms based on gradient ascent of a score function representing discounted return. In this paper, we examine the role of these policy gradient and actor-critic algorithms in partially-observable multiagent environments.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1810.09026v5-abstract-full').style.display = 'inline'; document.getElementById('1810.09026v5-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1810.09026v5-abstract-full" style="display: none;"> Optimization of parameterized policies for reinforcement learning (RL) is an important and challenging problem in artificial intelligence. Among the most common approaches are algorithms based on gradient ascent of a score function representing discounted return. In this paper, we examine the role of these policy gradient and actor-critic algorithms in partially-observable multiagent environments. We show several candidate policy update rules and relate them to a foundation of regret minimization and multiagent learning techniques for the one-shot and tabular cases, leading to previously unknown convergence guarantees. 

arXiv:1809.03057 (https://arxiv.org/abs/1809.03057)
Variance Reduction in Monte Carlo Counterfactual Regret Minimization (VR-MCCFR) for Extensive Form Games using Baselines
Subjects: cs.GT (Computer Science and Game Theory); cs.AI (Artificial Intelligence)
Authors: Martin Schmid, Neil Burch, Marc Lanctot, Matej Moravcik, Rudolf Kadlec, Michael Bowling
Abstract: Learning strategies for imperfect information games from samples of interaction is a challenging problem. A common method for this setting, Monte Carlo Counterfactual Regret Minimization (MCCFR), can have slow long-term convergence rates due to high variance. In this paper, we introduce a variance reduction technique (VR-MCCFR) that applies to any sampling variant of MCCFR. Using this technique, per-iteration estimated values and updates are reformulated as a function of sampled values and state-action baselines, similar to their use in policy gradient reinforcement learning. The new formulation allows estimates to be bootstrapped from other estimates within the same episode, propagating the benefits of baselines along the sampled trajectory; the estimates remain unbiased even when bootstrapping from other estimates. Finally, we show that given a perfect baseline, the variance of the value estimates can be reduced to zero. Experimental evaluation shows that VR-MCCFR brings an order of magnitude speedup, while the empirical variance decreases by three orders of magnitude. The decreased variance allows, for the first time, CFR+ to be used with sampling, increasing the speedup to two orders of magnitude.
Submitted 9 September, 2018; originally announced September 2018.
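
The baseline idea can be illustrated with a textbook control-variate estimator: subtract a state-action baseline from the sampled value and keep the baseline's contribution for the unsampled actions, which leaves the estimate unbiased while shrinking its variance when the baseline is accurate. This is a generic illustration, not the paper's extensive-form recursion; the policy, sampling distribution, true values, and baseline are assumptions.

```python
import numpy as np

rng = np.random.default_rng(1)

q_values = np.array([1.0, 3.0, 0.5])     # true action values at some state (assumed)
policy   = np.array([0.2, 0.5, 0.3])     # policy whose expected value we estimate
sampling = np.array([1/3, 1/3, 1/3])     # actions are sampled uniformly
baseline = np.array([0.9, 2.8, 0.6])     # rough baseline estimates of the action values

def sampled_estimate(n, use_baseline):
    """Importance-sampled estimate of the policy's value, with or without a baseline."""
    estimates = []
    for _ in range(n):
        a = rng.choice(3, p=sampling)
        value = q_values[a] + rng.standard_normal()           # noisy sampled return
        if use_baseline:
            # Control variate: keep the baseline everywhere, correct only the sampled action.
            per_action = baseline.copy()
            per_action[a] += (value - baseline[a]) / sampling[a]
        else:
            per_action = np.zeros(3)
            per_action[a] = value / sampling[a]
        estimates.append(policy @ per_action)
    return np.mean(estimates), np.var(estimates)

print("true value:", policy @ q_values)
print("plain    (mean, var):", sampled_estimate(20000, use_baseline=False))
print("baseline (mean, var):", sampled_estimate(20000, use_baseline=True))
```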

arXiv:1804.03980 (https://arxiv.org/abs/1804.03980)
Emergent Communication through Negotiation
Subjects: cs.AI (Artificial Intelligence); cs.CL (Computation and Language); cs.LG (Machine Learning); cs.MA (Multiagent Systems)
Authors: Kris Cao, Angeliki Lazaridou, Marc Lanctot, Joel Z Leibo, Karl Tuyls, Stephen Clark
Abstract: Multi-agent reinforcement learning offers a way to study how communication could emerge in communities of agents needing to solve specific problems. In this paper, we study the emergence of communication in the negotiation environment, a semi-cooperative model of agent interaction. We introduce two communication protocols -- one grounded in the semantics of the game, and one which is a priori ungrounded and is a form of cheap talk. We show that self-interested agents can use the pre-grounded communication channel to negotiate fairly, but are unable to effectively use the ungrounded channel. However, prosocial agents do learn to use cheap talk to find an optimal negotiating strategy, suggesting that cooperation is necessary for language to emerge. We also study communication behaviour in a setting where one agent interacts with agents in a community with different levels of prosociality and show how agent identifiability can aid negotiation.
Submitted 11 April, 2018; originally announced April 2018.
Comments: Published as a conference paper at ICLR 2018

arXiv:1803.06376 (https://arxiv.org/abs/1803.06376)
A Generalised Method for Empirical Game Theoretic Analysis
Subjects: cs.GT (Computer Science and Game Theory); cs.MA (Multiagent Systems)
Authors: Karl Tuyls, Julien Perolat, Marc Lanctot, Joel Z Leibo, Thore Graepel
Abstract: This paper provides theoretical bounds for empirical game theoretical analysis of complex multi-agent interactions. We provide insights into the empirical meta-game, showing that a Nash equilibrium of the meta-game is an approximate Nash equilibrium of the true underlying game. We investigate and show how many data samples are required to obtain a close enough approximation of the underlying game. Additionally, we extend the meta-game analysis methodology to asymmetric games. The state of the art has only considered empirical games in which agents have access to the same strategy sets and the payoff structure is symmetric, implying that agents are interchangeable. Finally, we carry out an empirical illustration of the generalised method in several domains, illustrating the theory and evolutionary dynamics of several versions of the AlphaGo algorithm (symmetric), the dynamics of the Colonel Blotto game played by human players on Facebook (symmetric), and an example of a meta-game in Leduc Poker (asymmetric), generated by the PSRO multi-agent learning algorithm.
Submitted 16 March, 2018; originally announced March 2018.
Comments: will appear at AAMAS'18
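
To make the notion of an empirical meta-game concrete in a generic way (this is not the paper's estimator or its sample-complexity bounds), the sketch below estimates a payoff table from noisy match samples and measures how close a candidate mixture is to equilibrium by its regret, i.e. the gain of the best pure deviation; the ground-truth payoffs, noise level, and sample counts are assumptions.

```python
import numpy as np

rng = np.random.default_rng(2)

# Assumed ground-truth symmetric meta-game over three "agents" (payoff to the row agent).
true_payoff = np.array([[0.0, 0.3, -0.2],
                        [-0.3, 0.0, 0.4],
                        [0.2, -0.4, 0.0]])

def empirical_payoff(n_samples, noise=0.5):
    """Estimate the meta-game payoff table from noisy match outcomes."""
    est = np.zeros_like(true_payoff)
    for i in range(3):
        for j in range(3):
            est[i, j] = np.mean(true_payoff[i, j] + noise * rng.standard_normal(n_samples))
    return est

def regret(payoff, mixture):
    """Gain of the best pure deviation over the mixture's value (zero at a symmetric equilibrium)."""
    deviation_values = payoff @ mixture
    return deviation_values.max() - mixture @ deviation_values

mixture = np.array([1/3, 1/3, 1/3])
for n in (10, 100, 10000):
    print(n, "samples -> regret in empirical game:", round(regret(empirical_payoff(n), mixture), 4))
print("regret in true game:", round(regret(true_payoff, mixture), 4))
```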
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">will appear at AAMAS'18</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1712.01815">arXiv:1712.01815</a> <span> [<a href="https://arxiv.org/pdf/1712.01815">pdf</a>, <a href="https://arxiv.org/format/1712.01815">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Mastering Chess and Shogi by Self-Play with a General Reinforcement Learning Algorithm </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Silver%2C+D">David Silver</a>, <a href="/search/cs?searchtype=author&query=Hubert%2C+T">Thomas Hubert</a>, <a href="/search/cs?searchtype=author&query=Schrittwieser%2C+J">Julian Schrittwieser</a>, <a href="/search/cs?searchtype=author&query=Antonoglou%2C+I">Ioannis Antonoglou</a>, <a href="/search/cs?searchtype=author&query=Lai%2C+M">Matthew Lai</a>, <a href="/search/cs?searchtype=author&query=Guez%2C+A">Arthur Guez</a>, <a href="/search/cs?searchtype=author&query=Lanctot%2C+M">Marc Lanctot</a>, <a href="/search/cs?searchtype=author&query=Sifre%2C+L">Laurent Sifre</a>, <a href="/search/cs?searchtype=author&query=Kumaran%2C+D">Dharshan Kumaran</a>, <a href="/search/cs?searchtype=author&query=Graepel%2C+T">Thore Graepel</a>, <a href="/search/cs?searchtype=author&query=Lillicrap%2C+T">Timothy Lillicrap</a>, <a href="/search/cs?searchtype=author&query=Simonyan%2C+K">Karen Simonyan</a>, <a href="/search/cs?searchtype=author&query=Hassabis%2C+D">Demis Hassabis</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1712.01815v1-abstract-short" style="display: inline;"> The game of chess is the most widely-studied domain in the history of artificial intelligence. The strongest programs are based on a combination of sophisticated search techniques, domain-specific adaptations, and handcrafted evaluation functions that have been refined by human experts over several decades. In contrast, the AlphaGo Zero program recently achieved superhuman performance in the game… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1712.01815v1-abstract-full').style.display = 'inline'; document.getElementById('1712.01815v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1712.01815v1-abstract-full" style="display: none;"> The game of chess is the most widely-studied domain in the history of artificial intelligence. The strongest programs are based on a combination of sophisticated search techniques, domain-specific adaptations, and handcrafted evaluation functions that have been refined by human experts over several decades. In contrast, the AlphaGo Zero program recently achieved superhuman performance in the game of Go, by tabula rasa reinforcement learning from games of self-play. In this paper, we generalise this approach into a single AlphaZero algorithm that can achieve, tabula rasa, superhuman performance in many challenging domains. 

arXiv:1711.05074 (https://arxiv.org/abs/1711.05074)
Symmetric Decomposition of Asymmetric Games
Subjects: cs.GT (Computer Science and Game Theory); cs.MA (Multiagent Systems)
Authors: Karl Tuyls, Julien Perolat, Marc Lanctot, Georg Ostrovski, Rahul Savani, Joel Leibo, Toby Ord, Thore Graepel, Shane Legg
Abstract: We introduce new theoretical insights into two-population asymmetric games allowing for an elegant symmetric decomposition into two single-population symmetric games. Specifically, we show how an asymmetric bimatrix game (A,B) can be decomposed into its symmetric counterparts by envisioning and investigating the payoff tables (A and B) that constitute the asymmetric game as two independent, single-population, symmetric games. We reveal several surprising formal relationships between an asymmetric two-population game and its symmetric single-population counterparts, which facilitate a convenient analysis of the original asymmetric game due to the dimensionality reduction of the decomposition. The main finding reveals that if (x,y) is a Nash equilibrium of an asymmetric game (A,B), this implies that y is a Nash equilibrium of the symmetric counterpart game determined by payoff table A, and x is a Nash equilibrium of the symmetric counterpart game determined by payoff table B. The reverse also holds, and combinations of Nash equilibria of the counterpart games form Nash equilibria of the asymmetric game. We illustrate how these formal relationships aid in identifying and analysing the Nash structure of asymmetric games, by examining the evolutionary dynamics of the simpler counterpart games in several canonical examples.
Submitted 17 January, 2018; v1 submitted 14 November, 2017; originally announced November 2017.
Comments: Published in Scientific Reports, 2018: https://www.nature.com/articles/s41598-018-19194-4
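
The stated relationship between equilibria of the bimatrix game (A,B) and of its two single-population counterpart games can be checked numerically on a small example. The sketch below tests the standard no-profitable-deviation condition; the example game (a battle-of-the-sexes-style bimatrix whose mixed equilibrium is x = (0.6, 0.4), y = (0.4, 0.6)), the counterpart-game orientation, and the tolerance are assumptions rather than the paper's construction.

```python
import numpy as np

def is_symmetric_ne(M, z, tol=1e-9):
    """Is mixture z a symmetric Nash equilibrium of the single-population game with table M?"""
    return (M @ z).max() <= z @ M @ z + tol

def is_bimatrix_ne(A, B, x, y, tol=1e-9):
    """Is (x, y) a Nash equilibrium of the asymmetric bimatrix game (A, B)?"""
    return (A @ y).max() <= x @ A @ y + tol and (x @ B).max() <= x @ B @ y + tol

# Assumed example: battle of the sexes with the mixed equilibrium x = (0.6, 0.4), y = (0.4, 0.6).
A = np.array([[3.0, 0.0], [0.0, 2.0]])   # row player's payoffs
B = np.array([[2.0, 0.0], [0.0, 3.0]])   # column player's payoffs
x, y = np.array([0.6, 0.4]), np.array([0.4, 0.6])

print(is_bimatrix_ne(A, B, x, y))   # (x, y) is an equilibrium of the asymmetric game (A, B)
print(is_symmetric_ne(A, y))        # y is an equilibrium of the counterpart game built from A
print(is_symmetric_ne(B, x))        # x is an equilibrium of the counterpart game built from B
```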

arXiv:1711.00832 (https://arxiv.org/abs/1711.00832)
A Unified Game-Theoretic Approach to Multiagent Reinforcement Learning
Subjects: cs.AI (Artificial Intelligence); cs.GT (Computer Science and Game Theory); cs.LG (Machine Learning); cs.MA (Multiagent Systems)
Authors: Marc Lanctot, Vinicius Zambaldi, Audrunas Gruslys, Angeliki Lazaridou, Karl Tuyls, Julien Perolat, David Silver, Thore Graepel
Abstract: To achieve general intelligence, agents must learn how to interact with others in a shared environment: this is the challenge of multiagent reinforcement learning (MARL). The simplest form is independent reinforcement learning (InRL), where each agent treats its experience as part of its (non-stationary) environment. In this paper, we first observe that policies learned using InRL can overfit to the other agents' policies during training, failing to sufficiently generalize during execution. We introduce a new metric, joint-policy correlation, to quantify this effect. We describe an algorithm for general MARL, based on approximate best responses to mixtures of policies generated using deep reinforcement learning, and empirical game-theoretic analysis to compute meta-strategies for policy selection. The algorithm generalizes previous ones such as InRL, iterated best response, double oracle, and fictitious play. Then, we present a scalable implementation which reduces the memory requirement using decoupled meta-solvers. Finally, we demonstrate the generality of the resulting policies in two partially observable settings: gridworld coordination games and poker.
Submitted 7 November, 2017; v1 submitted 2 November, 2017; originally announced November 2017.
Comments: Camera-ready copy of NIPS 2017 paper, including appendix
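
The described loop (approximate best responses to mixtures over a growing set of policies, with an empirical game and a meta-solver for policy selection) can be sketched on a matrix game. The version below uses exact best responses and a uniform meta-strategy, which makes it a fictitious-play-like caricature of the loop rather than the paper's deep-RL implementation; the game and iteration count are assumptions.

```python
import numpy as np

def population_best_response_loop(payoff, iters=300):
    """Population-based loop in the spirit of the described algorithm, on a zero-sum matrix game.

    Each iteration: (1) form a meta-strategy over the current population (here simply
    uniform, one of the simplest possible meta-solvers), (2) add each player's exact
    best response against the opponent's meta-strategy to the population.
    """
    n_rows, n_cols = payoff.shape
    row_pop, col_pop = [0], [0]     # populations of pure strategies, stored as indices
    for _ in range(iters):
        row_meta = np.bincount(row_pop, minlength=n_rows) / len(row_pop)
        col_meta = np.bincount(col_pop, minlength=n_cols) / len(col_pop)
        row_pop.append(int(np.argmax(payoff @ col_meta)))   # row player's best response
        col_pop.append(int(np.argmin(row_meta @ payoff)))   # column player's best response
    return (np.bincount(row_pop, minlength=n_rows) / len(row_pop),
            np.bincount(col_pop, minlength=n_cols) / len(col_pop))

# Rock-paper-scissors: the meta-strategies drift toward the uniform equilibrium.
rps = np.array([[0, -1, 1], [1, 0, -1], [-1, 1, 0]], dtype=float)
print(population_best_response_loop(rps))
```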
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Camera-ready copy of NIPS 2017 paper, including appendix</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1706.05296">arXiv:1706.05296</a> <span> [<a href="https://arxiv.org/pdf/1706.05296">pdf</a>, <a href="https://arxiv.org/format/1706.05296">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Value-Decomposition Networks For Cooperative Multi-Agent Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sunehag%2C+P">Peter Sunehag</a>, <a href="/search/cs?searchtype=author&query=Lever%2C+G">Guy Lever</a>, <a href="/search/cs?searchtype=author&query=Gruslys%2C+A">Audrunas Gruslys</a>, <a href="/search/cs?searchtype=author&query=Czarnecki%2C+W+M">Wojciech Marian Czarnecki</a>, <a href="/search/cs?searchtype=author&query=Zambaldi%2C+V">Vinicius Zambaldi</a>, <a href="/search/cs?searchtype=author&query=Jaderberg%2C+M">Max Jaderberg</a>, <a href="/search/cs?searchtype=author&query=Lanctot%2C+M">Marc Lanctot</a>, <a href="/search/cs?searchtype=author&query=Sonnerat%2C+N">Nicolas Sonnerat</a>, <a href="/search/cs?searchtype=author&query=Leibo%2C+J+Z">Joel Z. Leibo</a>, <a href="/search/cs?searchtype=author&query=Tuyls%2C+K">Karl Tuyls</a>, <a href="/search/cs?searchtype=author&query=Graepel%2C+T">Thore Graepel</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1706.05296v1-abstract-short" style="display: inline;"> We study the problem of cooperative multi-agent reinforcement learning with a single joint reward signal. This class of learning problems is difficult because of the often large combined action and observation spaces. In the fully centralized and decentralized approaches, we find the problem of spurious rewards and a phenomenon we call the "lazy agent" problem, which arises due to partial observab… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1706.05296v1-abstract-full').style.display = 'inline'; document.getElementById('1706.05296v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1706.05296v1-abstract-full" style="display: none;"> We study the problem of cooperative multi-agent reinforcement learning with a single joint reward signal. This class of learning problems is difficult because of the often large combined action and observation spaces. In the fully centralized and decentralized approaches, we find the problem of spurious rewards and a phenomenon we call the "lazy agent" problem, which arises due to partial observability. We address these problems by training individual agents with a novel value decomposition network architecture, which learns to decompose the team value function into agent-wise value functions. We perform an experimental evaluation across a range of partially-observable multi-agent domains and show that learning such value-decompositions leads to superior results, in particular when combined with weight sharing, role information and information channels. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1706.05296v1-abstract-full').style.display = 'none'; document.getElementById('1706.05296v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 June, 2017; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2017. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.11 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1704.03732">arXiv:1704.03732</a> <span> [<a href="https://arxiv.org/pdf/1704.03732">pdf</a>, <a href="https://arxiv.org/ps/1704.03732">ps</a>, <a href="https://arxiv.org/format/1704.03732">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Deep Q-learning from Demonstrations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hester%2C+T">Todd Hester</a>, <a href="/search/cs?searchtype=author&query=Vecerik%2C+M">Matej Vecerik</a>, <a href="/search/cs?searchtype=author&query=Pietquin%2C+O">Olivier Pietquin</a>, <a href="/search/cs?searchtype=author&query=Lanctot%2C+M">Marc Lanctot</a>, <a href="/search/cs?searchtype=author&query=Schaul%2C+T">Tom Schaul</a>, <a href="/search/cs?searchtype=author&query=Piot%2C+B">Bilal Piot</a>, <a href="/search/cs?searchtype=author&query=Horgan%2C+D">Dan Horgan</a>, <a href="/search/cs?searchtype=author&query=Quan%2C+J">John Quan</a>, <a href="/search/cs?searchtype=author&query=Sendonaris%2C+A">Andrew Sendonaris</a>, <a href="/search/cs?searchtype=author&query=Dulac-Arnold%2C+G">Gabriel Dulac-Arnold</a>, <a href="/search/cs?searchtype=author&query=Osband%2C+I">Ian Osband</a>, <a href="/search/cs?searchtype=author&query=Agapiou%2C+J">John Agapiou</a>, <a href="/search/cs?searchtype=author&query=Leibo%2C+J+Z">Joel Z. Leibo</a>, <a href="/search/cs?searchtype=author&query=Gruslys%2C+A">Audrunas Gruslys</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1704.03732v4-abstract-short" style="display: inline;"> Deep reinforcement learning (RL) has achieved several high profile successes in difficult decision-making problems. However, these algorithms typically require a huge amount of data before they reach reasonable performance. In fact, their performance during learning can be extremely poor. This may be acceptable for a simulator, but it severely limits the applicability of deep RL to many real-world… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1704.03732v4-abstract-full').style.display = 'inline'; document.getElementById('1704.03732v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1704.03732v4-abstract-full" style="display: none;"> Deep reinforcement learning (RL) has achieved several high profile successes in difficult decision-making problems. 
However, these algorithms typically require a huge amount of data before they reach reasonable performance. In fact, their performance during learning can be extremely poor. This may be acceptable for a simulator, but it severely limits the applicability of deep RL to many real-world tasks, where the agent must learn in the real environment. In this paper we study a setting where the agent may access data from previous control of the system. We present an algorithm, Deep Q-learning from Demonstrations (DQfD), that leverages small sets of demonstration data to massively accelerate the learning process even from relatively small amounts of demonstration data and is able to automatically assess the necessary ratio of demonstration data while learning thanks to a prioritized replay mechanism. DQfD works by combining temporal difference updates with supervised classification of the demonstrator's actions. We show that DQfD has better initial performance than Prioritized Dueling Double Deep Q-Networks (PDD DQN) as it starts with better scores on the first million steps on 41 of 42 games and on average it takes PDD DQN 83 million steps to catch up to DQfD's performance. DQfD learns to out-perform the best demonstration given in 14 of 42 games. In addition, DQfD leverages human demonstrations to achieve state-of-the-art results for 11 games. Finally, we show that DQfD performs better than three related algorithms for incorporating demonstration data into DQN. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1704.03732v4-abstract-full').style.display = 'none'; document.getElementById('1704.03732v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2017; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 April, 2017; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2017. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published at AAAI 2018. 
Previously on arxiv as "Learning from Demonstrations for Real World Reinforcement Learning"</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Lanctot%2C+M&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Lanctot%2C+M&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Lanctot%2C+M&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a 
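
The stated combination of temporal-difference updates with supervised classification of the demonstrator's actions can be written as a single combined loss. The sketch below is a simplified, single-transition version (the full method also uses n-step returns, L2 regularisation, and prioritised replay); the margin, loss weight, and toy Q-values are assumptions.

```python
import numpy as np

def dqfd_style_loss(q_values, q_next, action, reward, demo_action=None,
                    gamma=0.99, margin=0.8, lambda_margin=1.0):
    """Simplified single-transition loss mixing a TD term with a large-margin
    supervised term on demonstration transitions (a sketch of the stated idea)."""
    # One-step TD error toward a bootstrapped estimate of the next state's value.
    td_target = reward + gamma * q_next.max()
    td_loss = 0.5 * (td_target - q_values[action]) ** 2
    # Large-margin classification loss: the demonstrated action should beat
    # every other action's value by at least `margin`.
    margin_loss = 0.0
    if demo_action is not None:
        margins = np.full_like(q_values, margin)
        margins[demo_action] = 0.0
        margin_loss = (q_values + margins).max() - q_values[demo_action]
    return td_loss + lambda_margin * margin_loss

# Toy usage: a demonstration transition where the agent currently prefers action 0.
q_now = np.array([1.0, 0.6, 0.2])
q_next = np.array([0.5, 0.9, 0.1])
print(dqfd_style_loss(q_now, q_next, action=1, reward=0.0, demo_action=1))
```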
class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>