Search | arXiv e-print repository
Showing 1–50 of 188 results for author: Precup, D
Searching in archive cs (all-archives search: https://arxiv.org/search/?searchtype=author&query=Precup%2C+D). Results sorted by announcement date, newest first; 50 results per page.
1. arXiv:2411.00119 (https://arxiv.org/abs/2411.00119) [pdf, other]
   Subjects: cs.MA (Multiagent Systems); cs.LG (Machine Learning)
   Title: Soft Condorcet Optimization for Ranking of General Agents
   Authors: Marc Lanctot, Kate Larson, Michael Kaisers, Quentin Berthet, Ian Gemp, Manfred Diaz, Roberto-Rafael Maura-Rivero, Yoram Bachrach, Anna Koop, Doina Precup
   Abstract: A common way to drive progress of AI models and agents is to compare their performance on standardized benchmarks. Comparing the performance of general agents requires aggregating their individual performances across a potentially wide variety of tasks. In this paper, we describe a novel ranking scheme inspired by social choice frameworks, called Soft Condorcet Optimization (SCO), which computes the optimal ranking of agents: the one that makes the fewest mistakes in predicting the agent comparisons in the evaluation data. This optimal ranking is the maximum likelihood estimate when evaluation data (which we view as votes) are interpreted as noisy samples from a ground-truth ranking, a solution to Condorcet's original voting-system criteria. SCO ratings are maximal for Condorcet winners when they exist, which we show is not necessarily true for the classical rating system Elo. We propose three optimization algorithms to compute SCO ratings and evaluate their empirical performance. When serving as an approximation to the Kemeny-Young voting method, SCO rankings are on average 0 to 0.043 away from the optimal ranking in normalized Kendall-tau distance across 865 preference profiles from the PrefLib open ranking archive. In a simulated noisy tournament setting, SCO achieves accurate approximations to the ground-truth ranking and is the best among several baselines when 59% or more of the preference data is missing. Finally, SCO ranking provides the best approximation to the optimal ranking, measured on held-out test sets, in a problem containing 52,958 human players across 31,049 games of the classic seven-player game of Diplomacy.
   Submitted 4 November, 2024; v1 submitted 31 October, 2024; originally announced November 2024.
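   A note for this entry: the "fewest mistakes" objective that SCO smooths can be illustrated as a sigmoid-relaxed count of mispredicted pairwise comparisons, minimized by gradient descent over scalar ratings. The sketch below is a generic illustration of that idea, not the paper's actual objective or any of its three algorithms; the temperature tau and the toy vote format are assumptions.

```python
# Illustrative only: a sigmoid-smoothed ("soft") count of pairwise ranking
# mistakes, minimized by gradient descent over scalar agent ratings.
import numpy as np

def soft_mistakes(ratings, votes, tau=1.0):
    """votes: iterable of (winner, loser) index pairs from evaluation data."""
    m = np.array([ratings[w] - ratings[l] for w, l in votes])
    return float(np.mean(1.0 / (1.0 + np.exp(m / tau))))  # ~1 per mistake

def fit_ratings(n_agents, votes, lr=0.5, steps=2000, tau=1.0):
    ratings = np.zeros(n_agents)
    for _ in range(steps):
        grad = np.zeros(n_agents)
        for w, l in votes:
            s = 1.0 / (1.0 + np.exp((ratings[w] - ratings[l]) / tau))
            g = -s * (1.0 - s) / tau  # d(loss for this vote) / d(margin)
            grad[w] += g
            grad[l] -= g
        ratings -= lr * grad / len(votes)
    return ratings

votes = [(0, 1), (0, 2), (1, 2)]  # agent 0 beats 1 and 2; agent 1 beats 2
ratings = fit_ratings(3, votes)
print(ratings, soft_mistakes(ratings, votes))  # ratings ordered 0 > 1 > 2
```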
2. arXiv:2410.22133 (https://arxiv.org/abs/2410.22133) [pdf, other]
   Subjects: cs.LG (Machine Learning)
   Title: Learning Successor Features the Simple Way
   Authors: Raymond Chua, Arna Ghosh, Christos Kaplanis, Blake A. Richards, Doina Precup
   Abstract: In deep reinforcement learning (RL), it is a challenge to learn representations that do not exhibit catastrophic forgetting or interference in non-stationary environments. Successor Features (SFs) offer a potential solution to this challenge. However, canonical techniques for learning SFs from pixel-level observations often lead to representation collapse, wherein representations degenerate and fail to capture meaningful variations in the data. More recent methods for learning SFs can avoid representation collapse, but they often involve complex losses and multiple learning phases, reducing their efficiency. We introduce a novel, simple method for learning SFs directly from pixels. Our approach uses a combination of a temporal-difference (TD) loss and a reward prediction loss, which together capture the basic mathematical definition of SFs. We show that our approach matches or outperforms existing SF learning techniques in 2D (Minigrid) and 3D (Miniworld) mazes as well as in Mujoco, in both single and continual learning scenarios. Moreover, our technique is efficient and can reach higher levels of performance in less time than other approaches. Our work provides a new, streamlined technique for learning SFs directly from pixel observations, with no pretraining required.
   Submitted 30 October, 2024; v1 submitted 29 October, 2024; originally announced October 2024.
   Comments: Main paper: 10 pages and 8 figures. Accepted at Neural Information Processing Systems (NeurIPS) 2024.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07096v5-abstract-full').style.display = 'none'; document.getElementById('2410.07096v5-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20241118 12h40: incorporated changes of rebuttal</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.02230">arXiv:2410.02230</a> <span> [<a href="https://arxiv.org/pdf/2410.02230">pdf</a>, <a href="https://arxiv.org/format/2410.02230">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Mitigating Downstream Model Risks via Model Provenance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+K">Keyu Wang</a>, <a href="/search/cs?searchtype=author&query=Iranzad%2C+A+N">Abdullah Norozi Iranzad</a>, <a href="/search/cs?searchtype=author&query=Schaffter%2C+S">Scott Schaffter</a>, <a href="/search/cs?searchtype=author&query=Precup%2C+D">Doina Precup</a>, <a href="/search/cs?searchtype=author&query=Lebensold%2C+J">Jonathan Lebensold</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.02230v1-abstract-short" style="display: inline;"> Research and industry are rapidly advancing the innovation and adoption of foundation model-based systems, yet the tools for managing these models have not kept pace. Understanding the provenance and lineage of models is critical for researchers, industry, regulators, and public trust. While model cards and system cards were designed to provide transparency, they fall short in key areas: tracing m… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02230v1-abstract-full').style.display = 'inline'; document.getElementById('2410.02230v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.02230v1-abstract-full" style="display: none;"> Research and industry are rapidly advancing the innovation and adoption of foundation model-based systems, yet the tools for managing these models have not kept pace. Understanding the provenance and lineage of models is critical for researchers, industry, regulators, and public trust. While model cards and system cards were designed to provide transparency, they fall short in key areas: tracing model genealogy, enabling machine readability, offering reliable centralized management systems, and fostering consistent creation incentives. This challenge mirrors issues in software supply chain security, but AI/ML remains at an earlier stage of maturity. 
Addressing these gaps requires industry-standard tooling that can be adopted by foundation model publishers, open-source model innovators, and major distribution platforms. We propose a machine-readable model specification format to simplify the creation of model records, thereby reducing error-prone human effort, notably when a new model inherits most of its design from a foundation model. Our solution explicitly traces relationships between upstream and downstream models, enhancing transparency and traceability across the model lifecycle. To facilitate the adoption, we introduce the unified model record (UMR) repository , a semantically versioned system that automates the publication of model records to multiple formats (PDF, HTML, LaTeX) and provides a hosted web interface (https://modelrecord.com/). This proof of concept aims to set a new standard for managing foundation models, bridging the gap between innovation and responsible model management. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02230v1-abstract-full').style.display = 'none'; document.getElementById('2410.02230v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.00327">arXiv:2410.00327</a> <span> [<a href="https://arxiv.org/pdf/2410.00327">pdf</a>, <a href="https://arxiv.org/format/2410.00327">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> </div> </div> <p class="title is-5 mathjax"> EnzymeFlow: Generating Reaction-specific Enzyme Catalytic Pockets through Flow Matching and Co-Evolutionary Dynamics </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hua%2C+C">Chenqing Hua</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yong Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+D">Dinghuai Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+O">Odin Zhang</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+S">Sitao Luan</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+K+K">Kevin K. Yang</a>, <a href="/search/cs?searchtype=author&query=Wolf%2C+G">Guy Wolf</a>, <a href="/search/cs?searchtype=author&query=Precup%2C+D">Doina Precup</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+S">Shuangjia Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.00327v1-abstract-short" style="display: inline;"> Enzyme design is a critical area in biotechnology, with applications ranging from drug development to synthetic biology. 
Traditional methods for enzyme function prediction or protein binding pocket design often fall short in capturing the dynamic and complex nature of enzyme-substrate interactions, particularly in catalytic processes. To address the challenges, we introduce EnzymeFlow, a generativ… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.00327v1-abstract-full').style.display = 'inline'; document.getElementById('2410.00327v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.00327v1-abstract-full" style="display: none;"> Enzyme design is a critical area in biotechnology, with applications ranging from drug development to synthetic biology. Traditional methods for enzyme function prediction or protein binding pocket design often fall short in capturing the dynamic and complex nature of enzyme-substrate interactions, particularly in catalytic processes. To address the challenges, we introduce EnzymeFlow, a generative model that employs flow matching with hierarchical pre-training and enzyme-reaction co-evolution to generate catalytic pockets for specific substrates and catalytic reactions. Additionally, we introduce a large-scale, curated, and validated dataset of enzyme-reaction pairs, specifically designed for the catalytic pocket generation task, comprising a total of $328,192$ pairs. By incorporating evolutionary dynamics and reaction-specific adaptations, EnzymeFlow becomes a powerful model for designing enzyme pockets, which is capable of catalyzing a wide range of biochemical reactions. Experiments on the new dataset demonstrate the model's effectiveness in designing high-quality, functional enzyme catalytic pockets, paving the way for advancements in enzyme engineering and synthetic biology. We provide EnzymeFlow code at https://github.com/WillHua127/EnzymeFlow with notebook demonstration at https://github.com/WillHua127/EnzymeFlow/blob/main/enzymeflow_demo.ipynb. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.00327v1-abstract-full').style.display = 'none'; document.getElementById('2410.00327v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
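   Flow matching, which EnzymeFlow builds on, has a compact generic objective: draw a random time t, interpolate linearly between noise x0 and data x1, and regress a velocity network onto the constant target x1 - x0. The sketch below shows only that generic objective; EnzymeFlow's hierarchical pre-training and co-evolution conditioning are not represented.

```python
# Generic (linear/rectified) flow matching loss, as in the flow matching
# literature; the family of objectives EnzymeFlow builds on, not the paper's
# enzyme-specific model.
import torch

def flow_matching_loss(velocity_net, x1):
    """x1: [B, d] data batch; velocity_net maps (x_t [B,d], t [B,1]) -> [B,d]."""
    x0 = torch.randn_like(x1)           # noise sample
    t = torch.rand(x1.shape[0], 1)      # interpolation times in [0, 1]
    x_t = (1 - t) * x0 + t * x1         # straight-line interpolant
    v_target = x1 - x0                  # constant target velocity
    return ((velocity_net(x_t, t) - v_target) ** 2).mean()

net = torch.nn.Sequential(torch.nn.Linear(9, 64), torch.nn.ReLU(),
                          torch.nn.Linear(64, 8))
velocity = lambda x, t: net(torch.cat([x, t], dim=1))
print(flow_matching_loss(velocity, torch.randn(4, 8)))
```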
6. arXiv:2409.12917 (https://arxiv.org/abs/2409.12917) [pdf, other]
   Subjects: cs.LG (Machine Learning)
   Title: Training Language Models to Self-Correct via Reinforcement Learning
   Authors: Aviral Kumar, Vincent Zhuang, Rishabh Agarwal, Yi Su, John D Co-Reyes, Avi Singh, Kate Baumli, Shariq Iqbal, Colton Bishop, Rebecca Roelofs, Lei M Zhang, Kay McKinney, Disha Shrivastava, Cosmin Paduraru, George Tucker, Doina Precup, Feryal Behbahani, Aleksandra Faust
   Abstract: Self-correction is a highly desirable capability of large language models (LLMs), yet it has consistently been found to be largely ineffective in modern LLMs. Current methods for training self-correction typically depend on either multiple models, a more advanced model, or additional forms of supervision. To address these shortcomings, we develop SCoRe, a multi-turn online reinforcement learning (RL) approach that significantly improves an LLM's self-correction ability using entirely self-generated data. To build SCoRe, we first show that variants of supervised fine-tuning (SFT) on offline model-generated correction traces are often insufficient for instilling self-correction behavior. In particular, we observe that training via SFT falls prey either to a distribution mismatch between mistakes made by the data-collection policy and the model's own responses, or to behavior collapse, where learning implicitly prefers only a certain mode of correction behavior that is often not effective at self-correction on test problems. SCoRe addresses these challenges by training under the model's own distribution of self-generated correction traces and using appropriate regularization to steer the learning process toward a self-correction behavior that is effective at test time, as opposed to fitting high-reward responses for a given prompt. This regularization process includes an initial phase of multi-turn RL on a base model to generate a policy initialization that is less susceptible to collapse, followed by using a reward bonus to amplify self-correction. With Gemini 1.0 Pro and 1.5 Flash models, we find that SCoRe achieves state-of-the-art self-correction performance, improving the base models' self-correction by 15.6% and 9.1% respectively on MATH and HumanEval.
   Submitted 4 October, 2024; v1 submitted 19 September, 2024; originally announced September 2024.
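   The "reward bonus to amplify self-correction" mentioned above suggests a shaped multi-turn reward of roughly the following form. This is a speculative reading of the abstract, not SCoRe's actual reward definition; the bonus weight alpha and the binary correctness scoring are invented for illustration.

```python
# Speculative illustration of a self-correction reward bonus (not SCoRe's
# actual definition): score the second attempt, plus a bonus proportional
# to the improvement of attempt 2 over attempt 1.
def shaped_reward(correct_first: bool, correct_second: bool,
                  alpha: float = 0.5) -> float:
    base = float(correct_second)
    bonus = alpha * (float(correct_second) - float(correct_first))
    return base + bonus

# Rewarded most for fixing a mistake, penalized for introducing one:
assert shaped_reward(False, True) == 1.5
assert shaped_reward(True, False) == -0.5
assert shaped_reward(True, True) == 1.0
```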
7. arXiv:2408.13659 (https://arxiv.org/abs/2408.13659) [pdf, other]
   Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence); cs.CE (Computational Engineering, Finance, and Science); q-bio.QM (Quantitative Methods)
   Title: ReactZyme: A Benchmark for Enzyme-Reaction Prediction
   Authors: Chenqing Hua, Bozitao Zhong, Sitao Luan, Liang Hong, Guy Wolf, Doina Precup, Shuangjia Zheng
   Abstract: Enzymes, with their specific catalyzed reactions, are necessary for all aspects of life, enabling diverse biological processes and adaptations. Predicting enzyme functions is essential for understanding biological pathways, guiding drug development, enhancing bioproduct yields, and facilitating evolutionary studies. Addressing the inherent complexities, we introduce a new approach to annotating enzymes based on their catalyzed reactions. This method provides detailed insights into specific reactions and is adaptable to newly discovered reactions, diverging from traditional classifications by protein family or expert-derived reaction classes. We employ machine learning algorithms to analyze enzyme reaction datasets, delivering a much more refined view of the functionality of enzymes. Our evaluation leverages the largest enzyme-reaction dataset to date, derived from the SwissProt and Rhea databases with entries up to January 8, 2024. We frame enzyme-reaction prediction as a retrieval problem, aiming to rank enzymes by their catalytic ability for specific reactions. With our model, we can recruit proteins for novel reactions and predict reactions in novel proteins, facilitating enzyme discovery and function annotation (https://github.com/WillHua127/ReactZyme).
   Submitted 30 September, 2024; v1 submitted 24 August, 2024; originally announced August 2024.
   Journal ref: 38th Conference on Neural Information Processing Systems (NeurIPS 2024), Track on Datasets and Benchmarks.

8. arXiv:2407.16602 (https://arxiv.org/abs/2407.16602) [pdf, other]
   Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence); stat.ML (Machine Learning)
   Title: Functional Acceleration for Policy Mirror Descent
   Authors: Veronica Chelu, Doina Precup
   Abstract: We apply functional acceleration to the general Policy Mirror Descent (PMD) family of algorithms, which covers a wide range of novel and fundamental methods in reinforcement learning (RL). Leveraging duality, we propose a momentum-based PMD update. By taking the functional route, our approach is independent of the policy parametrization and applicable to large-scale optimization, covering previous applications of momentum at the level of policy parameters as a special case. We theoretically analyze several properties of this approach and complement the analysis with a numerical ablation study, which serves to illustrate the policy optimization dynamics on the value polytope relative to different algorithmic design choices in this space. We further characterize numerically several features of the problem setting relevant for functional acceleration and, lastly, investigate the impact of approximation on their learning mechanics.
   Submitted 23 July, 2024; originally announced July 2024.
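   For context on the PMD family this entry accelerates: with a KL mirror map, the exact PMD update has the closed multiplicative form pi_{k+1}(a|s) proportional to pi_k(a|s) * exp(eta * Q_k(s, a)). The sketch below implements that standard tabular update; the extrapolated Q-estimate is only a schematic nod to momentum, not the paper's construction.

```python
# Standard tabular PMD step with KL mirror map (softmax policy iteration):
#   pi_{k+1}(a|s) ∝ pi_k(a|s) * exp(eta * Q_k(s, a)).
# The optional extrapolated Q is a schematic nod to functional acceleration,
# not the paper's exact momentum construction.
import numpy as np

def pmd_step(pi, q, q_prev=None, eta=1.0, momentum=0.0):
    """pi, q: [S, A] arrays. Returns the updated policy."""
    q_eff = q + momentum * (q - q_prev) if q_prev is not None else q
    logits = np.log(pi) + eta * q_eff
    logits -= logits.max(axis=1, keepdims=True)   # numerical stability
    new_pi = np.exp(logits)
    return new_pi / new_pi.sum(axis=1, keepdims=True)

pi = np.full((2, 3), 1 / 3)
q = np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]])
print(pmd_step(pi, q, eta=2.0))  # mass shifts toward the greedy actions
```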
9. arXiv:2407.09618 (https://arxiv.org/abs/2407.09618) [pdf, other]
   Subjects: cs.LG (Machine Learning); cs.SI (Social and Information Networks)
   Title: The Heterophilic Graph Learning Handbook: Benchmarks, Models, Theoretical Analysis, Applications and Challenges
   Authors: Sitao Luan, Chenqing Hua, Qincheng Lu, Liheng Ma, Lirong Wu, Xinyu Wang, Minkai Xu, Xiao-Wen Chang, Doina Precup, Rex Ying, Stan Z. Li, Jian Tang, Guy Wolf, Stefanie Jegelka
   Abstract: The homophily principle, i.e., that nodes with the same labels or similar attributes are more likely to be connected, has commonly been believed to be the main reason for the superiority of Graph Neural Networks (GNNs) over traditional Neural Networks (NNs) on graph-structured data, especially on node-level tasks. However, recent work has identified a non-trivial set of datasets where GNN performance relative to NNs is not satisfactory. Heterophily, i.e., low homophily, has been considered the main cause of this empirical observation. Researchers have begun to revisit and re-evaluate most existing graph models, including graph transformers and their variants, in the heterophily scenario across various kinds of graphs, e.g., heterogeneous graphs, temporal graphs, and hypergraphs. Moreover, numerous graph-related applications are found to be closely related to the heterophily problem. In the past few years, considerable effort has been devoted to studying and addressing the heterophily issue. In this survey, we provide a comprehensive review of the latest progress on heterophilic graph learning, including an extensive summary of benchmark datasets and evaluation of homophily metrics on synthetic graphs, meticulous classification of the most recent supervised and unsupervised learning methods, thorough digestion of the theoretical analysis on homophily/heterophily, and broad exploration of heterophily-related applications. Notably, through detailed experiments, we are the first to categorize benchmark heterophilic datasets into three sub-categories: malignant, benign, and ambiguous heterophily. Malignant and ambiguous datasets are identified as the truly challenging datasets for testing the effectiveness of new models on the heterophily challenge. Finally, we propose several challenges and future directions for heterophilic graph representation learning.
   Submitted 12 July, 2024; originally announced July 2024.
   Comments: Suggestions and comments are welcomed at sitao.luan@mail.mcgill.ca!
10. arXiv:2406.12241 (https://arxiv.org/abs/2406.12241) [pdf, other]
   Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence)
   Title: More Efficient Randomized Exploration for Reinforcement Learning via Approximate Sampling
   Authors: Haque Ishfaq, Yixin Tan, Yu Yang, Qingfeng Lan, Jianfeng Lu, A. Rupam Mahmood, Doina Precup, Pan Xu
   Abstract: Thompson sampling (TS) is one of the most popular exploration techniques in reinforcement learning (RL). However, most TS algorithms with theoretical guarantees are difficult to implement and do not generalize to deep RL. While the emerging approximate sampling-based exploration schemes are promising, most existing algorithms are specific to linear Markov Decision Processes (MDPs) with suboptimal regret bounds, or use only the most basic samplers, such as Langevin Monte Carlo. In this work, we propose an algorithmic framework that incorporates different approximate sampling methods with the recently proposed Feel-Good Thompson Sampling (FGTS) approach (Zhang, 2022; Dann et al., 2021), which was previously known to be computationally intractable in general. When applied to linear MDPs, our regret analysis yields the best known dependency of regret on dimensionality, surpassing existing randomized algorithms. Additionally, we provide explicit sampling complexity for each employed sampler. Empirically, we show that in tasks where deep exploration is necessary, our proposed algorithms that combine FGTS and approximate sampling perform significantly better than other strong baselines. On several challenging games from the Atari 57 suite, our algorithms achieve performance that is either better than or on par with other strong baselines from the deep RL literature.
   Submitted 17 June, 2024; originally announced June 2024.
   Comments: First two authors contributed equally. Accepted to the Reinforcement Learning Conference (RLC) 2024.
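   Langevin Monte Carlo, cited above as the most basic approximate sampler, perturbs each gradient step with Gaussian noise so the iterates approximately sample a target distribution instead of collapsing to its mode. A generic sketch of the unadjusted Langevin update, not the paper's FGTS-based algorithms:

```python
# Unadjusted Langevin algorithm:
#   theta_{t+1} = theta_t - eta * grad U(theta_t) + sqrt(2 * eta) * N(0, I).
# Samples approximately from exp(-U); used in RL to sample value/posterior
# parameters for randomized exploration. Generic sketch, not FGTS itself.
import numpy as np

def langevin_sample(grad_u, theta0, eta=1e-2, steps=1000, rng=None):
    rng = np.random.default_rng(0) if rng is None else rng
    theta = np.array(theta0, dtype=float)
    for _ in range(steps):
        noise = rng.standard_normal(theta.shape)
        theta += -eta * grad_u(theta) + np.sqrt(2 * eta) * noise
    return theta

# Target exp(-||theta||^2 / 2), a standard Gaussian, so grad U = theta:
print(langevin_sample(lambda th: th, np.zeros(3)))
```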
<li class="arxiv-result">
<p class="list-title"><a href="https://arxiv.org/abs/2405.18751">arXiv:2405.18751</a> [<a href="https://arxiv.org/pdf/2405.18751">pdf</a>, <a href="https://arxiv.org/format/2405.18751">other</a>]</p>
<div class="tags"><span class="tag">cs.CV (Computer Vision and Pattern Recognition)</span> <span class="tag">cs.AI (Artificial Intelligence)</span></div>
<p class="title is-5 mathjax">On the Limits of Multi-modal Meta-Learning with Auxiliary Task Modulation Using Conditional Batch Normalization</p>
<p class="authors"><span class="search-hit">Authors:</span> Jordi Armengol-Estapé, Vincent Michalski, Ramnath Kumar, Pierre-Luc St-Charles, Doina Precup, Samira Ebrahimi Kahou</p>
<p class="abstract mathjax"><span class="has-text-weight-semibold">Abstract:</span> Few-shot learning aims to learn representations that can tackle novel tasks given a small number of examples. Recent studies show that cross-modal learning can improve representations for few-shot classification. More specifically, language is a rich modality that can be used to guide visual learning. In this work, we experiment with a multi-modal architecture for few-shot learning that consists of three components: a classifier, an auxiliary network, and a bridge network. While the classifier performs the main classification task, the auxiliary network learns to predict language representations from the same input, and the bridge network transforms high-level features of the auxiliary network into modulation parameters for layers of the few-shot classifier using conditional batch normalization. The bridge should encourage a form of lightweight semantic alignment between language and vision which could be useful for the classifier. However, after evaluating the proposed approach on two popular few-shot classification benchmarks we find that a) the improvements do not reproduce across benchmarks, and b) when they do, the improvements are due to the additional compute and parameters introduced by the bridge network. We contribute insights and recommendations for future work in multi-modal meta-learning, especially when using language representations.</p>
<p class="is-size-7"><span class="has-text-weight-semibold">Submitted</span> 30 May, 2024; v1 submitted 29 May, 2024; originally announced May 2024.</p>
</li>
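<li class="code-note">
<p class="is-size-7">A hedged numpy sketch of the conditional batch normalization described above: a bridge predicts per-channel scale and shift from language features and modulates the classifier's activations. The weight shapes and names are illustrative assumptions, not the paper's architecture.</p>
<pre><code class="language-python">
import numpy as np

def conditional_batch_norm(x, lang_feat, W_gamma, W_beta, eps=1e-5):
    """Normalize activations x (batch, channels), then scale and shift with
    parameters predicted from auxiliary language features (the 'bridge')."""
    mu = x.mean(axis=0, keepdims=True)
    var = x.var(axis=0, keepdims=True)
    x_hat = (x - mu) / np.sqrt(var + eps)
    gamma = 1.0 + lang_feat @ W_gamma    # multiplicative modulation per channel
    beta = lang_feat @ W_beta            # additive modulation per channel
    return gamma * x_hat + beta

rng = np.random.default_rng(0)
x = rng.normal(size=(32, 16))            # visual features from the classifier
lang = rng.normal(size=(32, 8))          # features from the auxiliary network
out = conditional_batch_norm(x, lang,
                             rng.normal(size=(8, 16)) * 0.01,
                             rng.normal(size=(8, 16)) * 0.01)
</code></pre>
</li>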
<li class="arxiv-result">
<p class="list-title"><a href="https://arxiv.org/abs/2405.16899">arXiv:2405.16899</a> [<a href="https://arxiv.org/pdf/2405.16899">pdf</a>, <a href="https://arxiv.org/format/2405.16899">other</a>]</p>
<div class="tags"><span class="tag">cs.LG (Machine Learning)</span> <span class="tag">cs.AI (Artificial Intelligence)</span></div>
<p class="title is-5 mathjax">Partial Models for Building Adaptive Model-Based Reinforcement Learning Agents</p>
<p class="authors"><span class="search-hit">Authors:</span> Safa Alver, Ali Rahimi-Kalahroudi, Doina Precup</p>
<p class="abstract mathjax"><span class="has-text-weight-semibold">Abstract:</span> In neuroscience, one of the key behavioral tests for determining whether a subject of study exhibits model-based behavior is to study its adaptiveness to local changes in the environment. In reinforcement learning, however, recent studies have shown that modern model-based agents display poor adaptivity to such changes. The main reason for this is that modern agents are typically designed to improve sample efficiency in single task settings and thus do not take into account the challenges that can arise in other settings. In local adaptation settings, one particularly important challenge is in quickly building and maintaining a sufficiently accurate model after a local change. This is challenging for deep model-based agents as their models and replay buffers are monolithic structures lacking distribution shift handling capabilities. In this study, we show that the conceptually simple idea of partial models can allow deep model-based agents to overcome this challenge and thus allow for building locally adaptive model-based agents. By modeling the different parts of the state space through different models, the agent can not only maintain a model that is accurate across the state space, but it can also quickly adapt it in the presence of a local change in the environment. We demonstrate this by showing that the use of partial models in agents such as deep Dyna-Q, PlaNet and Dreamer can allow for them to effectively adapt to the local changes in their environments.</p>
<p class="is-size-7"><span class="has-text-weight-semibold">Submitted</span> 27 May, 2024; originally announced May 2024.</p>
<p class="comments is-size-7"><span class="has-text-weight-semibold">Comments:</span> Published as a conference paper at CoLLAs 2024</p>
</li>
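<li class="code-note">
<p class="is-size-7">A toy sketch of the partial-model idea: one buffer and one model per region of the state space, so only the affected region is re-fit after a local change. The hash-based routing and class names are hypothetical, not the paper's design.</p>
<pre><code class="language-python">
import numpy as np

class PartialModels:
    """Maintain one dynamics model per region of the state space, so a local
    change in the environment only requires re-fitting the affected model."""
    def __init__(self, n_regions, fit_fn):
        self.buffers = [[] for _ in range(n_regions)]
        self.models = [None] * n_regions
        self.fit_fn = fit_fn

    def region(self, state):
        # Toy routing: hash a coarse discretization of the state.
        return hash(tuple(np.round(state, 1))) % len(self.models)

    def add(self, state, action, next_state):
        self.buffers[self.region(state)].append((state, action, next_state))

    def refit(self, region_id):
        # Only the buffer/model of the changed region is rebuilt.
        self.models[region_id] = self.fit_fn(self.buffers[region_id])

pm = PartialModels(n_regions=8, fit_fn=lambda buf: list(buf))  # trivial 'model'
pm.add(np.array([0.1, 0.2]), 0, np.array([0.2, 0.2]))
pm.refit(pm.region(np.array([0.1, 0.2])))
</code></pre>
</li>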
<li class="arxiv-result">
<p class="list-title"><a href="https://arxiv.org/abs/2405.07838">arXiv:2405.07838</a> [<a href="https://arxiv.org/pdf/2405.07838">pdf</a>, <a href="https://arxiv.org/format/2405.07838">other</a>]</p>
<div class="tags"><span class="tag">cs.LG (Machine Learning)</span> <span class="tag">cs.AI (Artificial Intelligence)</span></div>
<p class="title is-5 mathjax">Adaptive Exploration for Data-Efficient General Value Function Evaluations</p>
<p class="authors"><span class="search-hit">Authors:</span> Arushi Jain, Josiah P. Hanna, Doina Precup</p>
<p class="abstract mathjax"><span class="has-text-weight-semibold">Abstract:</span> General Value Functions (GVFs) (Sutton et al., 2011) represent predictive knowledge in reinforcement learning. Each GVF computes the expected return for a given policy, based on a unique reward. Existing methods relying on fixed behavior policies or pre-collected data often face data efficiency issues when learning multiple GVFs in parallel using off-policy methods. To address this, we introduce GVFExplorer, which adaptively learns a single behavior policy that efficiently collects data for evaluating multiple GVFs in parallel. Our method optimizes the behavior policy by minimizing the total variance in return across GVFs, thereby reducing the required environmental interactions. We use an existing temporal-difference-style variance estimator to approximate the return variance. We prove that each behavior policy update decreases the overall mean squared error in GVF predictions. We empirically show our method's performance in tabular and nonlinear function approximation settings, including Mujoco environments, with stationary and non-stationary reward signals, optimizing data usage and reducing prediction errors across multiple GVFs.</p>
<p class="is-size-7"><span class="has-text-weight-semibold">Submitted</span> 13 October, 2024; v1 submitted 13 May, 2024; originally announced May 2024.</p>
<p class="comments is-size-7"><span class="has-text-weight-semibold">Comments:</span> 26 pages, 16 figures, Accepted in NeurIPS 2024 Conference</p>
</li>
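<li class="code-note">
<p class="is-size-7">An illustrative sketch of combining several GVFs' target policies into one behavior policy, weighted by estimated return variance so that more data is collected where predictions are most uncertain. The paper derives its behavior policy from a variance-minimization objective; this simplified mixture only gestures at that idea.</p>
<pre><code class="language-python">
import numpy as np

def behavior_policy(action_probs, variances):
    """Mix the target policies of several GVFs, weighting each GVF by its
    (TD-style) return-variance estimate."""
    w = np.sqrt(np.asarray(variances, dtype=float))
    w = w / w.sum()
    mixed = np.einsum("g,ga->a", w, np.asarray(action_probs, dtype=float))
    return mixed / mixed.sum()

probs = [[0.7, 0.3], [0.2, 0.8]]      # target policies of two GVFs
var = [4.0, 1.0]                      # variance estimates per GVF
print(behavior_policy(probs, var))    # biased toward the high-variance GVF
</code></pre>
</li>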
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">26 pages, 16 figures, Accepted in NeurIPS 2024 Conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.01616">arXiv:2405.01616</a> <span> [<a href="https://arxiv.org/pdf/2405.01616">pdf</a>, <a href="https://arxiv.org/format/2405.01616">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Biomolecules">q-bio.BM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Generative Active Learning for the Search of Small-molecule Protein Binders </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Korablyov%2C+M">Maksym Korablyov</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+C">Cheng-Hao Liu</a>, <a href="/search/cs?searchtype=author&query=Jain%2C+M">Moksh Jain</a>, <a href="/search/cs?searchtype=author&query=van+der+Sloot%2C+A+M">Almer M. van der Sloot</a>, <a href="/search/cs?searchtype=author&query=Jolicoeur%2C+E">Eric Jolicoeur</a>, <a href="/search/cs?searchtype=author&query=Ruediger%2C+E">Edward Ruediger</a>, <a href="/search/cs?searchtype=author&query=Nica%2C+A+C">Andrei Cristian Nica</a>, <a href="/search/cs?searchtype=author&query=Bengio%2C+E">Emmanuel Bengio</a>, <a href="/search/cs?searchtype=author&query=Lapchevskyi%2C+K">Kostiantyn Lapchevskyi</a>, <a href="/search/cs?searchtype=author&query=St-Cyr%2C+D">Daniel St-Cyr</a>, <a href="/search/cs?searchtype=author&query=Schuetz%2C+D+A">Doris Alexandra Schuetz</a>, <a href="/search/cs?searchtype=author&query=Butoi%2C+V+I">Victor Ion Butoi</a>, <a href="/search/cs?searchtype=author&query=Rector-Brooks%2C+J">Jarrid Rector-Brooks</a>, <a href="/search/cs?searchtype=author&query=Blackburn%2C+S">Simon Blackburn</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+L">Leo Feng</a>, <a href="/search/cs?searchtype=author&query=Nekoei%2C+H">Hadi Nekoei</a>, <a href="/search/cs?searchtype=author&query=Gottipati%2C+S">SaiKrishna Gottipati</a>, <a href="/search/cs?searchtype=author&query=Vijayan%2C+P">Priyesh Vijayan</a>, <a href="/search/cs?searchtype=author&query=Gupta%2C+P">Prateek Gupta</a>, <a href="/search/cs?searchtype=author&query=Ramp%C3%A1%C5%A1ek%2C+L">Ladislav Ramp谩拧ek</a>, <a href="/search/cs?searchtype=author&query=Avancha%2C+S">Sasikanth Avancha</a>, <a href="/search/cs?searchtype=author&query=Bacon%2C+P">Pierre-Luc Bacon</a>, <a href="/search/cs?searchtype=author&query=Hamilton%2C+W+L">William L. Hamilton</a>, <a href="/search/cs?searchtype=author&query=Paige%2C+B">Brooks Paige</a>, <a href="/search/cs?searchtype=author&query=Misra%2C+S">Sanchit Misra</a> , et al. (9 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.01616v1-abstract-short" style="display: inline;"> Despite substantial progress in machine learning for scientific discovery in recent years, truly de novo design of small molecules which exhibit a property of interest remains a significant challenge. 
We introduce LambdaZero, a generative active learning approach to search for synthesizable molecules. Powered by deep reinforcement learning, LambdaZero learns to search over the vast space of molecu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.01616v1-abstract-full').style.display = 'inline'; document.getElementById('2405.01616v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.01616v1-abstract-full" style="display: none;"> Despite substantial progress in machine learning for scientific discovery in recent years, truly de novo design of small molecules which exhibit a property of interest remains a significant challenge. We introduce LambdaZero, a generative active learning approach to search for synthesizable molecules. Powered by deep reinforcement learning, LambdaZero learns to search over the vast space of molecules to discover candidates with a desired property. We apply LambdaZero with molecular docking to design novel small molecules that inhibit the enzyme soluble Epoxide Hydrolase 2 (sEH), while enforcing constraints on synthesizability and drug-likeliness. LambdaZero provides an exponential speedup in terms of the number of calls to the expensive molecular docking oracle, and LambdaZero de novo designed molecules reach docking scores that would otherwise require the virtual screening of a hundred billion molecules. Importantly, LambdaZero discovers novel scaffolds of synthesizable, drug-like inhibitors for sEH. In in vitro experimental validation, a series of ligands from a generated quinazoline-based scaffold were synthesized, and the lead inhibitor N-(4,6-di(pyrrolidin-1-yl)quinazolin-2-yl)-N-methylbenzamide (UM0152893) displayed sub-micromolar enzyme inhibition of sEH. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.01616v1-abstract-full').style.display = 'none'; document.getElementById('2405.01616v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. 
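<li class="code-note">
<p class="is-size-7">A generic generative-active-learning loop in the spirit of the approach above: a cheap learned proxy ranks proposed candidates and only the top few are sent to the expensive oracle (e.g. docking), whose labels then retrain the proxy. The propose/oracle/proxy functions are stand-ins, not LambdaZero's components.</p>
<pre><code class="language-python">
import numpy as np

def active_search(propose, oracle, proxy_fit, rounds=10, batch=64, top=8):
    """Each round: propose candidates, rank them with the current proxy,
    spend the expensive oracle only on the most promising few, refit proxy."""
    data = []
    proxy = lambda xs: np.zeros(len(xs))        # uninformed initial proxy
    for _ in range(rounds):
        cands = propose(batch)
        ranked = sorted(cands, key=lambda x: -proxy([x])[0])
        for x in ranked[:top]:
            data.append((x, oracle(x)))         # few expensive oracle calls
        proxy = proxy_fit(data)
    return max(data, key=lambda xy: xy[1])

def nn_proxy(data):
    """Tiny 1-nearest-neighbour proxy over 1-D stand-in 'molecules'."""
    xs = np.array([x for x, _ in data]); ys = np.array([s for _, s in data])
    return lambda q: np.array([ys[np.argmin(np.abs(xs - qi))] for qi in q])

rng = np.random.default_rng(0)
best = active_search(propose=lambda k: list(rng.normal(size=k)),
                     oracle=lambda x: -(x - 1.0) ** 2,   # stand-in 'docking'
                     proxy_fit=nn_proxy)
print(best)
</code></pre>
</li>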
<li class="arxiv-result">
<p class="list-title"><a href="https://arxiv.org/abs/2403.11574">arXiv:2403.11574</a> [<a href="https://arxiv.org/pdf/2403.11574">pdf</a>, <a href="https://arxiv.org/format/2403.11574">other</a>]</p>
<div class="tags"><span class="tag">cs.LG (Machine Learning)</span></div>
<p class="title is-5 mathjax">Offline Multitask Representation Learning for Reinforcement Learning</p>
<p class="authors"><span class="search-hit">Authors:</span> Haque Ishfaq, Thanh Nguyen-Tang, Songtao Feng, Raman Arora, Mengdi Wang, Ming Yin, Doina Precup</p>
<p class="abstract mathjax"><span class="has-text-weight-semibold">Abstract:</span> We study offline multitask representation learning in reinforcement learning (RL), where a learner is provided with an offline dataset from different tasks that share a common representation and is asked to learn the shared representation. We theoretically investigate offline multitask low-rank RL, and propose a new algorithm called MORL for offline multitask representation learning. Furthermore, we examine downstream RL in reward-free, offline and online scenarios, where a new task is introduced to the agent that shares the same representation as the upstream offline tasks. Our theoretical results demonstrate the benefits of using the learned representation from the upstream offline task instead of directly learning the representation of the low-rank model.</p>
<p class="is-size-7"><span class="has-text-weight-semibold">Submitted</span> 31 October, 2024; v1 submitted 18 March, 2024; originally announced March 2024.</p>
<p class="comments is-size-7"><span class="has-text-weight-semibold">Comments:</span> Accepted to 38th Conference on Neural Information Processing Systems (NeurIPS 2024)</p>
</li>
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to 38th Conference on Neural Information Processing Systems (NeurIPS 2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.10309">arXiv:2402.10309</a> <span> [<a href="https://arxiv.org/pdf/2402.10309">pdf</a>, <a href="https://arxiv.org/format/2402.10309">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Discrete Probabilistic Inference as Control in Multi-path Environments </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Deleu%2C+T">Tristan Deleu</a>, <a href="/search/cs?searchtype=author&query=Nouri%2C+P">Padideh Nouri</a>, <a href="/search/cs?searchtype=author&query=Malkin%2C+N">Nikolay Malkin</a>, <a href="/search/cs?searchtype=author&query=Precup%2C+D">Doina Precup</a>, <a href="/search/cs?searchtype=author&query=Bengio%2C+Y">Yoshua Bengio</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.10309v2-abstract-short" style="display: inline;"> We consider the problem of sampling from a discrete and structured distribution as a sequential decision problem, where the objective is to find a stochastic policy such that objects are sampled at the end of this sequential process proportionally to some predefined reward. While we could use maximum entropy Reinforcement Learning (MaxEnt RL) to solve this problem for some distributions, it has be… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.10309v2-abstract-full').style.display = 'inline'; document.getElementById('2402.10309v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.10309v2-abstract-full" style="display: none;"> We consider the problem of sampling from a discrete and structured distribution as a sequential decision problem, where the objective is to find a stochastic policy such that objects are sampled at the end of this sequential process proportionally to some predefined reward. While we could use maximum entropy Reinforcement Learning (MaxEnt RL) to solve this problem for some distributions, it has been shown that in general, the distribution over states induced by the optimal policy may be biased in cases where there are multiple ways to generate the same object. To address this issue, Generative Flow Networks (GFlowNets) learn a stochastic policy that samples objects proportionally to their reward by approximately enforcing a conservation of flows across the whole Markov Decision Process (MDP). In this paper, we extend recent methods correcting the reward in order to guarantee that the marginal distribution induced by the optimal MaxEnt RL policy is proportional to the original reward, regardless of the structure of the underlying MDP. We also prove that some flow-matching objectives found in the GFlowNet literature are in fact equivalent to well-established MaxEnt RL algorithms with a corrected reward. 
Finally, we study empirically the performance of multiple MaxEnt RL and GFlowNet algorithms on multiple problems involving sampling from discrete distributions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.10309v2-abstract-full').style.display = 'none'; document.getElementById('2402.10309v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.08609">arXiv:2402.08609</a> <span> [<a href="https://arxiv.org/pdf/2402.08609">pdf</a>, <a href="https://arxiv.org/format/2402.08609">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Mixtures of Experts Unlock Parameter Scaling for Deep RL </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Obando-Ceron%2C+J">Johan Obando-Ceron</a>, <a href="/search/cs?searchtype=author&query=Sokar%2C+G">Ghada Sokar</a>, <a href="/search/cs?searchtype=author&query=Willi%2C+T">Timon Willi</a>, <a href="/search/cs?searchtype=author&query=Lyle%2C+C">Clare Lyle</a>, <a href="/search/cs?searchtype=author&query=Farebrother%2C+J">Jesse Farebrother</a>, <a href="/search/cs?searchtype=author&query=Foerster%2C+J">Jakob Foerster</a>, <a href="/search/cs?searchtype=author&query=Dziugaite%2C+G+K">Gintare Karolina Dziugaite</a>, <a href="/search/cs?searchtype=author&query=Precup%2C+D">Doina Precup</a>, <a href="/search/cs?searchtype=author&query=Castro%2C+P+S">Pablo Samuel Castro</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.08609v3-abstract-short" style="display: inline;"> The recent rapid progress in (self) supervised learning models is in large part predicted by empirical scaling laws: a model's performance scales proportionally to its size. Analogous scaling laws remain elusive for reinforcement learning domains, however, where increasing the parameter count of a model often hurts its final performance. In this paper, we demonstrate that incorporating Mixture-of-… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.08609v3-abstract-full').style.display = 'inline'; document.getElementById('2402.08609v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.08609v3-abstract-full" style="display: none;"> The recent rapid progress in (self) supervised learning models is in large part predicted by empirical scaling laws: a model's performance scales proportionally to its size. Analogous scaling laws remain elusive for reinforcement learning domains, however, where increasing the parameter count of a model often hurts its final performance. 
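<li class="code-note">
<p class="is-size-7">A worked toy example of the MaxEnt RL view: soft value iteration on a small DAG, where the soft-optimal policy samples terminal objects proportionally to their reward. On a tree (as below) this is exact; the paper's reward correction is what makes the property hold in general multi-path MDPs. All names are illustrative.</p>
<pre><code class="language-python">
import numpy as np

def make_soft_value(children, log_reward):
    """MaxEnt RL on a DAG with zero intermediate rewards:
    V(s) = log sum_{s' in children(s)} exp(V(s')), V(terminal) = log R.
    The soft-optimal policy is pi(s'|s) = exp(V(s') - V(s))."""
    cache = {}
    def V(s):
        if s not in cache:
            kids = children[s]
            cache[s] = (log_reward[s] if not kids
                        else np.logaddexp.reduce([V(c) for c in kids]))
        return cache[s]
    return V

children = {"s0": ["x1", "x2"], "x1": [], "x2": []}
V = make_soft_value(children, {"x1": np.log(1.0), "x2": np.log(3.0)})
print(np.exp(V("x1") - V("s0")))   # pi(x1|s0) = 0.25, proportional to reward
</code></pre>
</li>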
<li class="arxiv-result">
<p class="list-title"><a href="https://arxiv.org/abs/2402.08609">arXiv:2402.08609</a> [<a href="https://arxiv.org/pdf/2402.08609">pdf</a>, <a href="https://arxiv.org/format/2402.08609">other</a>]</p>
<div class="tags"><span class="tag">cs.LG (Machine Learning)</span> <span class="tag">cs.AI (Artificial Intelligence)</span></div>
<p class="title is-5 mathjax">Mixtures of Experts Unlock Parameter Scaling for Deep RL</p>
<p class="authors"><span class="search-hit">Authors:</span> Johan Obando-Ceron, Ghada Sokar, Timon Willi, Clare Lyle, Jesse Farebrother, Jakob Foerster, Gintare Karolina Dziugaite, Doina Precup, Pablo Samuel Castro</p>
<p class="abstract mathjax"><span class="has-text-weight-semibold">Abstract:</span> The recent rapid progress in (self) supervised learning models is in large part predicted by empirical scaling laws: a model's performance scales proportionally to its size. Analogous scaling laws remain elusive for reinforcement learning domains, however, where increasing the parameter count of a model often hurts its final performance. In this paper, we demonstrate that incorporating Mixture-of-Expert (MoE) modules, and in particular Soft MoEs (Puigcerver et al., 2023), into value-based networks results in more parameter-scalable models, evidenced by substantial performance increases across a variety of training regimes and model sizes. This work thus provides strong empirical evidence towards developing scaling laws for reinforcement learning.</p>
<p class="is-size-7"><span class="has-text-weight-semibold">Submitted</span> 26 June, 2024; v1 submitted 13 February, 2024; originally announced February 2024.</p>
</li>
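<li class="code-note">
<p class="is-size-7">A compact numpy sketch of a Soft MoE layer (after Puigcerver et al., 2023): slots are convex combinations of tokens, experts process slots, and every token receives a convex combination of expert outputs, with no hard routing. The shapes and the one-slot-per-expert choice are illustrative simplifications.</p>
<pre><code class="language-python">
import numpy as np

def softmax(z, axis=-1):
    z = z - z.max(axis=axis, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=axis, keepdims=True)

def soft_moe(X, Phi, experts):
    """X: (tokens, dim); Phi: (dim, slots); one expert per slot here."""
    logits = X @ Phi                      # (tokens, slots)
    D = softmax(logits, axis=0)           # dispatch weights over tokens
    C = softmax(logits, axis=1)           # combine weights over slots
    slots = D.T @ X                       # each slot: soft mix of tokens
    outs = np.stack([experts[i](slots[i]) for i in range(len(experts))])
    return C @ outs                       # (tokens, dim)

rng = np.random.default_rng(0)
X = rng.normal(size=(10, 4))              # token representations
Phi = rng.normal(size=(4, 2))             # slot parameters
experts = [lambda s: np.tanh(s), lambda s: np.maximum(s, 0.0)]
Y = soft_moe(X, Phi, experts)
</code></pre>
</li>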
<li class="arxiv-result">
<p class="list-title"><a href="https://arxiv.org/abs/2402.06137">arXiv:2402.06137</a> [<a href="https://arxiv.org/pdf/2402.06137">pdf</a>, <a href="https://arxiv.org/format/2402.06137">other</a>]</p>
<div class="tags"><span class="tag">cs.LG (Machine Learning)</span> <span class="tag">cs.CR (Cryptography and Security)</span></div>
<p class="title is-5 mathjax">On the Privacy of Selection Mechanisms with Gaussian Noise</p>
<p class="authors"><span class="search-hit">Authors:</span> Jonathan Lebensold, Doina Precup, Borja Balle</p>
<p class="abstract mathjax"><span class="has-text-weight-semibold">Abstract:</span> Report Noisy Max and Above Threshold are two classical differentially private (DP) selection mechanisms. Their output is obtained by adding noise to a sequence of low-sensitivity queries and reporting the identity of the query whose (noisy) answer satisfies a certain condition. Pure DP guarantees for these mechanisms are easy to obtain when Laplace noise is added to the queries. On the other hand, when instantiated using Gaussian noise, standard analyses only yield approximate DP guarantees despite the fact that the outputs of these mechanisms lie in a discrete space. In this work, we revisit the analysis of Report Noisy Max and Above Threshold with Gaussian noise and show that, under the additional assumption that the underlying queries are bounded, it is possible to provide pure ex-ante DP bounds for Report Noisy Max and pure ex-post DP bounds for Above Threshold. The resulting bounds are tight and depend on closed-form expressions that can be numerically evaluated using standard methods. Empirically we find these lead to tighter privacy accounting in the high privacy, low data regime. Further, we propose a simple privacy filter for composing pure ex-post DP guarantees, and use it to derive a fully adaptive Gaussian Sparse Vector Technique mechanism. Finally, we provide experiments on mobility and energy consumption datasets demonstrating that our Sparse Vector Technique is practically competitive with previous approaches and requires less hyper-parameter tuning.</p>
<p class="is-size-7"><span class="has-text-weight-semibold">Submitted</span> 21 March, 2024; v1 submitted 8 February, 2024; originally announced February 2024.</p>
<p class="comments is-size-7"><span class="has-text-weight-semibold">Comments:</span> AISTATS 2024</p>
</li>
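<li class="code-note">
<p class="is-size-7">For reference, the Report Noisy Max mechanism itself is only a few lines; a sketch with Gaussian noise follows. Note that the paper's pure ex-ante DP bounds additionally assume the queries are bounded, which this snippet does not enforce.</p>
<pre><code class="language-python">
import numpy as np

def report_noisy_max(query_answers, sigma, rng=None):
    """Add Gaussian noise to each low-sensitivity query answer and release
    only the index of the largest noisy value (never the values)."""
    rng = rng or np.random.default_rng()
    answers = np.asarray(query_answers, dtype=float)
    noisy = answers + rng.normal(scale=sigma, size=len(answers))
    return int(np.argmax(noisy))

# Illustrative: pick the most common item without releasing the counts.
counts = [120, 85, 133, 90]
print(report_noisy_max(counts, sigma=10.0, rng=np.random.default_rng(7)))
</code></pre>
</li>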
<li class="arxiv-result">
<p class="list-title"><a href="https://arxiv.org/abs/2402.05234">arXiv:2402.05234</a> [<a href="https://arxiv.org/pdf/2402.05234">pdf</a>, <a href="https://arxiv.org/format/2402.05234">other</a>]</p>
<div class="tags"><span class="tag">cs.LG (Machine Learning)</span></div>
<p class="title is-5 mathjax">QGFN: Controllable Greediness with Action Values</p>
<p class="authors"><span class="search-hit">Authors:</span> Elaine Lau, Stephen Zhewen Lu, Ling Pan, Doina Precup, Emmanuel Bengio</p>
<p class="abstract mathjax"><span class="has-text-weight-semibold">Abstract:</span> Generative Flow Networks (GFlowNets; GFNs) are a family of energy-based generative methods for combinatorial objects, capable of generating diverse and high-utility samples. However, consistently biasing GFNs towards producing high-utility samples is non-trivial. In this work, we leverage connections between GFNs and reinforcement learning (RL) and propose to combine the GFN policy with an action-value estimate, $Q$, to create greedier sampling policies which can be controlled by a mixing parameter. We show that several variants of the proposed method, QGFN, are able to improve on the number of high-reward samples generated in a variety of tasks without sacrificing diversity.</p>
<p class="is-size-7"><span class="has-text-weight-semibold">Submitted</span> 1 November, 2024; v1 submitted 7 February, 2024; originally announced February 2024.</p>
<p class="comments is-size-7"><span class="has-text-weight-semibold">Comments:</span> Accepted by 38th Conference on Neural Information Processing Systems (NeurIPS 2024)</p>
</li>
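<li class="code-note">
<p class="is-size-7">One plausible reading of the mixing idea in code (a greedy-mixture variant; the paper studies several variants): follow the GFlowNet policy by default, but take the argmax action under the action-value estimate $Q$ with probability <code>lam</code>. The function and parameter names are illustrative.</p>
<pre><code class="language-python">
import numpy as np

def qgfn_style_action(gfn_probs, q_values, lam, rng):
    """With probability lam act greedily w.r.t. Q; otherwise sample from the
    GFlowNet policy. lam=0 recovers the GFN, lam=1 is fully greedy."""
    if rng.random() > lam:
        return int(rng.choice(len(gfn_probs), p=gfn_probs))
    return int(np.argmax(q_values))

rng = np.random.default_rng(0)
a = qgfn_style_action(np.array([0.5, 0.3, 0.2]),   # GFN policy
                      np.array([0.1, 2.0, 0.3]),   # Q estimates
                      lam=0.5, rng=rng)
print(a)
</code></pre>
</li>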
<li class="arxiv-result">
<p class="list-title"><a href="https://arxiv.org/abs/2402.04764">arXiv:2402.04764</a> [<a href="https://arxiv.org/pdf/2402.04764">pdf</a>, <a href="https://arxiv.org/format/2402.04764">other</a>]</p>
<div class="tags"><span class="tag">cs.LG (Machine Learning)</span></div>
<p class="title is-5 mathjax">Code as Reward: Empowering Reinforcement Learning with VLMs</p>
<p class="authors"><span class="search-hit">Authors:</span> David Venuto, Sami Nur Islam, Martin Klissarov, Doina Precup, Sherry Yang, Ankit Anand</p>
<p class="abstract mathjax"><span class="has-text-weight-semibold">Abstract:</span> Pre-trained Vision-Language Models (VLMs) are able to understand visual concepts, describe and decompose complex tasks into sub-tasks, and provide feedback on task completion. In this paper, we aim to leverage these capabilities to support the training of reinforcement learning (RL) agents. In principle, VLMs are well suited for this purpose, as they can naturally analyze image-based observations and provide feedback (reward) on learning progress. However, inference in VLMs is computationally expensive, so querying them frequently to compute rewards would significantly slow down the training of an RL agent. To address this challenge, we propose a framework named Code as Reward (VLM-CaR). VLM-CaR produces dense reward functions from VLMs through code generation, thereby significantly reducing the computational burden of querying the VLM directly. We show that the dense rewards generated through our approach are very accurate across a diverse set of discrete and continuous environments, and can be more effective in training RL policies than the original sparse environment rewards.</p>
<p class="is-size-7"><span class="has-text-weight-semibold">Submitted</span> 7 February, 2024; originally announced February 2024.</p>
</li>
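<li class="code-note">
<p class="is-size-7">A hypothetical sketch of the code-as-reward pattern: the VLM is queried once to emit a reward program, which is then evaluated cheaply at every step instead of querying the VLM. The reward function shown, the observation keys, and the gym-style (4-tuple) wrapper are all invented for illustration; the paper's VLM-CaR also verifies generated programs, which is omitted here.</p>
<pre><code class="language-python">
import numpy as np

def vlm_generated_reward(obs):
    """Example of the kind of dense reward program a VLM might emit for a
    reaching task: negative distance between effector and target positions.
    (Hypothetical; in the real pipeline this code comes from the VLM.)"""
    return -float(np.linalg.norm(obs["effector_xy"] - obs["target_xy"]))

class CodedRewardEnv:
    """Wrapper replacing the sparse native reward with the dense,
    code-generated one; assumes a gym-style step() returning a 4-tuple."""
    def __init__(self, env, reward_fn):
        self.env, self.reward_fn = env, reward_fn
    def step(self, action):
        obs, _, done, info = self.env.step(action)
        return obs, self.reward_fn(obs), done, info
</code></pre>
</li>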
<li class="arxiv-result">
<p class="list-title"><a href="https://arxiv.org/abs/2402.03675">arXiv:2402.03675</a> [<a href="https://arxiv.org/pdf/2402.03675">pdf</a>, <a href="https://arxiv.org/format/2402.03675">other</a>]</p>
<div class="tags"><span class="tag">q-bio.BM (Biomolecules)</span> <span class="tag">cs.AI (Artificial Intelligence)</span> <span class="tag">cs.CE (Computational Engineering, Finance, and Science)</span> <span class="tag">cs.LG (Machine Learning)</span></div>
<p class="title is-5 mathjax">Effective Protein-Protein Interaction Exploration with PPIretrieval</p>
<p class="authors"><span class="search-hit">Authors:</span> Chenqing Hua, Connor Coley, Guy Wolf, Doina Precup, Shuangjia Zheng</p>
<p class="abstract mathjax"><span class="has-text-weight-semibold">Abstract:</span> Protein-protein interactions (PPIs) are crucial in regulating numerous cellular functions, including signal transduction, transportation, and immune defense. As the accuracy of multi-chain protein complex structure prediction improves, the challenge has shifted towards effectively navigating the vast complex universe to identify potential PPIs. Herein, we propose PPIretrieval, the first deep learning-based model for protein-protein interaction exploration, which leverages existing PPI data to effectively search for potential PPIs in an embedding space, capturing rich geometric and chemical information of protein surfaces. When provided with an unseen query protein with its associated binding site, PPIretrieval effectively identifies a potential binding partner along with its corresponding binding site in an embedding space, facilitating the formation of protein-protein complexes.</p>
<p class="is-size-7"><span class="has-text-weight-semibold">Submitted</span> 5 February, 2024; originally announced February 2024.</p>
</li>
<li class="arxiv-result">
<p class="list-title"><a href="https://arxiv.org/abs/2312.11669">arXiv:2312.11669</a> [<a href="https://arxiv.org/pdf/2312.11669">pdf</a>, <a href="https://arxiv.org/format/2312.11669">other</a>]</p>
<div class="tags"><span class="tag">cs.LG (Machine Learning)</span> <span class="tag">cs.AI (Artificial Intelligence)</span></div>
<p class="title is-5 mathjax">Prediction and Control in Continual Reinforcement Learning</p>
<p class="authors"><span class="search-hit">Authors:</span> Nishanth Anand, Doina Precup</p>
<p class="abstract mathjax"><span class="has-text-weight-semibold">Abstract:</span> Temporal difference (TD) learning is often used to update the estimate of the value function which is used by RL agents to extract useful policies. In this paper, we focus on value function estimation in continual reinforcement learning. We propose to decompose the value function into two components which update at different timescales: a permanent value function, which holds general knowledge that persists over time, and a transient value function, which allows quick adaptation to new situations. We establish theoretical results showing that our approach is well suited for continual learning and draw connections to the complementary learning systems (CLS) theory from neuroscience. Empirically, this approach improves performance significantly on both prediction and control problems.</p>
<p class="is-size-7"><span class="has-text-weight-semibold">Submitted</span> 18 December, 2023; originally announced December 2023.</p>
<p class="comments is-size-7"><span class="has-text-weight-semibold">Comments:</span> Published at the 37th Conference on Neural Information Processing Systems (NeurIPS 2023)</p>
</li>
<li class="arxiv-result">
<p class="list-title"><a href="https://arxiv.org/abs/2312.00886">arXiv:2312.00886</a> [<a href="https://arxiv.org/pdf/2312.00886">pdf</a>, <a href="https://arxiv.org/format/2312.00886">other</a>]</p>
<div class="tags"><span class="tag">stat.ML (Machine Learning)</span> <span class="tag">cs.AI (Artificial Intelligence)</span> <span class="tag">cs.GT (Computer Science and Game Theory)</span> <span class="tag">cs.LG (Machine Learning)</span> <span class="tag">cs.MA (Multiagent Systems)</span></div>
<p class="title is-5 mathjax">Nash Learning from Human Feedback</p>
<p class="authors"><span class="search-hit">Authors:</span> Rémi Munos, Michal Valko, Daniele Calandriello, Mohammad Gheshlaghi Azar, Mark Rowland, Zhaohan Daniel Guo, Yunhao Tang, Matthieu Geist, Thomas Mesnard, Andrea Michi, Marco Selvi, Sertan Girgin, Nikola Momchev, Olivier Bachem, Daniel J. Mankowitz, Doina Precup, Bilal Piot</p>
<p class="abstract mathjax"><span class="has-text-weight-semibold">Abstract:</span> Reinforcement learning from human feedback (RLHF) has emerged as the main paradigm for aligning large language models (LLMs) with human preferences. Typically, RLHF involves the initial step of learning a reward model from human feedback, often expressed as preferences between pairs of text generations produced by a pre-trained LLM. Subsequently, the LLM's policy is fine-tuned by optimizing it to maximize the reward model through a reinforcement learning algorithm. However, an inherent limitation of current reward models is their inability to fully represent the richness of human preferences and their dependency on the sampling distribution. In this study, we introduce an alternative pipeline for the fine-tuning of LLMs using pairwise human feedback. Our approach entails the initial learning of a preference model, which is conditioned on two inputs given a prompt, followed by the pursuit of a policy that consistently generates responses preferred over those generated by any competing policy, thus defining the Nash equilibrium of this preference model. We term this approach Nash learning from human feedback (NLHF). In the context of a tabular policy representation, we present a novel algorithmic solution, Nash-MD, founded on the principles of mirror descent. This algorithm produces a sequence of policies, with the last iteration converging to the regularized Nash equilibrium. Additionally, we explore parametric representations of policies and introduce gradient descent algorithms for deep-learning architectures. To demonstrate the effectiveness of our approach, we present experimental results involving the fine-tuning of an LLM for a text summarization task. We believe NLHF offers a compelling avenue for preference learning and policy optimization with the potential of advancing the field of aligning LLMs with human preferences.</p>
<p class="is-size-7"><span class="has-text-weight-semibold">Submitted</span> 11 June, 2024; v1 submitted 1 December, 2023; originally announced December 2023.</p>
</li>
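<li class="code-note">
<p class="is-size-7">A toy illustration of the Nash view of preference learning: on a finite set of responses with a cyclic preference matrix (so no single response beats all others), multiplicative-weights self-play approximates the Nash policy. This is a generic mirror-descent-style scheme on a made-up matrix, not the paper's regularized Nash-MD algorithm.</p>
<pre><code class="language-python">
import numpy as np

def nash_of_preference(P, iters=5000, eta=0.1):
    """Approximate the policy that is preferred on average to any alternative:
    the Nash equilibrium of the symmetric game with payoff
    P[i, j] = Pr(response i is preferred to response j)."""
    n = P.shape[0]
    pi = np.ones(n) / n
    avg = np.zeros(n)
    for _ in range(iters):
        payoff = P @ pi                    # win-rate of each response vs pi
        pi = pi * np.exp(eta * payoff)     # multiplicative-weights step
        pi = pi / pi.sum()
        avg += pi
    return avg / iters                     # averaged iterates converge

P = np.array([[0.5, 0.8, 0.2],
              [0.2, 0.5, 0.9],
              [0.8, 0.1, 0.5]])            # cyclic preferences, no single best
print(nash_of_preference(P).round(2))      # a mixed (randomized) Nash policy
</code></pre>
</li>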
<li class="arxiv-result">
<p class="list-title"><a href="https://arxiv.org/abs/2311.03583">arXiv:2311.03583</a> [<a href="https://arxiv.org/pdf/2311.03583">pdf</a>, <a href="https://arxiv.org/format/2311.03583">other</a>]</p>
<div class="tags"><span class="tag">cs.AI (Artificial Intelligence)</span> <span class="tag">cs.DM (Discrete Mathematics)</span> <span class="tag">cs.LG (Machine Learning)</span></div>
<p class="title is-5 mathjax">Finding Increasingly Large Extremal Graphs with AlphaZero and Tabu Search</p>
<p class="authors"><span class="search-hit">Authors:</span> Abbas Mehrabian, Ankit Anand, Hyunjik Kim, Nicolas Sonnerat, Matej Balog, Gheorghe Comanici, Tudor Berariu, Andrew Lee, Anian Ruoss, Anna Bulanova, Daniel Toyama, Sam Blackwell, Bernardino Romera Paredes, Petar Veličković, Laurent Orseau, Joonkyung Lee, Anurag Murty Naredla, Doina Precup, Adam Zsolt Wagner</p>
<p class="abstract mathjax"><span class="has-text-weight-semibold">Abstract:</span> This work studies a central extremal graph theory problem inspired by a 1975 conjecture of Erdős, which aims to find graphs with a given size (number of nodes) that maximize the number of edges without having 3- or 4-cycles. We formulate this problem as a sequential decision-making problem and compare AlphaZero, a neural network-guided tree search, with tabu search, a heuristic local search method. Using either method, by introducing a curriculum -- jump-starting the search for larger graphs using good graphs found at smaller sizes -- we improve the state-of-the-art lower bounds for several sizes. We also propose a flexible graph-generation environment and a permutation-invariant network architecture for learning to search in the space of graphs.</p>
<p class="is-size-7"><span class="has-text-weight-semibold">Submitted</span> 29 July, 2024; v1 submitted 6 November, 2023; originally announced November 2023.</p>
<p class="comments is-size-7"><span class="has-text-weight-semibold">Comments:</span> To appear in the proceedings of IJCAI 2024. First three authors contributed equally, last two authors made equal senior contribution</p>
</li>
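<li class="code-note">
<p class="is-size-7">A toy tabu-flavored local search for the same objective (many edges, no 3- or 4-cycles): random edge additions with short-cycle checks, plus occasional tabu-protected removals to escape local optima. The paper's tabu search, AlphaZero agent, and size curriculum are far more sophisticated; all names and parameters here are illustrative.</p>
<pre><code class="language-python">
import itertools, random

def creates_short_cycle(adj, u, v):
    """Would adding edge (u, v) close a 3-cycle or a 4-cycle?"""
    if adj[u].intersection(adj[v]):           # common neighbour: triangle
        return True
    for w in adj[u]:                           # u-w-x-v path: 4-cycle
        for x in adj[w]:
            if x != u and v in adj[x]:
                return True
    return False

def tabu_girth5(n, steps=20000, tenure=50, seed=0):
    rng = random.Random(seed)
    adj = {i: set() for i in range(n)}
    tabu = {}
    pairs = list(itertools.combinations(range(n), 2))
    for t in range(steps):
        u, v = rng.choice(pairs)
        if tabu.get((u, v), -1) >= t:
            continue
        if v in adj[u]:
            if rng.random() > 0.99:            # occasional perturbation
                adj[u].discard(v); adj[v].discard(u)
                tabu[(u, v)] = t + tenure      # forbid immediate re-adding
            continue
        if not creates_short_cycle(adj, u, v):
            adj[u].add(v); adj[v].add(u)
    return sum(len(s) for s in adj.values()) // 2

print(tabu_girth5(20))   # number of edges found by the toy search
</code></pre>
</li>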
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.01990">arXiv:2311.01990</a> <span> [<a href="https://arxiv.org/pdf/2311.01990">pdf</a>, <a href="https://arxiv.org/format/2311.01990">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Conditions on Preference Relations that Guarantee the Existence of Optimal Policies </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Carr%2C+J+C">Jonathan Colaço Carr</a>, <a href="/search/cs?searchtype=author&query=Panangaden%2C+P">Prakash Panangaden</a>, <a href="/search/cs?searchtype=author&query=Precup%2C+D">Doina Precup</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2311.01990v2-abstract-full"> Learning from Preferential Feedback (LfPF) plays an essential role in training Large Language Models, as well as certain types of interactive learning agents. However, a substantial gap exists between the theory and application of LfPF algorithms. Current results guaranteeing the existence of optimal policies in LfPF problems assume that both the preferences and transition dynamics are determined by a Markov Decision Process. We introduce the Direct Preference Process, a new framework for analyzing LfPF problems in partially-observable, non-Markovian environments. Within this framework, we establish conditions that guarantee the existence of optimal policies by considering the ordinal structure of the preferences. We show that a decision-making problem can have optimal policies -- that are characterized by recursive optimality equations -- even when no reward function can express the learning goal. These findings underline the need to explore preference-based learning strategies which do not assume that preferences are generated by reward. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">v2: replaced with accepted AISTATS 2024 version, containing a new summary figure and one extra example. Results and conclusions are unchanged</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.19685">arXiv:2310.19685</a> <span> [<a href="https://arxiv.org/pdf/2310.19685">pdf</a>, <a href="https://arxiv.org/format/2310.19685">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Biomolecules">q-bio.BM</span> </div> </div> <p class="title is-5 mathjax"> DGFN: Double Generative Flow Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lau%2C+E">Elaine Lau</a>, <a href="/search/cs?searchtype=author&query=Vemgal%2C+N">Nikhil Vemgal</a>, <a href="/search/cs?searchtype=author&query=Precup%2C+D">Doina Precup</a>, <a href="/search/cs?searchtype=author&query=Bengio%2C+E">Emmanuel Bengio</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2310.19685v3-abstract-full"> Deep learning is emerging as an effective tool in drug discovery, with potential applications in both predictive and generative models. Generative Flow Networks (GFlowNets/GFNs) are a recently introduced method recognized for the ability to generate diverse candidates, in particular in small molecule generation tasks. In this work, we introduce double GFlowNets (DGFNs).
Drawing inspiration from reinforcement learning and Double Deep Q-Learning, we introduce a target network used to sample trajectories, while updating the main network with these sampled trajectories. Empirical results confirm that DGFNs effectively enhance exploration in sparse reward domains and high-dimensional state spaces, both challenging aspects of de-novo design in drug discovery. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to NeurIPS 2023 Workshop</span> </p> </li>
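<p> The core DGFN mechanic described above -- a target copy of the network samples trajectories while the main network is trained on them, with periodic synchronization -- can be sketched on a toy sequence-building task. The environment, reward, trajectory-balance objective, and sync schedule below are illustrative assumptions, not the paper's setup (PyTorch): </p> <pre>
# Toy "build a binary string" GFlowNet with a target-network sampler.
import copy, torch, torch.nn as nn

L = 8                                     # string length
def reward(bits):                         # toy, somewhat sparse reward
    return 2.0 if bits.count(1) in (0, L) else 0.1

class Policy(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(L, 64), nn.ReLU(), nn.Linear(64, 2))
        self.log_z = nn.Parameter(torch.zeros(1))   # log partition estimate
    def forward(self, prefix):                      # prefix: bits chosen so far
        x = torch.zeros(L)
        x[:len(prefix)] = torch.tensor([2 * b - 1 for b in prefix], dtype=torch.float)
        return self.net(x).log_softmax(-1)

main = Policy()
target = copy.deepcopy(main)              # frozen sampler, synced periodically
opt = torch.optim.Adam(main.parameters(), lr=1e-2)

for step in range(2000):
    bits, log_pf = [], 0.0
    for _ in range(L):                    # the *target* net samples actions
        with torch.no_grad():
            probs = target(bits).exp()
        a = torch.multinomial(probs, 1).item()
        log_pf = log_pf + main(bits)[a]   # the *main* net scores those actions
        bits.append(a)
    # Trajectory balance; each string has a unique parent chain, so log P_B = 0.
    loss = (main.log_z + log_pf - torch.log(torch.tensor(reward(bits)))) ** 2
    opt.zero_grad(); loss.backward(); opt.step()
    if step % 100 == 0:
        target.load_state_dict(main.state_dict())   # periodic hard sync
</pre>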
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.09997">arXiv:2310.09997</a> <span> [<a href="https://arxiv.org/pdf/2310.09997">pdf</a>, <a href="https://arxiv.org/format/2310.09997">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Forecaster: Towards Temporally Abstract Tree-Search Planning from Pixels </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiralerspong%2C+T">Thomas Jiralerspong</a>, <a href="/search/cs?searchtype=author&query=Kondrup%2C+F">Flemming Kondrup</a>, <a href="/search/cs?searchtype=author&query=Precup%2C+D">Doina Precup</a>, <a href="/search/cs?searchtype=author&query=Khetarpal%2C+K">Khimya Khetarpal</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2310.09997v1-abstract-full"> The ability to plan at many different levels of abstraction enables agents to envision the long-term repercussions of their decisions and thus enables sample-efficient learning. This becomes particularly beneficial in complex environments with high-dimensional state spaces, such as pixels, where the goal is distant and the reward sparse. We introduce Forecaster, a deep hierarchical reinforcement learning approach which plans over high-level goals by leveraging a temporally abstract world model. Forecaster learns an abstract model of its environment by modelling the transition dynamics at an abstract level and training a world model on such transitions. It then uses this world model to choose optimal high-level goals through a tree-search planning procedure. It additionally trains a low-level policy that learns to reach those goals. Our method addresses not only building world models with longer horizons, but also planning with such models in downstream tasks. We empirically demonstrate Forecaster's potential in both single-task learning and generalization to new tasks in the AntMaze domain. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.08338">arXiv:2310.08338</a> <span> [<a href="https://arxiv.org/pdf/2310.08338">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neurons and Cognition">q-bio.NC</span> </div> </div> <p class="title is-5 mathjax"> A cry for help: Early detection of brain injury in newborns </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Onu%2C+C+C">Charles C. Onu</a>, <a href="/search/cs?searchtype=author&query=Latremouille%2C+S">Samantha Latremouille</a>, <a href="/search/cs?searchtype=author&query=Gorin%2C+A">Arsenii Gorin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Junhao Wang</a>, <a href="/search/cs?searchtype=author&query=Udeogu%2C+I">Innocent Udeogu</a>, <a href="/search/cs?searchtype=author&query=Ekwochi%2C+U">Uchenna Ekwochi</a>, <a href="/search/cs?searchtype=author&query=Ubuane%2C+P+O">Peter O. Ubuane</a>, <a href="/search/cs?searchtype=author&query=Kehinde%2C+O+A">Omolara A.
Kehinde</a>, <a href="/search/cs?searchtype=author&query=Salisu%2C+M+A">Muhammad A. Salisu</a>, <a href="/search/cs?searchtype=author&query=Briggs%2C+D">Datonye Briggs</a>, <a href="/search/cs?searchtype=author&query=Bengio%2C+Y">Yoshua Bengio</a>, <a href="/search/cs?searchtype=author&query=Precup%2C+D">Doina Precup</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2310.08338v3-abstract-full"> Since the 1960s, neonatal clinicians have known that newborns suffering from certain neurological conditions exhibit altered crying patterns, such as the high-pitched cry in birth asphyxia. Despite an annual burden of over 1.5 million infant deaths and disabilities, early detection of neonatal brain injuries due to asphyxia remains a challenge, particularly in developing countries, where the majority of births are not attended by a trained physician. Here we report on the first inter-continental clinical study to demonstrate that neonatal brain injury can be reliably determined from recorded infant cries using an AI algorithm we call Roseline. Previous and recent work has been limited by the lack of a large, high-quality clinical database of cry recordings, constraining the application of state-of-the-art machine learning. We develop a new training methodology for audio-based pathology detection models and evaluate this system on a large database of newborn cry sounds acquired from geographically diverse settings -- 5 hospitals across 3 continents. Our system extracts interpretable acoustic biomarkers that support clinical decisions and is able to accurately detect neurological injury from newborns' cries with an AUC of 92.5% (88.7% sensitivity at 80% specificity). Cry-based neurological monitoring opens the door for low-cost, easy-to-use, non-invasive and contact-free screening of at-risk babies, especially when integrated into simple devices like smartphones or neonatal ICU monitors. This would provide a reliable tool where there are no alternatives, but also curtail the need to regularly subject newborns to physically exhausting or radiation-exposing assessments such as brain CT scans. This work sets the stage for embracing the infant cry as a vital sign and indicates the potential of AI-driven sound monitoring for the future of affordable healthcare.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.08338v3-abstract-full').style.display = 'none'; document.getElementById('2310.08338v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.00229">arXiv:2310.00229</a> <span> [<a href="https://arxiv.org/pdf/2310.00229">pdf</a>, <a href="https://arxiv.org/format/2310.00229">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Consciousness-Inspired Spatio-Temporal Abstractions for Better Generalization in Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+M">Mingde Zhao</a>, <a href="/search/cs?searchtype=author&query=Alver%2C+S">Safa Alver</a>, <a href="/search/cs?searchtype=author&query=van+Seijen%2C+H">Harm van Seijen</a>, <a href="/search/cs?searchtype=author&query=Laroche%2C+R">Romain Laroche</a>, <a href="/search/cs?searchtype=author&query=Precup%2C+D">Doina Precup</a>, <a href="/search/cs?searchtype=author&query=Bengio%2C+Y">Yoshua Bengio</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.00229v4-abstract-short" style="display: inline;"> Inspired by human conscious planning, we propose Skipper, a model-based reinforcement learning framework utilizing spatio-temporal abstractions to generalize better in novel situations. It automatically decomposes the given task into smaller, more manageable subtasks, and thus enables sparse decision-making and focused computation on the relevant parts of the environment. The decomposition relies… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.00229v4-abstract-full').style.display = 'inline'; document.getElementById('2310.00229v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.00229v4-abstract-full" style="display: none;"> Inspired by human conscious planning, we propose Skipper, a model-based reinforcement learning framework utilizing spatio-temporal abstractions to generalize better in novel situations. It automatically decomposes the given task into smaller, more manageable subtasks, and thus enables sparse decision-making and focused computation on the relevant parts of the environment. The decomposition relies on the extraction of an abstracted proxy problem represented as a directed graph, in which vertices and edges are learned end-to-end from hindsight. Our theoretical analyses provide performance guarantees under appropriate assumptions and establish where our approach is expected to be helpful. 
Generalization-focused experiments validate Skipper's significant advantage in zero-shot generalization, compared to some existing state-of-the-art hierarchical planning methods. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICLR 2024 Camera-Ready</span> </p> </li>
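<p> To make the "abstract proxy problem represented as a directed graph" concrete, here is a hedged sketch of one way a planner could choose a path through learned checkpoints by maximizing estimated traversal reliability. The graph format and the Dijkstra-style search are assumptions for illustration, not Skipper's actual procedure: </p> <pre>
# Most-reliable path over a proxy graph: graph[u] -> list of (v, p_success).
# Maximizing the product of probabilities = minimizing the sum of -log p.
# Assumes the goal is reachable from the start.
import heapq, math

def most_reliable_path(graph, start, goal):
    dist, prev, heap = {start: 0.0}, {}, [(0.0, start)]
    while heap:
        d, u = heapq.heappop(heap)
        if u == goal:
            break
        if d > dist.get(u, math.inf):
            continue                      # stale heap entry
        for v, p in graph.get(u, []):
            nd = d - math.log(max(p, 1e-9))
            if nd < dist.get(v, math.inf):
                dist[v], prev[v] = nd, u
                heapq.heappush(heap, (nd, v))
    path = [goal]
    while path[-1] != start:
        path.append(prev[path[-1]])
    return path[::-1]
</pre> <p> In a Skipper-like system, the low-level policy would then be tasked with reaching the first checkpoint on the returned path. </p>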
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.15470">arXiv:2308.15470</a> <span> [<a href="https://arxiv.org/pdf/2308.15470">pdf</a>, <a href="https://arxiv.org/format/2308.15470">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Policy composition in reinforcement learning via multi-objective policy optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mishra%2C+S">Shruti Mishra</a>, <a href="/search/cs?searchtype=author&query=Anand%2C+A">Ankit Anand</a>, <a href="/search/cs?searchtype=author&query=Hoffmann%2C+J">Jordan Hoffmann</a>, <a href="/search/cs?searchtype=author&query=Heess%2C+N">Nicolas Heess</a>, <a href="/search/cs?searchtype=author&query=Riedmiller%2C+M">Martin Riedmiller</a>, <a href="/search/cs?searchtype=author&query=Abdolmaleki%2C+A">Abbas Abdolmaleki</a>, <a href="/search/cs?searchtype=author&query=Precup%2C+D">Doina Precup</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2308.15470v2-abstract-full"> We enable reinforcement learning agents to learn successful behavior policies by utilizing relevant pre-existing teacher policies. The teacher policies are introduced as objectives, in addition to the task objective, in a multi-objective policy optimization setting. Using the Multi-Objective Maximum a Posteriori Policy Optimization algorithm (Abdolmaleki et al. 2020), we show that teacher policies can help speed up learning, particularly in the absence of shaping rewards. In two domains with continuous observation and action spaces, our agents successfully compose teacher policies in sequence and in parallel, and are also able to further extend the policies of the teachers in order to solve the task. Depending on the specified combination of task and teacher(s), teacher(s) may naturally act to limit the final performance of an agent. The extent to which agents are required to adhere to teacher policies is determined by hyperparameters which control both the effect of teachers on learning speed and the eventual performance of the agent on the task. In the humanoid domain (Tassa et al. 2018), we also equip agents with the ability to control the selection of teachers. With this ability, agents are able to meaningfully compose from the teacher policies to achieve a higher task reward on the walk task than in cases without access to the teacher policies. We show the resemblance of composed task policies to the corresponding teacher policies through videos. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> </li>
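<p> One simple way to picture "teachers as additional objectives" is a policy loss that adds a weighted KL term per teacher. Fixed weights here stand in for MO-MPO's per-objective constraints, so this is a simplified sketch rather than the algorithm used in the paper (PyTorch): </p> <pre>
# Task loss plus per-teacher KL(pi || teacher) penalties over a discrete
# action distribution; weights are a simplifying assumption.
import torch

def composed_policy_loss(task_loss, pi_logits, teacher_logits, weights):
    logp = pi_logits.log_softmax(-1)
    loss = task_loss
    for w, t in zip(weights, teacher_logits):
        t_logp = t.log_softmax(-1)
        kl = (logp.exp() * (logp - t_logp)).sum(-1).mean()
        loss = loss + w * kl   # larger w forces closer adherence to a teacher
    return loss
</pre> <p> The weights play the role the abstract assigns to hyperparameters: they trade off the teachers' effect on learning speed against the performance ceiling the teachers can impose. </p>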
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.11046">arXiv:2307.11046</a> <span> [<a href="https://arxiv.org/pdf/2307.11046">pdf</a>, <a href="https://arxiv.org/format/2307.11046">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> A Definition of Continual Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Abel%2C+D">David Abel</a>, <a href="/search/cs?searchtype=author&query=Barreto%2C+A">André Barreto</a>, <a href="/search/cs?searchtype=author&query=Van+Roy%2C+B">Benjamin Van Roy</a>, <a href="/search/cs?searchtype=author&query=Precup%2C+D">Doina Precup</a>, <a href="/search/cs?searchtype=author&query=van+Hasselt%2C+H">Hado van Hasselt</a>, <a href="/search/cs?searchtype=author&query=Singh%2C+S">Satinder Singh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2307.11046v2-abstract-full"> In a standard view of the reinforcement learning problem, an agent's goal is to efficiently identify a policy that maximizes long-term reward. However, this perspective is based on a restricted view of learning as finding a solution, rather than treating learning as endless adaptation. In contrast, continual reinforcement learning refers to the setting in which the best agents never stop learning. Despite the importance of continual reinforcement learning, the community lacks a simple definition of the problem that highlights its commitments and makes its primary concepts precise and clear. To this end, this paper is dedicated to carefully defining the continual reinforcement learning problem. We formalize the notion of agents that "never stop learning" through a new mathematical language for analyzing and cataloging agents. Using this new language, we define a continual learning agent as one that can be understood as carrying out an implicit search process indefinitely, and continual reinforcement learning as the setting in which the best agents are all continual learning agents. We provide two motivating examples, illustrating that traditional views of multi-task reinforcement learning and continual supervised learning are special cases of our definition. Collectively, these definitions and perspectives formalize many intuitive concepts at the heart of learning, and open new research pathways surrounding continual learning agents. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.11044">arXiv:2307.11044</a> <span> [<a href="https://arxiv.org/pdf/2307.11044">pdf</a>, <a href="https://arxiv.org/format/2307.11044">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> On the Convergence of Bounded Agents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Abel%2C+D">David Abel</a>, <a href="/search/cs?searchtype=author&query=Barreto%2C+A">Andr茅 Barreto</a>, <a href="/search/cs?searchtype=author&query=van+Hasselt%2C+H">Hado van Hasselt</a>, <a href="/search/cs?searchtype=author&query=Van+Roy%2C+B">Benjamin Van Roy</a>, <a href="/search/cs?searchtype=author&query=Precup%2C+D">Doina Precup</a>, <a href="/search/cs?searchtype=author&query=Singh%2C+S">Satinder Singh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.11044v1-abstract-short" style="display: inline;"> When has an agent converged? Standard models of the reinforcement learning problem give rise to a straightforward definition of convergence: An agent converges when its behavior or performance in each environment state stops changing. However, as we shift the focus of our learning problem from the environment's state to the agent's state, the concept of an agent's convergence becomes significantly… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.11044v1-abstract-full').style.display = 'inline'; document.getElementById('2307.11044v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.11044v1-abstract-full" style="display: none;"> When has an agent converged? Standard models of the reinforcement learning problem give rise to a straightforward definition of convergence: An agent converges when its behavior or performance in each environment state stops changing. However, as we shift the focus of our learning problem from the environment's state to the agent's state, the concept of an agent's convergence becomes significantly less clear. In this paper, we propose two complementary accounts of agent convergence in a framing of the reinforcement learning problem that centers around bounded agents. The first view says that a bounded agent has converged when the minimal number of states needed to describe the agent's future behavior cannot decrease. The second view says that a bounded agent has converged just when the agent's performance only changes if the agent's internal state changes. We establish basic properties of these two definitions, show that they accommodate typical views of convergence in standard settings, and prove several facts about their nature and relationship. We take these perspectives, definitions, and analysis to bring clarity to a central idea of the field. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.11044v1-abstract-full').style.display = 'none'; document.getElementById('2307.11044v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.07674">arXiv:2307.07674</a> <span> [<a href="https://arxiv.org/pdf/2307.07674">pdf</a>, <a href="https://arxiv.org/format/2307.07674">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> An Empirical Study of the Effectiveness of Using a Replay Buffer on Mode Discovery in GFlowNets </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Vemgal%2C+N">Nikhil Vemgal</a>, <a href="/search/cs?searchtype=author&query=Lau%2C+E">Elaine Lau</a>, <a href="/search/cs?searchtype=author&query=Precup%2C+D">Doina Precup</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.07674v2-abstract-short" style="display: inline;"> Reinforcement Learning (RL) algorithms aim to learn an optimal policy by iteratively sampling actions to learn how to maximize the total expected return, $R(x)$. GFlowNets are a special class of algorithms designed to generate diverse candidates, $x$, from a discrete set, by learning a policy that approximates the proportional sampling of $R(x)$. GFlowNets exhibit improved mode discovery compared… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.07674v2-abstract-full').style.display = 'inline'; document.getElementById('2307.07674v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.07674v2-abstract-full" style="display: none;"> Reinforcement Learning (RL) algorithms aim to learn an optimal policy by iteratively sampling actions to learn how to maximize the total expected return, $R(x)$. GFlowNets are a special class of algorithms designed to generate diverse candidates, $x$, from a discrete set, by learning a policy that approximates the proportional sampling of $R(x)$. GFlowNets exhibit improved mode discovery compared to conventional RL algorithms, which is very useful for applications such as drug discovery and combinatorial search. However, since GFlowNets are a relatively recent class of algorithms, many techniques which are useful in RL have not yet been associated with them. In this paper, we study the utilization of a replay buffer for GFlowNets. We explore empirically various replay buffer sampling techniques and assess the impact on the speed of mode discovery and the quality of the modes discovered. Our experimental results in the Hypergrid toy domain and a molecule synthesis environment demonstrate significant improvements in mode discovery when training with a replay buffer, compared to training only with trajectories generated on-policy. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.07674v2-abstract-full').style.display = 'none'; document.getElementById('2307.07674v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ICML 2023 workshop on Structured Probabilistic Inference & Generative Modeling</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.10587">arXiv:2306.10587</a> <span> [<a href="https://arxiv.org/pdf/2306.10587">pdf</a>, <a href="https://arxiv.org/format/2306.10587">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Acceleration in Policy Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chelu%2C+V">Veronica Chelu</a>, <a href="/search/cs?searchtype=author&query=Zahavy%2C+T">Tom Zahavy</a>, <a href="/search/cs?searchtype=author&query=Guez%2C+A">Arthur Guez</a>, <a href="/search/cs?searchtype=author&query=Precup%2C+D">Doina Precup</a>, <a href="/search/cs?searchtype=author&query=Flennerhag%2C+S">Sebastian Flennerhag</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.10587v2-abstract-short" style="display: inline;"> We work towards a unifying paradigm for accelerating policy optimization methods in reinforcement learning (RL) by integrating foresight in the policy improvement step via optimistic and adaptive updates. Leveraging the connection between policy iteration and policy gradient methods, we view policy optimization algorithms as iteratively solving a sequence of surrogate objectives, local lower bound… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.10587v2-abstract-full').style.display = 'inline'; document.getElementById('2306.10587v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.10587v2-abstract-full" style="display: none;"> We work towards a unifying paradigm for accelerating policy optimization methods in reinforcement learning (RL) by integrating foresight in the policy improvement step via optimistic and adaptive updates. Leveraging the connection between policy iteration and policy gradient methods, we view policy optimization algorithms as iteratively solving a sequence of surrogate objectives, local lower bounds on the original objective. 
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.10587">arXiv:2306.10587</a> <span> [<a href="https://arxiv.org/pdf/2306.10587">pdf</a>, <a href="https://arxiv.org/format/2306.10587">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Acceleration in Policy Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chelu%2C+V">Veronica Chelu</a>, <a href="/search/cs?searchtype=author&query=Zahavy%2C+T">Tom Zahavy</a>, <a href="/search/cs?searchtype=author&query=Guez%2C+A">Arthur Guez</a>, <a href="/search/cs?searchtype=author&query=Precup%2C+D">Doina Precup</a>, <a href="/search/cs?searchtype=author&query=Flennerhag%2C+S">Sebastian Flennerhag</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2306.10587v2-abstract-full"> We work towards a unifying paradigm for accelerating policy optimization methods in reinforcement learning (RL) by integrating foresight in the policy improvement step via optimistic and adaptive updates. Leveraging the connection between policy iteration and policy gradient methods, we view policy optimization algorithms as iteratively solving a sequence of surrogate objectives, local lower bounds on the original objective. We define optimism as predictive modelling of the future behavior of a policy, and adaptivity as taking immediate and anticipatory corrective actions to mitigate accumulating errors from overshooting predictions or delayed responses to change. We use this shared lens to jointly express other well-known algorithms, including model-based policy improvement based on forward search, and optimistic meta-learning algorithms. We analyze properties of this formulation, and show connections to other accelerated optimization algorithms. Then, we design an optimistic policy gradient algorithm, adaptive via meta-gradient learning, and empirically highlight several design choices pertaining to acceleration, in an illustrative task. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.02451">arXiv:2306.02451</a> <span> [<a href="https://arxiv.org/pdf/2306.02451">pdf</a>, <a href="https://arxiv.org/format/2306.02451">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> For SALE: State-Action Representation Learning for Deep Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fujimoto%2C+S">Scott Fujimoto</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+W">Wei-Di Chang</a>, <a href="/search/cs?searchtype=author&query=Smith%2C+E+J">Edward J. Smith</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+S+S">Shixiang Shane Gu</a>, <a href="/search/cs?searchtype=author&query=Precup%2C+D">Doina Precup</a>, <a href="/search/cs?searchtype=author&query=Meger%2C+D">David Meger</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2306.02451v2-abstract-full"> In the field of reinforcement learning (RL), representation learning is a proven tool for complex image-based tasks, but is often overlooked for environments with low-level states, such as physical control problems.
This paper introduces SALE, a novel approach for learning embeddings that model the nuanced interaction between state and action, enabling effective representation learning from low-level states. We extensively study the design space of these embeddings and highlight important design considerations. We integrate SALE and an adaptation of checkpoints for RL into TD3 to form the TD7 algorithm, which significantly outperforms existing continuous control algorithms. On OpenAI gym benchmark tasks, TD7 has an average performance gain of 276.7% and 50.7% over TD3 at 300k and 5M time steps, respectively, and works in both the online and offline settings. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2023</span> </p> </li>
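<p> A hedged sketch of the embedding structure described above: a state encoder f and a joint state-action encoder g, trained so that the state-action embedding predicts the next state's (frozen) embedding. Layer sizes and the exact loss are assumptions, not TD7's full recipe (PyTorch): </p> <pre>
# State and state-action embeddings with a next-state prediction objective.
import torch, torch.nn as nn

class SALE(nn.Module):
    def __init__(self, state_dim, action_dim, emb_dim=256):
        super().__init__()
        self.f = nn.Sequential(nn.Linear(state_dim, 256), nn.ELU(),
                               nn.Linear(256, emb_dim))          # z_s = f(s)
        self.g = nn.Sequential(nn.Linear(emb_dim + action_dim, 256), nn.ELU(),
                               nn.Linear(256, emb_dim))          # z_sa = g(z_s, a)
    def forward(self, s, a):
        zs = self.f(s)
        zsa = self.g(torch.cat([zs, a], dim=-1))
        return zs, zsa

def embedding_loss(model, s, a, s_next):
    # Drive z_sa toward the frozen embedding of the next state.
    _, zsa = model(s, a)
    with torch.no_grad():
        target = model.f(s_next)
    return ((zsa - target) ** 2).mean()
</pre> <p> The resulting embeddings would then be fed, alongside the raw state and action, into the value and policy networks. </p>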
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.18246">arXiv:2305.18246</a> <span> [<a href="https://arxiv.org/pdf/2305.18246">pdf</a>, <a href="https://arxiv.org/format/2305.18246">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Provable and Practical: Efficient Exploration in Reinforcement Learning via Langevin Monte Carlo </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ishfaq%2C+H">Haque Ishfaq</a>, <a href="/search/cs?searchtype=author&query=Lan%2C+Q">Qingfeng Lan</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+P">Pan Xu</a>, <a href="/search/cs?searchtype=author&query=Mahmood%2C+A+R">A. Rupam Mahmood</a>, <a href="/search/cs?searchtype=author&query=Precup%2C+D">Doina Precup</a>, <a href="/search/cs?searchtype=author&query=Anandkumar%2C+A">Anima Anandkumar</a>, <a href="/search/cs?searchtype=author&query=Azizzadenesheli%2C+K">Kamyar Azizzadenesheli</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2305.18246v2-abstract-full"> We present a scalable and effective exploration strategy based on Thompson sampling for reinforcement learning (RL). One of the key shortcomings of existing Thompson sampling algorithms is the need to perform a Gaussian approximation of the posterior distribution, which is not a good surrogate in most practical settings. We instead directly sample the Q function from its posterior distribution, by using Langevin Monte Carlo, an efficient type of Markov Chain Monte Carlo (MCMC) method. Our method only needs to perform noisy gradient descent updates to learn the exact posterior distribution of the Q function, which makes our approach easy to deploy in deep RL. We provide a rigorous theoretical analysis for the proposed method and demonstrate that, in the linear Markov decision process (linear MDP) setting, it has a regret bound of $\tilde{O}(d^{3/2}H^{3/2}\sqrt{T})$, where $d$ is the dimension of the feature mapping, $H$ is the planning horizon, and $T$ is the total number of steps. We apply this approach to deep RL, by using the Adam optimizer to perform gradient updates. Our approach achieves better or similar results compared with state-of-the-art deep RL algorithms on several challenging exploration tasks from the Atari57 suite. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published in The Twelfth International Conference on Learning Representations (ICLR) 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.05666">arXiv:2305.05666</a> <span> [<a href="https://arxiv.org/pdf/2305.05666">pdf</a>, <a href="https://arxiv.org/format/2305.05666">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Policy Gradient Methods in the Presence of Symmetries and State Abstractions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Panangaden%2C+P">Prakash Panangaden</a>, <a href="/search/cs?searchtype=author&query=Rezaei-Shoshtari%2C+S">Sahand Rezaei-Shoshtari</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+R">Rosie Zhao</a>, <a href="/search/cs?searchtype=author&query=Meger%2C+D">David Meger</a>, <a href="/search/cs?searchtype=author&query=Precup%2C+D">Doina Precup</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.05666v2-abstract-short" style="display: inline;"> Reinforcement learning (RL) on high-dimensional and complex problems relies on abstraction for improved efficiency and generalization. In this paper, we study abstraction in the continuous-control setting, and extend the definition of Markov decision process (MDP) homomorphisms to the setting of continuous state and action spaces. We derive a policy gradient theorem on the abstract MDP for both st… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.05666v2-abstract-full').style.display = 'inline'; document.getElementById('2305.05666v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.05666v2-abstract-full" style="display: none;"> Reinforcement learning (RL) on high-dimensional and complex problems relies on abstraction for improved efficiency and generalization. In this paper, we study abstraction in the continuous-control setting, and extend the definition of Markov decision process (MDP) homomorphisms to the setting of continuous state and action spaces. We derive a policy gradient theorem on the abstract MDP for both stochastic and deterministic policies. Our policy gradient results allow for leveraging approximate symmetries of the environment for policy optimization. Based on these theorems, we propose a family of actor-critic algorithms that are able to learn the policy and the MDP homomorphism map simultaneously, using the lax bisimulation metric. Finally, we introduce a series of environments with continuous symmetries to further demonstrate the ability of our algorithm for action abstraction in the presence of such symmetries. We demonstrate the effectiveness of our method on our environments, as well as on challenging visual control tasks from the DeepMind Control Suite. 
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.05666">arXiv:2305.05666</a> <span> [<a href="https://arxiv.org/pdf/2305.05666">pdf</a>, <a href="https://arxiv.org/format/2305.05666">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Policy Gradient Methods in the Presence of Symmetries and State Abstractions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Panangaden%2C+P">Prakash Panangaden</a>, <a href="/search/cs?searchtype=author&query=Rezaei-Shoshtari%2C+S">Sahand Rezaei-Shoshtari</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+R">Rosie Zhao</a>, <a href="/search/cs?searchtype=author&query=Meger%2C+D">David Meger</a>, <a href="/search/cs?searchtype=author&query=Precup%2C+D">Doina Precup</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2305.05666v2-abstract-full"> Reinforcement learning (RL) on high-dimensional and complex problems relies on abstraction for improved efficiency and generalization. In this paper, we study abstraction in the continuous-control setting, and extend the definition of Markov decision process (MDP) homomorphisms to the setting of continuous state and action spaces. We derive a policy gradient theorem on the abstract MDP for both stochastic and deterministic policies. Our policy gradient results allow for leveraging approximate symmetries of the environment for policy optimization. Based on these theorems, we propose a family of actor-critic algorithms that are able to learn the policy and the MDP homomorphism map simultaneously, using the lax bisimulation metric. Finally, we introduce a series of environments with continuous symmetries to further demonstrate the ability of our algorithm for action abstraction in the presence of such symmetries. We demonstrate the effectiveness of our method on our environments, as well as on challenging visual control tasks from the DeepMind Control Suite. Our method's ability to utilize MDP homomorphisms for representation learning leads to improved performance, and the visualizations of the latent space clearly demonstrate the structure of the learned abstraction. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published in the Journal of Machine Learning Research (JMLR). arXiv admin note: text overlap with arXiv:2209.07364</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.00969">arXiv:2305.00969</a> <span> [<a href="https://arxiv.org/pdf/2305.00969">pdf</a>, <a href="https://arxiv.org/format/2305.00969">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> CryCeleb: A Speaker Verification Dataset Based on Infant Cry Sounds </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Budaghyan%2C+D">David Budaghyan</a>, <a href="/search/cs?searchtype=author&query=Onu%2C+C+C">Charles C. Onu</a>, <a href="/search/cs?searchtype=author&query=Gorin%2C+A">Arsenii Gorin</a>, <a href="/search/cs?searchtype=author&query=Subakan%2C+C">Cem Subakan</a>, <a href="/search/cs?searchtype=author&query=Precup%2C+D">Doina Precup</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2305.00969v7-abstract-full"> This paper describes the Ubenwa CryCeleb dataset - a labeled collection of infant cries - and the accompanying CryCeleb 2023 task, which is a public speaker verification challenge based on cry sounds.
We released more than 6 hours of manually segmented cry sounds from 786 newborns for academic use, aiming to encourage research in infant cry analysis. The inaugural public competition attracted 59 participants, 11 of whom improved the baseline performance. The top-performing system achieved a significant improvement, scoring a 25.8% equal error rate, which is still far from the performance of state-of-the-art adult speaker verification systems. Therefore, we believe there is room for further research on this dataset, potentially extending beyond the verification task. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICASSP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2304.14621">arXiv:2304.14621</a> <span> [<a href="https://arxiv.org/pdf/2304.14621">pdf</a>, <a href="https://arxiv.org/format/2304.14621">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Biomolecules">q-bio.BM</span> </div> </div> <p class="title is-5 mathjax"> MUDiff: Unified Diffusion for Complete Molecule Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hua%2C+C">Chenqing Hua</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+S">Sitao Luan</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+M">Minkai Xu</a>, <a href="/search/cs?searchtype=author&query=Ying%2C+R">Rex Ying</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+J">Jie Fu</a>, <a href="/search/cs?searchtype=author&query=Ermon%2C+S">Stefano Ermon</a>, <a href="/search/cs?searchtype=author&query=Precup%2C+D">Doina Precup</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2304.14621v3-abstract-full"> Molecule generation is a very important practical problem, with uses in drug discovery and material design, and AI methods promise to provide useful solutions.
However, existing methods for molecule generation focus either on 2D graph structure or on 3D geometric structure, which is not sufficient to represent a complete molecule, as the 2D graph captures mainly topology while the 3D geometry captures mainly spatial atom arrangements. Combining these representations is essential to better represent a molecule. In this paper, we present a new model for generating a comprehensive representation of molecules, including atom features, 2D discrete molecule structures, and 3D continuous molecule coordinates, by combining discrete and continuous diffusion processes. The use of diffusion processes allows for capturing the probabilistic nature of molecular processes and exploring the effect of different factors on molecular structures. Additionally, we propose a novel graph transformer architecture to denoise the diffusion process. The transformer adheres to 3D roto-translation equivariance constraints, allowing it to learn invariant atom and edge representations while preserving the equivariance of atom coordinates. This transformer can be used to learn molecular representations robust to geometric transformations. We evaluate the performance of our model through experiments and comparisons with existing methods, showing its ability to generate more stable and valid molecules. Our model is a promising approach for designing stable and diverse molecules and can be applied to a wide range of tasks in molecular modeling. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2304.14274">arXiv:2304.14274</a> <span> [<a href="https://arxiv.org/pdf/2304.14274">pdf</a>, <a href="https://arxiv.org/format/2304.14274">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> When Do Graph Neural Networks Help with Node Classification?
arXiv:2304.14274 (https://arxiv.org/abs/2304.14274) [pdf, other]
Tags: cs.SI (Social and Information Networks), cs.LG (Machine Learning)
When Do Graph Neural Networks Help with Node Classification? Investigating the Impact of Homophily Principle on Node Distinguishability
Authors: Sitao Luan, Chenqing Hua, Minkai Xu, Qincheng Lu, Jiaqi Zhu, Xiao-Wen Chang, Jie Fu, Jure Leskovec, Doina Precup
Abstract: The homophily principle, i.e., that nodes with the same labels are more likely to be connected, has been believed to be the main reason for the performance superiority of Graph Neural Networks (GNNs) over Neural Networks on node classification tasks. Recent research suggests that, even in the absence of homophily, the advantage of GNNs still exists as long as nodes from the same class share similar neighborhood patterns. However, this argument only considers intra-class Node Distinguishability (ND) and neglects inter-class ND, which gives an incomplete understanding of homophily's effect on GNNs. In this paper, we first demonstrate this deficiency with examples and argue that the ideal situation for ND is to have smaller intra-class ND than inter-class ND. To formalize this idea and study ND in depth, we propose the Contextual Stochastic Block Model for Homophily (CSBM-H) and define two metrics, Probabilistic Bayes Error (PBE) and negative generalized Jeffreys divergence, to quantify ND. With these metrics, we visualize and analyze how graph filters, node degree distributions, and class variances influence ND, and investigate the combined effect of intra- and inter-class ND. In addition, we discover the mid-homophily pitfall, which occurs widely in graph datasets. Furthermore, we verify that, in real-world tasks, the superiority of GNNs is indeed closely related to both intra- and inter-class ND regardless of homophily levels. Grounded in this observation, we propose a new hypothesis-testing-based performance metric beyond homophily, which is non-linear, feature-based, and can provide a statistical threshold value for GNNs' superiority. Experiments indicate that it is significantly more effective than existing homophily metrics at revealing the advantages and disadvantages of graph-aware models on both synthetic and benchmark real-world datasets.
Submitted 1 January, 2024; v1 submitted 25 April, 2023; originally announced April 2023.
Comments: Accepted by 37th Conference on Neural Information Processing Systems (NeurIPS 2023)
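The paper's metrics (PBE, negative generalized Jeffreys divergence) are model-based; as a rough, assumption-laden proxy for the intra- vs inter-class node distinguishability idea, one can compare mean pairwise embedding distances. The ideal regime described above corresponds to intra < inter.

```python
import numpy as np

def node_distinguishability(embeddings, labels):
    """Mean intra-class vs inter-class pairwise distance of node embeddings;
    a crude proxy, not the paper's PBE or Jeffreys-divergence metrics."""
    dists = np.linalg.norm(embeddings[:, None, :] - embeddings[None, :, :], axis=-1)
    same = labels[:, None] == labels[None, :]
    off_diag = ~np.eye(len(labels), dtype=bool)
    intra = dists[same & off_diag].mean()   # distances within a class
    inter = dists[~same].mean()             # distances across classes
    return intra, inter
```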
arXiv:2304.00046 (https://arxiv.org/abs/2304.00046) [pdf, other]
Tags: cs.LG (Machine Learning), cs.AI (Artificial Intelligence)
Accelerating exploration and representation learning with offline pre-training
Authors: Bogdan Mazoure, Jake Bruce, Doina Precup, Rob Fergus, Ankit Anand
Abstract: Sequential decision-making agents struggle with long-horizon tasks, since solving them requires multi-step reasoning. Most reinforcement learning (RL) algorithms address this challenge by improving credit assignment, introducing memory capability, or altering the agent's intrinsic motivation (i.e., exploration) or its worldview (i.e., knowledge representation). Many of these components can be learned from offline data. In this work, we follow the hypothesis that exploration and representation learning can be improved by separately learning two different models from a single offline dataset. We show that learning a state representation using noise-contrastive estimation and a model of auxiliary reward, each separately from a single collection of human demonstrations, can significantly improve sample efficiency on the challenging NetHack benchmark. We also ablate various components of our experimental setting and highlight crucial insights.
Submitted 31 March, 2023; originally announced April 2023.
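The state-representation component rests on noise-contrastive estimation. A minimal InfoNCE-style loss, with temporally adjacent states within a batch taken as positives (our assumption for the sketch, not necessarily the authors' exact pairing), looks like:

```python
import torch
import torch.nn.functional as F

def info_nce_loss(anchor, positive, temperature=0.1):
    """Noise-contrastive estimation: each anchor state embedding should score
    highest against its own positive among all positives in the batch."""
    a = F.normalize(anchor, dim=-1)
    p = F.normalize(positive, dim=-1)
    logits = a @ p.T / temperature          # (B, B) similarity matrix
    targets = torch.arange(a.size(0))       # diagonal entries are the positives
    return F.cross_entropy(logits, targets)
```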
arXiv:2302.06784 (https://arxiv.org/abs/2302.06784) [pdf, other]
Tags: cs.CL (Computation and Language)
The Stable Entropy Hypothesis and Entropy-Aware Decoding: An Analysis and Algorithm for Robust Natural Language Generation
Authors: Kushal Arora, Timothy J. O'Donnell, Doina Precup, Jason Weston, Jackie C. K. Cheung
Abstract: State-of-the-art language generation models can degenerate when applied to open-ended generation problems such as text completion, story generation, or dialog modeling. This degeneration usually shows up in the form of incoherence, lack of vocabulary diversity, and self-repetition or copying from the context. In this paper, we postulate that "human-like" generations usually lie in a narrow and nearly flat entropy band, and that violation of these entropy bounds correlates with degenerate behavior. Our experiments show that this stable, narrow entropy zone exists across models, tasks, and domains, and confirm the hypothesis that violations of this zone correlate with degeneration. We then use this insight to propose an entropy-aware decoding algorithm that respects these entropy bounds, resulting in less degenerate, more contextual, and more "human-like" language generation in open-ended text generation settings.
Submitted 13 February, 2023; originally announced February 2023.
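A simplified sketch of the entropy-aware idea: monitor the entropy of the next-token distribution and intervene when it leaves a [lower, upper] band. The specific interventions below (greedy fallback, sharpening) are illustrative guesses, not the paper's exact algorithm.

```python
import torch
import torch.nn.functional as F

def entropy_aware_sample(logits, lower, upper):
    """Sample a next token, intervening when the predicted distribution's
    entropy leaves the 'stable entropy' band. Hypothetical interventions."""
    probs = F.softmax(logits, dim=-1)
    entropy = -(probs * torch.log(probs + 1e-12)).sum()
    if entropy < lower:
        return int(torch.argmax(probs))          # too peaked: just take argmax
    if entropy > upper:
        probs = F.softmax(logits * 1.5, dim=-1)  # too flat: sharpen, then sample
    return int(torch.multinomial(probs, 1))
```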
arXiv:2301.10119 (https://arxiv.org/abs/2301.10119) [pdf, other]
Tags: cs.LG (Machine Learning), cs.AI (Artificial Intelligence)
Minimal Value-Equivalent Partial Models for Scalable and Robust Planning in Lifelong Reinforcement Learning
Authors: Safa Alver, Doina Precup
Abstract: Learning models of the environment from pure interaction is often considered an essential component of building lifelong reinforcement learning agents. However, the common practice in model-based reinforcement learning is to learn models that capture every aspect of the agent's environment, regardless of whether those aspects matter for optimal decisions. In this paper, we argue that such models are not particularly well-suited for scalable and robust planning in lifelong reinforcement learning scenarios, and we propose new kinds of models that capture only the relevant aspects of the environment, which we call "minimal value-equivalent partial models". After providing a formal definition of these models, we provide theoretical results demonstrating the scalability advantages of planning with such models, and then perform experiments to empirically illustrate our theoretical results. We also provide useful heuristics on how to learn these kinds of models with deep learning architectures, and empirically demonstrate that models learned in this way allow planning that is robust to distribution shifts and compounding model errors. Overall, both our theoretical and empirical results suggest that minimal value-equivalent partial models can provide significant benefits for scalable and robust planning in lifelong reinforcement learning scenarios.
Submitted 11 June, 2023; v1 submitted 24 January, 2023; originally announced January 2023.
Comments: Published as a conference paper at CoLLAs 2023
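One way to read "value-equivalent": train the model to be right about values, not observations. The toy loss below captures that spirit under our own assumptions about the architecture; it is not the authors' method.

```python
import torch
import torch.nn as nn

class PartialModel(nn.Module):
    """Latent-space model trained for value equivalence rather than for
    reconstructing every observation detail: it only has to predict whatever
    is needed to get the value of the next latent state right. Illustrative."""
    def __init__(self, obs_dim, act_dim, latent_dim=32):
        super().__init__()
        self.encode = nn.Linear(obs_dim, latent_dim)
        self.dynamics = nn.Linear(latent_dim + act_dim, latent_dim)
        self.value = nn.Linear(latent_dim, 1)

    def loss(self, obs, act, next_obs):
        z_pred = self.dynamics(torch.cat([self.encode(obs), act], dim=-1))
        # value-equivalence: match values of the next state, not observations
        target = self.value(self.encode(next_obs)).detach()
        return ((self.value(z_pred) - target) ** 2).mean()
```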
arXiv:2301.00512 (https://arxiv.org/abs/2301.00512) [pdf, other]
Tags: cs.LG (Machine Learning)
On the Challenges of using Reinforcement Learning in Precision Drug Dosing: Delay and Prolongedness of Action Effects
Authors: Sumana Basu, Marc-André Legault, Adriana Romero-Soriano, Doina Precup
Abstract: Drug dosing is an important application of AI, which can be formulated as a Reinforcement Learning (RL) problem. In this paper, we identify two major challenges of using RL for drug dosing: delayed and prolonged effects of administering medications, which break the Markov assumption of the RL framework. We focus on prolongedness and define PAE-POMDP (Prolonged Action Effect-Partially Observable Markov Decision Process), a subclass of POMDPs in which the Markov assumption does not hold specifically due to prolonged effects of actions. Motivated by the pharmacology literature, we propose a simple and effective approach to converting drug dosing PAE-POMDPs into MDPs, enabling the use of existing RL algorithms to solve such problems. We validate the proposed approach on a toy task and on a challenging glucose control task, for which we devise a clinically inspired reward function. Our results demonstrate that: (1) the proposed method to restore the Markov assumption leads to significant improvements over a vanilla baseline; (2) the approach is competitive with recurrent policies, which may inherently capture the prolonged effects of actions; (3) it is remarkably more time- and memory-efficient than the recurrent baseline and hence more suitable for real-time dosing control systems; and (4) it exhibits favorable qualitative behavior in our policy analysis.
Submitted 1 January, 2023; originally announced January 2023.
Comments: Accepted to AAAI 2023
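Motivated by the pharmacokinetic framing, a conversion from prolonged action effects to an (approximately) Markov state can be sketched as an environment wrapper that carries an exponentially decaying trace of past doses. The decay constant and the classic gym-style step interface are assumptions, not the paper's fitted model.

```python
import numpy as np

class EffectTraceWrapper:
    """Restore approximate Markovianity under prolonged action effects by
    augmenting the observation with an exponentially decaying trace of past
    doses, in the spirit of pharmacokinetic decay. Hypothetical decay rate."""
    def __init__(self, env, decay=0.9):
        self.env, self.decay, self.trace = env, decay, 0.0

    def reset(self):
        self.trace = 0.0
        return np.append(self.env.reset(), self.trace)

    def step(self, dose):
        self.trace = self.decay * self.trace + dose   # lingering drug effect
        obs, reward, done, info = self.env.step(dose) # assumes 4-tuple gym API
        return np.append(obs, self.trace), reward, done, info
```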
arXiv:2212.14405 (https://arxiv.org/abs/2212.14405) [pdf, other]
Tags: cs.LG (Machine Learning)
Offline Policy Optimization in RL with Variance Regularization
Authors: Riashat Islam, Samarth Sinha, Homanga Bharadhwaj, Samin Yeasar Arnob, Zhuoran Yang, Animesh Garg, Zhaoran Wang, Lihong Li, Doina Precup
Abstract: Learning policies from fixed offline datasets is a key challenge in scaling up reinforcement learning (RL) algorithms towards practical applications. This is largely because off-policy RL algorithms suffer from distributional shift, due to the mismatch between the dataset and the target policy, leading to high variance and over-estimation of value functions. In this work, we propose variance regularization for offline RL algorithms, using stationary distribution corrections. We show that by using Fenchel duality, we can avoid the double sampling issues in computing the gradient of the variance regularizer. The proposed algorithm for offline variance regularization (OVAR) can be used to augment any existing offline policy optimization algorithm. We show that the regularizer leads to a lower bound on the offline policy optimization objective, which can help avoid over-estimation errors, and explains the benefits of our approach across a range of continuous control domains when compared to existing state-of-the-art algorithms.
Submitted 29 December, 2022; originally announced December 2022.
Comments: Old draft; Offline RL Workshop, NeurIPS'20
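The double-sampling problem arises because Var(X) = E[X²] − (E[X])² needs two independent samples to estimate the squared-mean term without bias. The standard Fenchel-dual workaround, Var(X) = min over ν of E[(X − ν)²], trades that for a jointly optimized scalar; a sketch under our own naming, not OVAR itself:

```python
import torch

def variance_surrogate(returns, nu):
    """Single-sample-friendly variance surrogate: Var(X) = min_nu E[(X - nu)^2],
    the Fenchel-dual identity that avoids double sampling for (E[X])^2.
    `nu` is a learnable scalar optimised jointly with the policy."""
    return ((returns - nu) ** 2).mean()

# usage sketch: add lambda * variance_surrogate(...) to the policy loss
nu = torch.zeros(1, requires_grad=True)
returns = torch.randn(256)               # hypothetical batch of corrected returns
reg = variance_surrogate(returns, nu)    # minimised over both the policy and nu
```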
arXiv:2212.10822 (https://arxiv.org/abs/2212.10822) [pdf, other]
Tags: cs.LG (Machine Learning), cs.AI (Artificial Intelligence)
Complete the Missing Half: Augmenting Aggregation Filtering with Diversification for Graph Convolutional Neural Networks
Authors: Sitao Luan, Mingde Zhao, Chenqing Hua, Xiao-Wen Chang, Doina Precup
Abstract: The core operation of current Graph Neural Networks (GNNs) is aggregation, enabled by the graph Laplacian or message passing, which filters the neighborhood information of nodes. Though effective for various tasks, in this paper we show that aggregation is potentially a problematic factor underlying all GNN models on certain datasets, as it forces node representations to become similar, so that nodes gradually lose their identity and become indistinguishable. Hence, we augment the aggregation operations with their dual, i.e., diversification operators that make nodes more distinct and preserve their identity. Such augmentation replaces aggregation with a two-channel filtering process that, in theory, is beneficial for enriching node representations. In practice, the proposed two-channel filters can easily be patched onto existing GNN methods with diverse training strategies, including spectral and spatial (message passing) methods. In our experiments, we observe the desired characteristics of the models and a significant performance boost over the baselines on 9 node classification tasks.
Submitted 21 December, 2022; originally announced December 2022.
Comments: Accepted as Oral Presentation at NeurIPS 2022 New Frontiers in Graph Learning Workshop (NeurIPS GLFrontiers 2022)
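A minimal sketch of the two-channel idea: a low-pass aggregation channel plus its high-pass diversification dual, combined additively. The layer shapes and ReLU combination are our assumptions, not the paper's exact parameterization.

```python
import torch
import torch.nn as nn

class TwoChannelConv(nn.Module):
    """Aggregation (low-pass, A_hat X W1) plus its diversification dual
    (high-pass, (I - A_hat) X W2), combined into one layer."""
    def __init__(self, in_dim, out_dim):
        super().__init__()
        self.w_low = nn.Linear(in_dim, out_dim)
        self.w_high = nn.Linear(in_dim, out_dim)

    def forward(self, a_hat, x):
        low = a_hat @ self.w_low(x)            # smooths toward neighbours
        high = self.w_high(x - a_hat @ x)      # accentuates differences, keeps identity
        return torch.relu(low + high)
```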
arXiv:2211.13337 (https://arxiv.org/abs/2211.13337) [pdf, other]
Tags: cs.LG (Machine Learning)
Multi-Environment Pretraining Enables Transfer to Action Limited Datasets
Authors: David Venuto, Sherry Yang, Pieter Abbeel, Doina Precup, Igor Mordatch, Ofir Nachum
Abstract: Using massive datasets to train large-scale models has emerged as a dominant approach for broad generalization in natural language and vision applications. In reinforcement learning, however, a key challenge is that available data on sequential decision making is often not annotated with actions; for example, videos of game-play are far more plentiful than sequences of frames paired with their logged game controls. We propose to circumvent this challenge by combining a large but sparsely annotated dataset from a target environment of interest with fully annotated datasets from various other source environments. Our method, Action Limited PreTraining (ALPT), leverages the generalization capabilities of inverse dynamics modelling (IDM) to label missing action data in the target environment. We show that utilizing even one additional environment's labelled data during IDM pretraining yields substantial improvements in generating action labels for unannotated sequences. We evaluate our method on benchmark game-playing environments and show that it significantly improves game performance and generalization capability compared to other approaches, using annotated datasets equivalent to only 12 minutes of gameplay. Highlighting the power of IDM, we show that these benefits persist even when target and source environments share no common actions.
Submitted 5 December, 2022; v1 submitted 23 November, 2022; originally announced November 2022.
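The labeling step can be sketched in a few lines: given an inverse dynamics model `idm` (here assumed to be any network mapping consecutive observation pairs to action logits, trained on the labelled slice), pseudo-label the unlabelled frames, then use them for ordinary behaviour-cloning-style pretraining.

```python
import torch

def pseudo_label_actions(idm, frames):
    """Infer the missing action a_t from each (s_t, s_{t+1}) pair using a
    pretrained inverse dynamics model. `idm` is a hypothetical callable."""
    with torch.no_grad():
        logits = idm(frames[:-1], frames[1:])   # predict a_t from s_t and s_{t+1}
        return logits.argmax(dim=-1)            # hard pseudo-labels
```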
arXiv:2211.12100 (https://arxiv.org/abs/2211.12100) [pdf, other]
Tags: cs.CV (Computer Vision and Pattern Recognition)
Simulating Human Gaze with Neural Visual Attention
Authors: Leo Schwinn, Doina Precup, Bjoern Eskofier, Dario Zanca
Abstract: Existing models of human visual attention are generally unable to incorporate direct task guidance and therefore cannot model an intent or goal when exploring a scene. To integrate guidance from any downstream visual task into attention modeling, we propose the Neural Visual Attention (NeVA) algorithm. To this end, we impose on neural networks the biological constraint of foveated vision and train an attention mechanism to generate visual explorations that maximize performance with respect to the downstream task. We observe that biologically constrained neural networks generate human-like scanpaths without being trained for this objective. Extensive experiments on three common benchmark datasets show that our method outperforms state-of-the-art unsupervised human attention models in generating human-like scanpaths.
Submitted 22 November, 2022; originally announced November 2022.
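Foveated vision can be approximated by blending a sharp image with a degraded copy according to distance from the fixation point, so the downstream task loss can only be reduced by fixating informative regions. A crude sketch (the mean-value "blur" below stands in for a proper Gaussian blur, and fixation coordinates are assumed normalized to [0, 1]):

```python
import numpy as np

def foveate(image, fixation, sigma=0.15):
    """Keep full resolution near the fixation point and blend toward a
    degraded copy with distance; `image` is assumed H x W x C float."""
    h, w = image.shape[:2]
    ys, xs = np.mgrid[0:h, 0:w] / max(h, w)
    fy, fx = fixation
    acuity = np.exp(-((ys - fy) ** 2 + (xs - fx) ** 2) / (2 * sigma ** 2))
    degraded = image.mean() * np.ones_like(image)  # crude stand-in for a blur
    return acuity[..., None] * image + (1 - acuity[..., None]) * degraded
```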
arXiv:2211.03011 (https://arxiv.org/abs/2211.03011) [pdf, other]
Tags: cs.LG (Machine Learning), eess.SY (Systems and Control), stat.ML (Machine Learning)
On learning history based policies for controlling Markov decision processes
Authors: Gandharv Patil, Aditya Mahajan, Doina Precup
Abstract: Reinforcement learning (RL) folklore suggests that history-based function approximation methods, such as recurrent neural nets or history-based state abstraction, perform better than their memory-less counterparts, because function approximation in Markov decision processes (MDPs) can be viewed as inducing a partially observable MDP. However, there has been little formal analysis of such history-based algorithms, as most existing frameworks focus exclusively on memory-less features. In this paper, we introduce a theoretical framework for studying the behaviour of RL algorithms that learn to control an MDP using history-based feature abstraction mappings. Furthermore, we use this framework to design a practical RL algorithm, and we numerically evaluate its effectiveness on a set of continuous control tasks.
Submitted 5 November, 2022; originally announced November 2022.
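The memoryful-vs-memoryless contrast can be made concrete with a policy that acts on a sliding window of the k most recent observations, a simple instance of history-based feature abstraction. The window size and MLP below are our choices for illustration.

```python
from collections import deque
import torch
import torch.nn as nn

class HistoryPolicy(nn.Module):
    """Memoryful counterpart to a reactive policy: act on the k most recent
    observations instead of only the current one."""
    def __init__(self, obs_dim, act_dim, k=4):
        super().__init__()
        self.k = k
        self.buffer = deque(maxlen=k)
        self.net = nn.Sequential(nn.Linear(k * obs_dim, 64), nn.ReLU(),
                                 nn.Linear(64, act_dim))

    def act(self, obs):
        self.buffer.append(obs)
        while len(self.buffer) < self.k:     # pad with the first observation
            self.buffer.append(obs)
        return self.net(torch.cat(list(self.buffer)))
```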
arXiv:2210.16979 (https://arxiv.org/abs/2210.16979) [pdf, ps, other]
Tags: cs.LG (Machine Learning)
When Do We Need Graph Neural Networks for Node Classification?
Authors: Sitao Luan, Chenqing Hua, Qincheng Lu, Jiaqi Zhu, Xiao-Wen Chang, Doina Precup
Abstract: Graph Neural Networks (GNNs) extend basic Neural Networks (NNs) by additionally making use of graph structure, based on the relational inductive bias (edge bias), rather than treating the nodes as collections of independent and identically distributed (i.i.d.) samples. Though GNNs are believed to outperform basic NNs in real-world tasks, it has been found that in some cases GNNs gain little performance or even underperform graph-agnostic NNs. To identify these cases, building on graph signal processing and statistical hypothesis testing, we propose two measures that analyze when the edge bias in features and labels does not provide advantages. Based on these measures, a threshold value can be given to predict the potential performance advantage of graph-aware models over graph-agnostic models.
Submitted 3 November, 2023; v1 submitted 30 October, 2022; originally announced October 2022.
Comments: Accepted by 12th International Conference on Complex Networks and Their Applications
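For context, the classical edge-homophily ratio that the paper's hypothesis-testing measures go beyond is simply the fraction of edges joining same-label endpoints:

```python
import numpy as np

def edge_homophily(edges, labels):
    """Fraction of edges whose two endpoints share a label; `edges` is a pair
    of (src, dst) index arrays and `labels` an array of node labels."""
    src, dst = edges
    return float(np.mean(labels[src] == labels[dst]))
```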