Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–31 of 31 results for author: <span class="mathjax">Chandak, Y</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Chandak%2C+Y">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Chandak, Y"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Chandak%2C+Y&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Chandak, Y"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.09975">arXiv:2407.09975</a> <span> [<a href="https://arxiv.org/pdf/2407.09975">pdf</a>, <a href="https://arxiv.org/format/2407.09975">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Applications">stat.AP</span> </div> </div> <p class="title is-5 mathjax"> The GPT Surprise: Offering Large Language Model Chat in a Massive Coding Class Reduced Engagement but Increased Adopters Exam Performances </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Nie%2C+A">Allen Nie</a>, <a href="/search/cs?searchtype=author&query=Chandak%2C+Y">Yash Chandak</a>, <a href="/search/cs?searchtype=author&query=Suzara%2C+M">Miroslav Suzara</a>, <a href="/search/cs?searchtype=author&query=Ali%2C+M">Malika Ali</a>, <a href="/search/cs?searchtype=author&query=Woodrow%2C+J">Juliette Woodrow</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+M">Matt Peng</a>, <a href="/search/cs?searchtype=author&query=Sahami%2C+M">Mehran Sahami</a>, <a href="/search/cs?searchtype=author&query=Brunskill%2C+E">Emma Brunskill</a>, <a href="/search/cs?searchtype=author&query=Piech%2C+C">Chris Piech</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.09975v1-abstract-short" style="display: inline;"> Large language models (LLMs) are quickly being adopted in a wide range of learning experiences, especially via ubiquitous and broadly accessible chat interfaces like ChatGPT and Copilot. This type of interface is readily available to students and teachers around the world, yet relatively little research has been done to assess the impact of such generic tools on student learning. Coding education… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.09975v1-abstract-full').style.display = 'inline'; document.getElementById('2407.09975v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.09975v1-abstract-full" style="display: none;"> Large language models (LLMs) are quickly being adopted in a wide range of learning experiences, especially via ubiquitous and broadly accessible chat interfaces like ChatGPT and Copilot. This type of interface is readily available to students and teachers around the world, yet relatively little research has been done to assess the impact of such generic tools on student learning. Coding education is an interesting test case, both because LLMs have strong performance on coding tasks, and because LLM-powered support tools are rapidly becoming part of the workflow of professional software engineers. To help understand the impact of generic LLM use on coding education, we conducted a large-scale randomized control trial with 5,831 students from 146 countries in an online coding class in which we provided some students with access to a chat interface with GPT-4. We estimate positive benefits on exam performance for adopters, the students who used the tool, but over all students, the advertisement of GPT-4 led to a significant average decrease in exam participation. We observe similar decreases in other forms of course engagement. However, this decrease is modulated by the student's country of origin. Offering access to LLMs to students from low human development index countries increased their exam participation rate on average. Our results suggest there may be promising benefits to using LLMs in an introductory coding class, but also potential harms for engagement, which makes their longer term impact on student success unclear. Our work highlights the need for additional investigations to help understand the potential impact of future adoption and integration of LLMs into classrooms. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.09975v1-abstract-full').style.display = 'none'; document.getElementById('2407.09975v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">32 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.03674">arXiv:2407.03674</a> <span> [<a href="https://arxiv.org/pdf/2407.03674">pdf</a>, <a href="https://arxiv.org/format/2407.03674">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Short-Long Policy Evaluation with Novel Actions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Nam%2C+H+A">Hyunji Alex Nam</a>, <a href="/search/cs?searchtype=author&query=Chandak%2C+Y">Yash Chandak</a>, <a href="/search/cs?searchtype=author&query=Brunskill%2C+E">Emma Brunskill</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.03674v2-abstract-short" style="display: inline;"> From incorporating LLMs in education, to identifying new drugs and improving ways to charge batteries, innovators constantly try new strategies in search of better long-term outcomes for students, patients and consumers. One major bottleneck in this innovation cycle is the amount of time it takes to observe the downstream effects of a decision policy that incorporates new interventions. The key qu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.03674v2-abstract-full').style.display = 'inline'; document.getElementById('2407.03674v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.03674v2-abstract-full" style="display: none;"> From incorporating LLMs in education, to identifying new drugs and improving ways to charge batteries, innovators constantly try new strategies in search of better long-term outcomes for students, patients and consumers. One major bottleneck in this innovation cycle is the amount of time it takes to observe the downstream effects of a decision policy that incorporates new interventions. The key question is whether we can quickly evaluate long-term outcomes of a new decision policy without making long-term observations. Organizations often have access to prior data about past decision policies and their outcomes, evaluated over the full horizon of interest. Motivated by this, we introduce a new setting for short-long policy evaluation for sequential decision making tasks. Our proposed methods significantly outperform prior results on simulators of HIV treatment, kidney dialysis and battery charging. We also demonstrate that our methods can be useful for applications in AI safety by quickly identifying when a new decision policy is likely to have substantially lower performance than past policies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.03674v2-abstract-full').style.display = 'none'; document.getElementById('2407.03674v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Added references for related work</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.19188">arXiv:2406.19188</a> <span> [<a href="https://arxiv.org/pdf/2406.19188">pdf</a>, <a href="https://arxiv.org/format/2406.19188">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Averaging log-likelihoods in direct alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Grinsztajn%2C+N">Nathan Grinsztajn</a>, <a href="/search/cs?searchtype=author&query=Flet-Berliac%2C+Y">Yannis Flet-Berliac</a>, <a href="/search/cs?searchtype=author&query=Azar%2C+M+G">Mohammad Gheshlaghi Azar</a>, <a href="/search/cs?searchtype=author&query=Strub%2C+F">Florian Strub</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+B">Bill Wu</a>, <a href="/search/cs?searchtype=author&query=Choi%2C+E">Eugene Choi</a>, <a href="/search/cs?searchtype=author&query=Cremer%2C+C">Chris Cremer</a>, <a href="/search/cs?searchtype=author&query=Ahmadian%2C+A">Arash Ahmadian</a>, <a href="/search/cs?searchtype=author&query=Chandak%2C+Y">Yash Chandak</a>, <a href="/search/cs?searchtype=author&query=Pietquin%2C+O">Olivier Pietquin</a>, <a href="/search/cs?searchtype=author&query=Geist%2C+M">Matthieu Geist</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.19188v1-abstract-short" style="display: inline;"> To better align Large Language Models (LLMs) with human judgment, Reinforcement Learning from Human Feedback (RLHF) learns a reward model and then optimizes it using regularized RL. Recently, direct alignment methods were introduced to learn such a fine-tuned model directly from a preference dataset without computing a proxy reward function. These methods are built upon contrastive losses involvin… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.19188v1-abstract-full').style.display = 'inline'; document.getElementById('2406.19188v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.19188v1-abstract-full" style="display: none;"> To better align Large Language Models (LLMs) with human judgment, Reinforcement Learning from Human Feedback (RLHF) learns a reward model and then optimizes it using regularized RL. Recently, direct alignment methods were introduced to learn such a fine-tuned model directly from a preference dataset without computing a proxy reward function. These methods are built upon contrastive losses involving the log-likelihood of (dis)preferred completions according to the trained model. However, completions have various lengths, and the log-likelihood is not length-invariant. On the other side, the cross-entropy loss used in supervised training is length-invariant, as batches are typically averaged token-wise. To reconcile these approaches, we introduce a principled approach for making direct alignment length-invariant. Formally, we introduce a new averaging operator, to be composed with the optimality operator giving the best policy for the underlying RL problem. It translates into averaging the log-likelihood within the loss. We empirically study the effect of such averaging, observing a trade-off between the length of generations and their scores. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.19188v1-abstract-full').style.display = 'none'; document.getElementById('2406.19188v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.19185">arXiv:2406.19185</a> <span> [<a href="https://arxiv.org/pdf/2406.19185">pdf</a>, <a href="https://arxiv.org/format/2406.19185">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Contrastive Policy Gradient: Aligning LLMs on sequence-level scores in a supervised-friendly fashion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Flet-Berliac%2C+Y">Yannis Flet-Berliac</a>, <a href="/search/cs?searchtype=author&query=Grinsztajn%2C+N">Nathan Grinsztajn</a>, <a href="/search/cs?searchtype=author&query=Strub%2C+F">Florian Strub</a>, <a href="/search/cs?searchtype=author&query=Choi%2C+E">Eugene Choi</a>, <a href="/search/cs?searchtype=author&query=Cremer%2C+C">Chris Cremer</a>, <a href="/search/cs?searchtype=author&query=Ahmadian%2C+A">Arash Ahmadian</a>, <a href="/search/cs?searchtype=author&query=Chandak%2C+Y">Yash Chandak</a>, <a href="/search/cs?searchtype=author&query=Azar%2C+M+G">Mohammad Gheshlaghi Azar</a>, <a href="/search/cs?searchtype=author&query=Pietquin%2C+O">Olivier Pietquin</a>, <a href="/search/cs?searchtype=author&query=Geist%2C+M">Matthieu Geist</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.19185v1-abstract-short" style="display: inline;"> Reinforcement Learning (RL) has been used to finetune Large Language Models (LLMs) using a reward model trained from preference data, to better align with human judgment. The recently introduced direct alignment methods, which are often simpler, more stable, and computationally lighter, can more directly achieve this. However, these approaches cannot optimize arbitrary rewards, and the preference-… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.19185v1-abstract-full').style.display = 'inline'; document.getElementById('2406.19185v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.19185v1-abstract-full" style="display: none;"> Reinforcement Learning (RL) has been used to finetune Large Language Models (LLMs) using a reward model trained from preference data, to better align with human judgment. The recently introduced direct alignment methods, which are often simpler, more stable, and computationally lighter, can more directly achieve this. However, these approaches cannot optimize arbitrary rewards, and the preference-based ones are not the only rewards of interest for LLMs (eg., unit tests for code generation or textual entailment for summarization, among others). RL-finetuning is usually done with a variation of policy gradient, which calls for on-policy or near-on-policy samples, requiring costly generations. We introduce Contrastive Policy Gradient, or CoPG, a simple and mathematically principled new RL algorithm that can estimate the optimal policy even from off-policy data. It can be seen as an off-policy policy gradient approach that does not rely on important sampling techniques and highlights the importance of using (the right) state baseline. We show this approach to generalize the direct alignment method IPO (identity preference optimization) and classic policy gradient. We experiment with the proposed CoPG on a toy bandit problem to illustrate its properties, as well as for finetuning LLMs on a summarization task, using a learned reward function considered as ground truth for the purpose of the experiments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.19185v1-abstract-full').style.display = 'none'; document.getElementById('2406.19185v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.17708">arXiv:2405.17708</a> <span> [<a href="https://arxiv.org/pdf/2405.17708">pdf</a>, <a href="https://arxiv.org/format/2405.17708">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> OPERA: Automatic Offline Policy Evaluation with Re-weighted Aggregates of Multiple Estimators </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Nie%2C+A">Allen Nie</a>, <a href="/search/cs?searchtype=author&query=Chandak%2C+Y">Yash Chandak</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+C+J">Christina J. Yuan</a>, <a href="/search/cs?searchtype=author&query=Badrinath%2C+A">Anirudhan Badrinath</a>, <a href="/search/cs?searchtype=author&query=Flet-Berliac%2C+Y">Yannis Flet-Berliac</a>, <a href="/search/cs?searchtype=author&query=Brunskil%2C+E">Emma Brunskil</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.17708v2-abstract-short" style="display: inline;"> Offline policy evaluation (OPE) allows us to evaluate and estimate a new sequential decision-making policy's performance by leveraging historical interaction data collected from other policies. Evaluating a new policy online without a confident estimate of its performance can lead to costly, unsafe, or hazardous outcomes, especially in education and healthcare. Several OPE estimators have been pro… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.17708v2-abstract-full').style.display = 'inline'; document.getElementById('2405.17708v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.17708v2-abstract-full" style="display: none;"> Offline policy evaluation (OPE) allows us to evaluate and estimate a new sequential decision-making policy's performance by leveraging historical interaction data collected from other policies. Evaluating a new policy online without a confident estimate of its performance can lead to costly, unsafe, or hazardous outcomes, especially in education and healthcare. Several OPE estimators have been proposed in the last decade, many of which have hyperparameters and require training. Unfortunately, choosing the best OPE algorithm for each task and domain is still unclear. In this paper, we propose a new algorithm that adaptively blends a set of OPE estimators given a dataset without relying on an explicit selection using a statistical procedure. We prove that our estimator is consistent and satisfies several desirable properties for policy evaluation. Additionally, we demonstrate that when compared to alternative approaches, our estimator can be used to select higher-performing policies in healthcare and robotics. Our work contributes to improving ease of use for a general-purpose, estimator-agnostic, off-policy evaluation framework for offline RL. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.17708v2-abstract-full').style.display = 'none'; document.getElementById('2405.17708v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">22 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.10547">arXiv:2404.10547</a> <span> [<a href="https://arxiv.org/pdf/2404.10547">pdf</a>, <a href="https://arxiv.org/format/2404.10547">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> A/B testing under Interference with Partial Network Information </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shankar%2C+S">Shiv Shankar</a>, <a href="/search/cs?searchtype=author&query=Sinha%2C+R">Ritwik Sinha</a>, <a href="/search/cs?searchtype=author&query=Chandak%2C+Y">Yash Chandak</a>, <a href="/search/cs?searchtype=author&query=Mitra%2C+S">Saayan Mitra</a>, <a href="/search/cs?searchtype=author&query=Fiterau%2C+M">Madalina Fiterau</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.10547v1-abstract-short" style="display: inline;"> A/B tests are often required to be conducted on subjects that might have social connections. For e.g., experiments on social media, or medical and social interventions to control the spread of an epidemic. In such settings, the SUTVA assumption for randomized-controlled trials is violated due to network interference, or spill-over effects, as treatments to group A can potentially also affect the c… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.10547v1-abstract-full').style.display = 'inline'; document.getElementById('2404.10547v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.10547v1-abstract-full" style="display: none;"> A/B tests are often required to be conducted on subjects that might have social connections. For e.g., experiments on social media, or medical and social interventions to control the spread of an epidemic. In such settings, the SUTVA assumption for randomized-controlled trials is violated due to network interference, or spill-over effects, as treatments to group A can potentially also affect the control group B. When the underlying social network is known exactly, prior works have demonstrated how to conduct A/B tests adequately to estimate the global average treatment effect (GATE). However, in practice, it is often impossible to obtain knowledge about the exact underlying network. In this paper, we present UNITE: a novel estimator that relax this assumption and can identify GATE while only relying on knowledge of the superset of neighbors for any subject in the graph. Through theoretical analysis and extensive experiments, we show that the proposed approach performs better in comparison to standard estimators. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.10547v1-abstract-full').style.display = 'none'; document.getElementById('2404.10547v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">AISTATS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.02438">arXiv:2312.02438</a> <span> [<a href="https://arxiv.org/pdf/2312.02438">pdf</a>, <a href="https://arxiv.org/format/2312.02438">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Adaptive Instrument Design for Indirect Experiments </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chandak%2C+Y">Yash Chandak</a>, <a href="/search/cs?searchtype=author&query=Shankar%2C+S">Shiv Shankar</a>, <a href="/search/cs?searchtype=author&query=Syrgkanis%2C+V">Vasilis Syrgkanis</a>, <a href="/search/cs?searchtype=author&query=Brunskill%2C+E">Emma Brunskill</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.02438v1-abstract-short" style="display: inline;"> Indirect experiments provide a valuable framework for estimating treatment effects in situations where conducting randomized control trials (RCTs) is impractical or unethical. Unlike RCTs, indirect experiments estimate treatment effects by leveraging (conditional) instrumental variables, enabling estimation through encouragement and recommendation rather than strict treatment assignment. However,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.02438v1-abstract-full').style.display = 'inline'; document.getElementById('2312.02438v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.02438v1-abstract-full" style="display: none;"> Indirect experiments provide a valuable framework for estimating treatment effects in situations where conducting randomized control trials (RCTs) is impractical or unethical. Unlike RCTs, indirect experiments estimate treatment effects by leveraging (conditional) instrumental variables, enabling estimation through encouragement and recommendation rather than strict treatment assignment. However, the sample efficiency of such estimators depends not only on the inherent variability in outcomes but also on the varying compliance levels of users with the instrumental variables and the choice of estimator being used, especially when dealing with numerous instrumental variables. While adaptive experiment design has a rich literature for direct experiments, in this paper we take the initial steps towards enhancing sample efficiency for indirect experiments by adaptively designing a data collection policy over instrumental variables. Our main contribution is a practical computational procedure that utilizes influence functions to search for an optimal data collection policy, minimizing the mean-squared error of the desired (non-linear) estimator. Through experiments conducted in various domains inspired by real-world applications, we showcase how our method can significantly improve the sample efficiency of indirect experiments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.02438v1-abstract-full').style.display = 'none'; document.getElementById('2312.02438v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.19007">arXiv:2310.19007</a> <span> [<a href="https://arxiv.org/pdf/2310.19007">pdf</a>, <a href="https://arxiv.org/format/2310.19007">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Behavior Alignment via Reward Function Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gupta%2C+D">Dhawal Gupta</a>, <a href="/search/cs?searchtype=author&query=Chandak%2C+Y">Yash Chandak</a>, <a href="/search/cs?searchtype=author&query=Jordan%2C+S+M">Scott M. Jordan</a>, <a href="/search/cs?searchtype=author&query=Thomas%2C+P+S">Philip S. Thomas</a>, <a href="/search/cs?searchtype=author&query=da+Silva%2C+B+C">Bruno Castro da Silva</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.19007v2-abstract-short" style="display: inline;"> Designing reward functions for efficiently guiding reinforcement learning (RL) agents toward specific behaviors is a complex task. This is challenging since it requires the identification of reward structures that are not sparse and that avoid inadvertently inducing undesirable behaviors. Naively modifying the reward structure to offer denser and more frequent feedback can lead to unintended outco… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.19007v2-abstract-full').style.display = 'inline'; document.getElementById('2310.19007v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.19007v2-abstract-full" style="display: none;"> Designing reward functions for efficiently guiding reinforcement learning (RL) agents toward specific behaviors is a complex task. This is challenging since it requires the identification of reward structures that are not sparse and that avoid inadvertently inducing undesirable behaviors. Naively modifying the reward structure to offer denser and more frequent feedback can lead to unintended outcomes and promote behaviors that are not aligned with the designer's intended goal. Although potential-based reward shaping is often suggested as a remedy, we systematically investigate settings where deploying it often significantly impairs performance. To address these issues, we introduce a new framework that uses a bi-level objective to learn \emph{behavior alignment reward functions}. These functions integrate auxiliary rewards reflecting a designer's heuristics and domain knowledge with the environment's primary rewards. Our approach automatically determines the most effective way to blend these types of feedback, thereby enhancing robustness against heuristic reward misspecification. Remarkably, it can also adapt an agent's policy optimization process to mitigate suboptimalities resulting from limitations and biases inherent in the underlying RL algorithms. We evaluate our method's efficacy on a diverse set of tasks, from small-scale experiments to high-dimensional control challenges. We investigate heuristic auxiliary rewards of varying quality -- some of which are beneficial and others detrimental to the learning process. Our results show that our framework offers a robust and principled way to integrate designer-specified heuristics. It not only addresses key shortcomings of existing approaches but also consistently leads to high-performing solutions, even when given misaligned or poorly-specified auxiliary reward functions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.19007v2-abstract-full').style.display = 'none'; document.getElementById('2310.19007v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">(Spotlight) Thirty-seventh Conference on Neural Information Processing Systems (NeurIPS 2023)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.14892">arXiv:2306.14892</a> <span> [<a href="https://arxiv.org/pdf/2306.14892">pdf</a>, <a href="https://arxiv.org/format/2306.14892">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Supervised Pretraining Can Learn In-Context Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lee%2C+J+N">Jonathan N. Lee</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+A">Annie Xie</a>, <a href="/search/cs?searchtype=author&query=Pacchiano%2C+A">Aldo Pacchiano</a>, <a href="/search/cs?searchtype=author&query=Chandak%2C+Y">Yash Chandak</a>, <a href="/search/cs?searchtype=author&query=Finn%2C+C">Chelsea Finn</a>, <a href="/search/cs?searchtype=author&query=Nachum%2C+O">Ofir Nachum</a>, <a href="/search/cs?searchtype=author&query=Brunskill%2C+E">Emma Brunskill</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.14892v1-abstract-short" style="display: inline;"> Large transformer models trained on diverse datasets have shown a remarkable ability to learn in-context, achieving high few-shot performance on tasks they were not explicitly trained to solve. In this paper, we study the in-context learning capabilities of transformers in decision-making problems, i.e., reinforcement learning (RL) for bandits and Markov decision processes. To do so, we introduce… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.14892v1-abstract-full').style.display = 'inline'; document.getElementById('2306.14892v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.14892v1-abstract-full" style="display: none;"> Large transformer models trained on diverse datasets have shown a remarkable ability to learn in-context, achieving high few-shot performance on tasks they were not explicitly trained to solve. In this paper, we study the in-context learning capabilities of transformers in decision-making problems, i.e., reinforcement learning (RL) for bandits and Markov decision processes. To do so, we introduce and study Decision-Pretrained Transformer (DPT), a supervised pretraining method where the transformer predicts an optimal action given a query state and an in-context dataset of interactions, across a diverse set of tasks. This procedure, while simple, produces a model with several surprising capabilities. We find that the pretrained transformer can be used to solve a range of RL problems in-context, exhibiting both exploration online and conservatism offline, despite not being explicitly trained to do so. The model also generalizes beyond the pretraining distribution to new tasks and automatically adapts its decision-making strategies to unknown structure. Theoretically, we show DPT can be viewed as an efficient implementation of Bayesian posterior sampling, a provably sample-efficient RL algorithm. We further leverage this connection to provide guarantees on the regret of the in-context algorithm yielded by DPT, and prove that it can learn faster than algorithms used to generate the pretraining data. These results suggest a promising yet simple path towards instilling strong in-context decision-making abilities in transformers. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.14892v1-abstract-full').style.display = 'none'; document.getElementById('2306.14892v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.09838">arXiv:2305.09838</a> <span> [<a href="https://arxiv.org/pdf/2305.09838">pdf</a>, <a href="https://arxiv.org/format/2305.09838">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Coagent Networks: Generalized and Scaled </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kostas%2C+J+E">James E. Kostas</a>, <a href="/search/cs?searchtype=author&query=Jordan%2C+S+M">Scott M. Jordan</a>, <a href="/search/cs?searchtype=author&query=Chandak%2C+Y">Yash Chandak</a>, <a href="/search/cs?searchtype=author&query=Theocharous%2C+G">Georgios Theocharous</a>, <a href="/search/cs?searchtype=author&query=Gupta%2C+D">Dhawal Gupta</a>, <a href="/search/cs?searchtype=author&query=White%2C+M">Martha White</a>, <a href="/search/cs?searchtype=author&query=da+Silva%2C+B+C">Bruno Castro da Silva</a>, <a href="/search/cs?searchtype=author&query=Thomas%2C+P+S">Philip S. Thomas</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.09838v1-abstract-short" style="display: inline;"> Coagent networks for reinforcement learning (RL) [Thomas and Barto, 2011] provide a powerful and flexible framework for deriving principled learning rules for arbitrary stochastic neural networks. The coagent framework offers an alternative to backpropagation-based deep learning (BDL) that overcomes some of backpropagation's main limitations. For example, coagent networks can compute different par… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.09838v1-abstract-full').style.display = 'inline'; document.getElementById('2305.09838v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.09838v1-abstract-full" style="display: none;"> Coagent networks for reinforcement learning (RL) [Thomas and Barto, 2011] provide a powerful and flexible framework for deriving principled learning rules for arbitrary stochastic neural networks. The coagent framework offers an alternative to backpropagation-based deep learning (BDL) that overcomes some of backpropagation's main limitations. For example, coagent networks can compute different parts of the network \emph{asynchronously} (at different rates or at different times), can incorporate non-differentiable components that cannot be used with backpropagation, and can explore at levels higher than their action spaces (that is, they can be designed as hierarchical networks for exploration and/or temporal abstraction). However, the coagent framework is not just an alternative to BDL; the two approaches can be blended: BDL can be combined with coagent learning rules to create architectures with the advantages of both approaches. This work generalizes the coagent theory and learning rules provided by previous works; this generalization provides more flexibility for network architecture design within the coagent framework. This work also studies one of the chief disadvantages of coagent networks: high variance updates for networks that have many coagents and do not use backpropagation. We show that a coagent algorithm with a policy network that does not use backpropagation can scale to a challenging RL domain with a high-dimensional state and action space (the MuJoCo Ant environment), learning reasonable (although not state-of-the-art) policies. These contributions motivate and provide a more general theoretical foundation for future work that studies coagent networks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.09838v1-abstract-full').style.display = 'none'; document.getElementById('2305.09838v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.07839">arXiv:2305.07839</a> <span> [<a href="https://arxiv.org/pdf/2305.07839">pdf</a>, <a href="https://arxiv.org/format/2305.07839">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> The Geometry of Multilingual Language Models: An Equality Lens </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shah%2C+C">Cheril Shah</a>, <a href="/search/cs?searchtype=author&query=Chandak%2C+Y">Yashashree Chandak</a>, <a href="/search/cs?searchtype=author&query=Suri%2C+M">Manan Suri</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.07839v1-abstract-short" style="display: inline;"> Understanding the representations of different languages in multilingual language models is essential for comprehending their cross-lingual properties, predicting their performance on downstream tasks, and identifying any biases across languages. In our study, we analyze the geometry of three multilingual language models in Euclidean space and find that all languages are represented by unique geom… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.07839v1-abstract-full').style.display = 'inline'; document.getElementById('2305.07839v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.07839v1-abstract-full" style="display: none;"> Understanding the representations of different languages in multilingual language models is essential for comprehending their cross-lingual properties, predicting their performance on downstream tasks, and identifying any biases across languages. In our study, we analyze the geometry of three multilingual language models in Euclidean space and find that all languages are represented by unique geometries. Using a geometric separability index we find that although languages tend to be closer according to their linguistic family, they are almost separable with languages from other families. We also introduce a Cross-Lingual Similarity Index to measure the distance of languages with each other in the semantic space. Our findings indicate that the low-resource languages are not represented as good as high resource languages in any of the models <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.07839v1-abstract-full').style.display = 'none'; document.getElementById('2305.07839v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 6 figues, 1st ICLR TinyPapers</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.00654">arXiv:2305.00654</a> <span> [<a href="https://arxiv.org/pdf/2305.00654">pdf</a>, <a href="https://arxiv.org/format/2305.00654">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Representations and Exploration for Deep Reinforcement Learning using Singular Value Decomposition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chandak%2C+Y">Yash Chandak</a>, <a href="/search/cs?searchtype=author&query=Thakoor%2C+S">Shantanu Thakoor</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Z+D">Zhaohan Daniel Guo</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Y">Yunhao Tang</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">Remi Munos</a>, <a href="/search/cs?searchtype=author&query=Dabney%2C+W">Will Dabney</a>, <a href="/search/cs?searchtype=author&query=Borsa%2C+D+L">Diana L Borsa</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.00654v2-abstract-short" style="display: inline;"> Representation learning and exploration are among the key challenges for any deep reinforcement learning agent. In this work, we provide a singular value decomposition based method that can be used to obtain representations that preserve the underlying transition structure in the domain. Perhaps interestingly, we show that these representations also capture the relative frequency of state visitati… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.00654v2-abstract-full').style.display = 'inline'; document.getElementById('2305.00654v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.00654v2-abstract-full" style="display: none;"> Representation learning and exploration are among the key challenges for any deep reinforcement learning agent. In this work, we provide a singular value decomposition based method that can be used to obtain representations that preserve the underlying transition structure in the domain. Perhaps interestingly, we show that these representations also capture the relative frequency of state visitations, thereby providing an estimate for pseudo-counts for free. To scale this decomposition method to large-scale domains, we provide an algorithm that never requires building the transition matrix, can make use of deep networks, and also permits mini-batch training. Further, we draw inspiration from predictive state representations and extend our decomposition method to partially observable environments. With experiments on multi-task settings with partially observable domains, we show that the proposed method can not only learn useful representation on DM-Lab-30 environments (that have inputs involving language instructions, pixel images, and rewards, among others) but it can also be effective at hard exploration tasks in DM-Hard-8 environments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.00654v2-abstract-full').style.display = 'none'; document.getElementById('2305.00654v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at the 40th International Conference on Machine Learning (ICML 2023)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2302.11725">arXiv:2302.11725</a> <span> [<a href="https://arxiv.org/pdf/2302.11725">pdf</a>, <a href="https://arxiv.org/format/2302.11725">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Asymptotically Unbiased Off-Policy Policy Evaluation when Reusing Old Data in Nonstationary Environments </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+V">Vincent Liu</a>, <a href="/search/cs?searchtype=author&query=Chandak%2C+Y">Yash Chandak</a>, <a href="/search/cs?searchtype=author&query=Thomas%2C+P">Philip Thomas</a>, <a href="/search/cs?searchtype=author&query=White%2C+M">Martha White</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2302.11725v1-abstract-short" style="display: inline;"> In this work, we consider the off-policy policy evaluation problem for contextual bandits and finite horizon reinforcement learning in the nonstationary setting. Reusing old data is critical for policy evaluation, but existing estimators that reuse old data introduce large bias such that we can not obtain a valid confidence interval. Inspired from a related field called survey sampling, we introdu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.11725v1-abstract-full').style.display = 'inline'; document.getElementById('2302.11725v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2302.11725v1-abstract-full" style="display: none;"> In this work, we consider the off-policy policy evaluation problem for contextual bandits and finite horizon reinforcement learning in the nonstationary setting. Reusing old data is critical for policy evaluation, but existing estimators that reuse old data introduce large bias such that we can not obtain a valid confidence interval. Inspired from a related field called survey sampling, we introduce a variant of the doubly robust (DR) estimator, called the regression-assisted DR estimator, that can incorporate the past data without introducing a large bias. The estimator unifies several existing off-policy policy evaluation methods and improves on them with the use of auxiliary information and a regression approach. We prove that the new estimator is asymptotically unbiased, and provide a consistent variance estimator to a construct a large sample confidence interval. Finally, we empirically show that the new estimator improves estimation for the current and future policy values, and provides a tight and valid interval estimation in several nonstationary recommendation environments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.11725v1-abstract-full').style.display = 'none'; document.getElementById('2302.11725v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">AISTATS 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2302.03161">arXiv:2302.03161</a> <span> [<a href="https://arxiv.org/pdf/2302.03161">pdf</a>, <a href="https://arxiv.org/format/2302.03161">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Optimization using Parallel Gradient Evaluations on Multiple Parameters </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chandak%2C+Y">Yash Chandak</a>, <a href="/search/cs?searchtype=author&query=Shankar%2C+S">Shiv Shankar</a>, <a href="/search/cs?searchtype=author&query=Gandikota%2C+V">Venkata Gandikota</a>, <a href="/search/cs?searchtype=author&query=Thomas%2C+P+S">Philip S. Thomas</a>, <a href="/search/cs?searchtype=author&query=Mazumdar%2C+A">Arya Mazumdar</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2302.03161v1-abstract-short" style="display: inline;"> We propose a first-order method for convex optimization, where instead of being restricted to the gradient from a single parameter, gradients from multiple parameters can be used during each step of gradient descent. This setup is particularly useful when a few processors are available that can be used in parallel for optimization. Our method uses gradients from multiple parameters in synergy to u… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.03161v1-abstract-full').style.display = 'inline'; document.getElementById('2302.03161v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2302.03161v1-abstract-full" style="display: none;"> We propose a first-order method for convex optimization, where instead of being restricted to the gradient from a single parameter, gradients from multiple parameters can be used during each step of gradient descent. This setup is particularly useful when a few processors are available that can be used in parallel for optimization. Our method uses gradients from multiple parameters in synergy to update these parameters together towards the optima. While doing so, it is ensured that the computational and memory complexity is of the same order as that of gradient descent. Empirical results demonstrate that even using gradients from as low as \textit{two} parameters, our method can often obtain significant acceleration and provide robustness to hyper-parameter settings. We remark that the primary goal of this work is less theoretical, and is instead aimed at exploring the understudied case of using multiple gradients during each step of optimization. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.03161v1-abstract-full').style.display = 'none'; document.getElementById('2302.03161v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at OPT workshop @ Thirty-sixth Conference on Neural Information Processing Systems (NeurIPS 2022)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2301.10330">arXiv:2301.10330</a> <span> [<a href="https://arxiv.org/pdf/2301.10330">pdf</a>, <a href="https://arxiv.org/format/2301.10330">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Off-Policy Evaluation for Action-Dependent Non-Stationary Environments </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chandak%2C+Y">Yash Chandak</a>, <a href="/search/cs?searchtype=author&query=Shankar%2C+S">Shiv Shankar</a>, <a href="/search/cs?searchtype=author&query=Bastian%2C+N+D">Nathaniel D. Bastian</a>, <a href="/search/cs?searchtype=author&query=da+Silva%2C+B+C">Bruno Castro da Silva</a>, <a href="/search/cs?searchtype=author&query=Brunskil%2C+E">Emma Brunskil</a>, <a href="/search/cs?searchtype=author&query=Thomas%2C+P+S">Philip S. Thomas</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2301.10330v1-abstract-short" style="display: inline;"> Methods for sequential decision-making are often built upon a foundational assumption that the underlying decision process is stationary. This limits the application of such methods because real-world problems are often subject to changes due to external factors (passive non-stationarity), changes induced by interactions with the system itself (active non-stationarity), or both (hybrid non-station… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.10330v1-abstract-full').style.display = 'inline'; document.getElementById('2301.10330v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2301.10330v1-abstract-full" style="display: none;"> Methods for sequential decision-making are often built upon a foundational assumption that the underlying decision process is stationary. This limits the application of such methods because real-world problems are often subject to changes due to external factors (passive non-stationarity), changes induced by interactions with the system itself (active non-stationarity), or both (hybrid non-stationarity). In this work, we take the first steps towards the fundamental challenge of on-policy and off-policy evaluation amidst structured changes due to active, passive, or hybrid non-stationarity. Towards this goal, we make a higher-order stationarity assumption such that non-stationarity results in changes over time, but the way changes happen is fixed. We propose, OPEN, an algorithm that uses a double application of counterfactual reasoning and a novel importance-weighted instrument-variable regression to obtain both a lower bias and a lower variance estimate of the structure in the changes of a policy's past performances. Finally, we show promising results on how OPEN can be used to predict future performances for several domains inspired by real-world applications that exhibit non-stationarity. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.10330v1-abstract-full').style.display = 'none'; document.getElementById('2301.10330v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 January, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at Thirty-sixth Conference on Neural Information Processing Systems (NeurIPS 2022)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2212.03319">arXiv:2212.03319</a> <span> [<a href="https://arxiv.org/pdf/2212.03319">pdf</a>, <a href="https://arxiv.org/format/2212.03319">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Understanding Self-Predictive Learning for Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tang%2C+Y">Yunhao Tang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Z+D">Zhaohan Daniel Guo</a>, <a href="/search/cs?searchtype=author&query=Richemond%2C+P+H">Pierre Harvey Richemond</a>, <a href="/search/cs?searchtype=author&query=Pires%2C+B+%C3%81">Bernardo 脕vila Pires</a>, <a href="/search/cs?searchtype=author&query=Chandak%2C+Y">Yash Chandak</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">R茅mi Munos</a>, <a href="/search/cs?searchtype=author&query=Rowland%2C+M">Mark Rowland</a>, <a href="/search/cs?searchtype=author&query=Azar%2C+M+G">Mohammad Gheshlaghi Azar</a>, <a href="/search/cs?searchtype=author&query=Lan%2C+C+L">Charline Le Lan</a>, <a href="/search/cs?searchtype=author&query=Lyle%2C+C">Clare Lyle</a>, <a href="/search/cs?searchtype=author&query=Gy%C3%B6rgy%2C+A">Andr谩s Gy枚rgy</a>, <a href="/search/cs?searchtype=author&query=Thakoor%2C+S">Shantanu Thakoor</a>, <a href="/search/cs?searchtype=author&query=Dabney%2C+W">Will Dabney</a>, <a href="/search/cs?searchtype=author&query=Piot%2C+B">Bilal Piot</a>, <a href="/search/cs?searchtype=author&query=Calandriello%2C+D">Daniele Calandriello</a>, <a href="/search/cs?searchtype=author&query=Valko%2C+M">Michal Valko</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2212.03319v1-abstract-short" style="display: inline;"> We study the learning dynamics of self-predictive learning for reinforcement learning, a family of algorithms that learn representations by minimizing the prediction error of their own future latent representations. Despite its recent empirical success, such algorithms have an apparent defect: trivial representations (such as constants) minimize the prediction error, yet it is obviously undesirabl… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.03319v1-abstract-full').style.display = 'inline'; document.getElementById('2212.03319v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2212.03319v1-abstract-full" style="display: none;"> We study the learning dynamics of self-predictive learning for reinforcement learning, a family of algorithms that learn representations by minimizing the prediction error of their own future latent representations. Despite its recent empirical success, such algorithms have an apparent defect: trivial representations (such as constants) minimize the prediction error, yet it is obviously undesirable to converge to such solutions. Our central insight is that careful designs of the optimization dynamics are critical to learning meaningful representations. We identify that a faster paced optimization of the predictor and semi-gradient updates on the representation, are crucial to preventing the representation collapse. Then in an idealized setup, we show self-predictive learning dynamics carries out spectral decomposition on the state transition matrix, effectively capturing information of the transition dynamics. Building on the theoretical insights, we propose bidirectional self-predictive learning, a novel self-predictive algorithm that learns two representations simultaneously. We examine the robustness of our theoretical insights with a number of small-scale experiments and showcase the promise of the novel representation learning algorithm with large-scale experiments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.03319v1-abstract-full').style.display = 'none'; document.getElementById('2212.03319v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2112.09169">arXiv:2112.09169</a> <span> [<a href="https://arxiv.org/pdf/2112.09169">pdf</a>, <a href="https://arxiv.org/format/2112.09169">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> On Optimizing Interventions in Shared Autonomy </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tan%2C+W">Weihao Tan</a>, <a href="/search/cs?searchtype=author&query=Koleczek%2C+D">David Koleczek</a>, <a href="/search/cs?searchtype=author&query=Pradhan%2C+S">Siddhant Pradhan</a>, <a href="/search/cs?searchtype=author&query=Perello%2C+N">Nicholas Perello</a>, <a href="/search/cs?searchtype=author&query=Chettiar%2C+V">Vivek Chettiar</a>, <a href="/search/cs?searchtype=author&query=Rohra%2C+V">Vishal Rohra</a>, <a href="/search/cs?searchtype=author&query=Rajaram%2C+A">Aaslesha Rajaram</a>, <a href="/search/cs?searchtype=author&query=Srinivasan%2C+S">Soundararajan Srinivasan</a>, <a href="/search/cs?searchtype=author&query=Hossain%2C+H+M+S">H M Sajjad Hossain</a>, <a href="/search/cs?searchtype=author&query=Chandak%2C+Y">Yash Chandak</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2112.09169v2-abstract-short" style="display: inline;"> Shared autonomy refers to approaches for enabling an autonomous agent to collaborate with a human with the aim of improving human performance. However, besides improving performance, it may often also be beneficial that the agent concurrently accounts for preserving the user's experience or satisfaction of collaboration. In order to address this additional goal, we examine approaches for improving… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2112.09169v2-abstract-full').style.display = 'inline'; document.getElementById('2112.09169v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2112.09169v2-abstract-full" style="display: none;"> Shared autonomy refers to approaches for enabling an autonomous agent to collaborate with a human with the aim of improving human performance. However, besides improving performance, it may often also be beneficial that the agent concurrently accounts for preserving the user's experience or satisfaction of collaboration. In order to address this additional goal, we examine approaches for improving the user experience by constraining the number of interventions by the autonomous agent. We propose two model-free reinforcement learning methods that can account for both hard and soft constraints on the number of interventions. We show that not only does our method outperform the existing baseline, but also eliminates the need to manually tune a black-box hyperparameter for controlling the level of assistance. We also provide an in-depth analysis of intervention scenarios in order to further illuminate system understanding. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2112.09169v2-abstract-full').style.display = 'none'; document.getElementById('2112.09169v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 December, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 December, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by AAAI2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2111.03936">arXiv:2111.03936</a> <span> [<a href="https://arxiv.org/pdf/2111.03936">pdf</a>, <a href="https://arxiv.org/format/2111.03936">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> SOPE: Spectrum of Off-Policy Estimators </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yuan%2C+C+J">Christina J. Yuan</a>, <a href="/search/cs?searchtype=author&query=Chandak%2C+Y">Yash Chandak</a>, <a href="/search/cs?searchtype=author&query=Giguere%2C+S">Stephen Giguere</a>, <a href="/search/cs?searchtype=author&query=Thomas%2C+P+S">Philip S. Thomas</a>, <a href="/search/cs?searchtype=author&query=Niekum%2C+S">Scott Niekum</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2111.03936v3-abstract-short" style="display: inline;"> Many sequential decision making problems are high-stakes and require off-policy evaluation (OPE) of a new policy using historical data collected using some other policy. One of the most common OPE techniques that provides unbiased estimates is trajectory based importance sampling (IS). However, due to the high variance of trajectory IS estimates, importance sampling methods based on state-action v… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.03936v3-abstract-full').style.display = 'inline'; document.getElementById('2111.03936v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2111.03936v3-abstract-full" style="display: none;"> Many sequential decision making problems are high-stakes and require off-policy evaluation (OPE) of a new policy using historical data collected using some other policy. One of the most common OPE techniques that provides unbiased estimates is trajectory based importance sampling (IS). However, due to the high variance of trajectory IS estimates, importance sampling methods based on state-action visitation distributions (SIS) have recently been adopted. Unfortunately, while SIS often provides lower variance estimates for long horizons, estimating the state-action distribution ratios can be challenging and lead to biased estimates. In this paper, we present a new perspective on this bias-variance trade-off and show the existence of a spectrum of estimators whose endpoints are SIS and IS. Additionally, we also establish a spectrum for doubly-robust and weighted version of these estimators. We provide empirical evidence that estimators in this spectrum can be used to trade-off between the bias and variance of IS and SIS and can achieve lower mean-squared error than both IS and SIS. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.03936v3-abstract-full').style.display = 'none'; document.getElementById('2111.03936v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 December, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 November, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at Thirty-fifth Conference on Neural Information Processing Systems (NeurIPS 2021)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2104.12820">arXiv:2104.12820</a> <span> [<a href="https://arxiv.org/pdf/2104.12820">pdf</a>, <a href="https://arxiv.org/format/2104.12820">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Universal Off-Policy Evaluation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chandak%2C+Y">Yash Chandak</a>, <a href="/search/cs?searchtype=author&query=Niekum%2C+S">Scott Niekum</a>, <a href="/search/cs?searchtype=author&query=da+Silva%2C+B+C">Bruno Castro da Silva</a>, <a href="/search/cs?searchtype=author&query=Learned-Miller%2C+E">Erik Learned-Miller</a>, <a href="/search/cs?searchtype=author&query=Brunskill%2C+E">Emma Brunskill</a>, <a href="/search/cs?searchtype=author&query=Thomas%2C+P+S">Philip S. Thomas</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2104.12820v2-abstract-short" style="display: inline;"> When faced with sequential decision-making problems, it is often useful to be able to predict what would happen if decisions were made using a new policy. Those predictions must often be based on data collected under some previously used decision-making rule. Many previous methods enable such off-policy (or counterfactual) estimation of the expected value of a performance measure called the return… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.12820v2-abstract-full').style.display = 'inline'; document.getElementById('2104.12820v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2104.12820v2-abstract-full" style="display: none;"> When faced with sequential decision-making problems, it is often useful to be able to predict what would happen if decisions were made using a new policy. Those predictions must often be based on data collected under some previously used decision-making rule. Many previous methods enable such off-policy (or counterfactual) estimation of the expected value of a performance measure called the return. In this paper, we take the first steps towards a universal off-policy estimator (UnO) -- one that provides off-policy estimates and high-confidence bounds for any parameter of the return distribution. We use UnO for estimating and simultaneously bounding the mean, variance, quantiles/median, inter-quantile range, CVaR, and the entire cumulative distribution of returns. Finally, we also discuss Uno's applicability in various settings, including fully observable, partially observable (i.e., with unobserved confounders), Markovian, non-Markovian, stationary, smoothly non-stationary, and discrete distribution shifts. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.12820v2-abstract-full').style.display = 'none'; document.getElementById('2104.12820v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 November, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 April, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at Thirty-fifth Conference on Neural Information Processing Systems (NeurIPS 2021)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2101.09847">arXiv:2101.09847</a> <span> [<a href="https://arxiv.org/pdf/2101.09847">pdf</a>, <a href="https://arxiv.org/format/2101.09847">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> High-Confidence Off-Policy (or Counterfactual) Variance Estimation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chandak%2C+Y">Yash Chandak</a>, <a href="/search/cs?searchtype=author&query=Shankar%2C+S">Shiv Shankar</a>, <a href="/search/cs?searchtype=author&query=Thomas%2C+P+S">Philip S. Thomas</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2101.09847v1-abstract-short" style="display: inline;"> Many sequential decision-making systems leverage data collected using prior policies to propose a new policy. For critical applications, it is important that high-confidence guarantees on the new policy's behavior are provided before deployment, to ensure that the policy will behave as desired. Prior works have studied high-confidence off-policy estimation of the expected return, however, high-con… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2101.09847v1-abstract-full').style.display = 'inline'; document.getElementById('2101.09847v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2101.09847v1-abstract-full" style="display: none;"> Many sequential decision-making systems leverage data collected using prior policies to propose a new policy. For critical applications, it is important that high-confidence guarantees on the new policy's behavior are provided before deployment, to ensure that the policy will behave as desired. Prior works have studied high-confidence off-policy estimation of the expected return, however, high-confidence off-policy estimation of the variance of returns can be equally critical for high-risk applications. In this paper, we tackle the previously open problem of estimating and bounding, with high confidence, the variance of returns from off-policy data <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2101.09847v1-abstract-full').style.display = 'none'; document.getElementById('2101.09847v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 January, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Thirty-fifth AAAI Conference on Artificial Intelligence (AAAI 2021)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2010.12645">arXiv:2010.12645</a> <span> [<a href="https://arxiv.org/pdf/2010.12645">pdf</a>, <a href="https://arxiv.org/format/2010.12645">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Towards Safe Policy Improvement for Non-Stationary MDPs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chandak%2C+Y">Yash Chandak</a>, <a href="/search/cs?searchtype=author&query=Jordan%2C+S+M">Scott M. Jordan</a>, <a href="/search/cs?searchtype=author&query=Theocharous%2C+G">Georgios Theocharous</a>, <a href="/search/cs?searchtype=author&query=White%2C+M">Martha White</a>, <a href="/search/cs?searchtype=author&query=Thomas%2C+P+S">Philip S. Thomas</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2010.12645v2-abstract-short" style="display: inline;"> Many real-world sequential decision-making problems involve critical systems with financial risks and human-life risks. While several works in the past have proposed methods that are safe for deployment, they assume that the underlying problem is stationary. However, many real-world problems of interest exhibit non-stationarity, and when stakes are high, the cost associated with a false stationari… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2010.12645v2-abstract-full').style.display = 'inline'; document.getElementById('2010.12645v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2010.12645v2-abstract-full" style="display: none;"> Many real-world sequential decision-making problems involve critical systems with financial risks and human-life risks. While several works in the past have proposed methods that are safe for deployment, they assume that the underlying problem is stationary. However, many real-world problems of interest exhibit non-stationarity, and when stakes are high, the cost associated with a false stationarity assumption may be unacceptable. We take the first steps towards ensuring safety, with high confidence, for smoothly-varying non-stationary decision problems. Our proposed method extends a type of safe algorithm, called a Seldonian algorithm, through a synthesis of model-free reinforcement learning with time-series analysis. Safety is ensured using sequential hypothesis testing of a policy's forecasted performance, and confidence intervals are obtained using wild bootstrap. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2010.12645v2-abstract-full').style.display = 'none'; document.getElementById('2010.12645v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 December, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 October, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Thirty-fourth Conference on Neural Information Processing Systems (NeurIPS 2020)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2009.07346">arXiv:2009.07346</a> <span> [<a href="https://arxiv.org/pdf/2009.07346">pdf</a>, <a href="https://arxiv.org/format/2009.07346">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Reinforcement Learning for Strategic Recommendations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Theocharous%2C+G">Georgios Theocharous</a>, <a href="/search/cs?searchtype=author&query=Chandak%2C+Y">Yash Chandak</a>, <a href="/search/cs?searchtype=author&query=Thomas%2C+P+S">Philip S. Thomas</a>, <a href="/search/cs?searchtype=author&query=de+Nijs%2C+F">Frits de Nijs</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2009.07346v1-abstract-short" style="display: inline;"> Strategic recommendations (SR) refer to the problem where an intelligent agent observes the sequential behaviors and activities of users and decides when and how to interact with them to optimize some long-term objectives, both for the user and the business. These systems are in their infancy in the industry and in need of practical solutions to some fundamental research challenges. At Adobe resea… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2009.07346v1-abstract-full').style.display = 'inline'; document.getElementById('2009.07346v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2009.07346v1-abstract-full" style="display: none;"> Strategic recommendations (SR) refer to the problem where an intelligent agent observes the sequential behaviors and activities of users and decides when and how to interact with them to optimize some long-term objectives, both for the user and the business. These systems are in their infancy in the industry and in need of practical solutions to some fundamental research challenges. At Adobe research, we have been implementing such systems for various use-cases, including points of interest recommendations, tutorial recommendations, next step guidance in multi-media editing software, and ad recommendation for optimizing lifetime value. There are many research challenges when building these systems, such as modeling the sequential behavior of users, deciding when to intervene and offer recommendations without annoying the user, evaluating policies offline with high confidence, safe deployment, non-stationarity, building systems from passive data that do not contain past recommendations, resource constraint optimization in multi-user systems, scaling to large and dynamic actions spaces, and handling and incorporating human cognitive biases. In this paper we cover various use-cases and research challenges we solved to make these systems practical. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2009.07346v1-abstract-full').style.display = 'none'; document.getElementById('2009.07346v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 September, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2020. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2006.16958">arXiv:2006.16958</a> <span> [<a href="https://arxiv.org/pdf/2006.16958">pdf</a>, <a href="https://arxiv.org/format/2006.16958">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Evaluating the Performance of Reinforcement Learning Algorithms </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jordan%2C+S+M">Scott M. Jordan</a>, <a href="/search/cs?searchtype=author&query=Chandak%2C+Y">Yash Chandak</a>, <a href="/search/cs?searchtype=author&query=Cohen%2C+D">Daniel Cohen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mengxue Zhang</a>, <a href="/search/cs?searchtype=author&query=Thomas%2C+P+S">Philip S. Thomas</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2006.16958v2-abstract-short" style="display: inline;"> Performance evaluations are critical for quantifying algorithmic advances in reinforcement learning. Recent reproducibility analyses have shown that reported performance results are often inconsistent and difficult to replicate. In this work, we argue that the inconsistency of performance stems from the use of flawed evaluation metrics. Taking a step towards ensuring that reported results are cons… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2006.16958v2-abstract-full').style.display = 'inline'; document.getElementById('2006.16958v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2006.16958v2-abstract-full" style="display: none;"> Performance evaluations are critical for quantifying algorithmic advances in reinforcement learning. Recent reproducibility analyses have shown that reported performance results are often inconsistent and difficult to replicate. In this work, we argue that the inconsistency of performance stems from the use of flawed evaluation metrics. Taking a step towards ensuring that reported results are consistent, we propose a new comprehensive evaluation methodology for reinforcement learning algorithms that produces reliable measurements of performance both on a single environment and when aggregated across environments. We demonstrate this method by evaluating a broad class of reinforcement learning algorithms on standard benchmark tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2006.16958v2-abstract-full').style.display = 'none'; document.getElementById('2006.16958v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 August, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 June, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">30 pages, 9 figures, Thirty-seventh International Conference on Machine Learning (ICML 2020)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2005.08158">arXiv:2005.08158</a> <span> [<a href="https://arxiv.org/pdf/2005.08158">pdf</a>, <a href="https://arxiv.org/format/2005.08158">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Optimizing for the Future in Non-Stationary MDPs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chandak%2C+Y">Yash Chandak</a>, <a href="/search/cs?searchtype=author&query=Theocharous%2C+G">Georgios Theocharous</a>, <a href="/search/cs?searchtype=author&query=Shankar%2C+S">Shiv Shankar</a>, <a href="/search/cs?searchtype=author&query=White%2C+M">Martha White</a>, <a href="/search/cs?searchtype=author&query=Mahadevan%2C+S">Sridhar Mahadevan</a>, <a href="/search/cs?searchtype=author&query=Thomas%2C+P+S">Philip S. Thomas</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2005.08158v4-abstract-short" style="display: inline;"> Most reinforcement learning methods are based upon the key assumption that the transition dynamics and reward functions are fixed, that is, the underlying Markov decision process is stationary. However, in many real-world applications, this assumption is violated, and using existing algorithms may result in a performance lag. To proactively search for a good future policy, we present a policy grad… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2005.08158v4-abstract-full').style.display = 'inline'; document.getElementById('2005.08158v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2005.08158v4-abstract-full" style="display: none;"> Most reinforcement learning methods are based upon the key assumption that the transition dynamics and reward functions are fixed, that is, the underlying Markov decision process is stationary. However, in many real-world applications, this assumption is violated, and using existing algorithms may result in a performance lag. To proactively search for a good future policy, we present a policy gradient algorithm that maximizes a forecast of future performance. This forecast is obtained by fitting a curve to the counter-factual estimates of policy performance over time, without explicitly modeling the underlying non-stationarity. The resulting algorithm amounts to a non-uniform reweighting of past data, and we observe that minimizing performance over some of the data from past episodes can be beneficial when searching for a policy that maximizes future performance. We show that our algorithm, called Prognosticator, is more robust to non-stationarity than two online adaptation techniques, on three simulated problems motivated by real-world applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2005.08158v4-abstract-full').style.display = 'none'; document.getElementById('2005.08158v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 September, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 May, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Thirty-seventh International Conference on Machine Learning (ICML 2020)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1906.03063">arXiv:1906.03063</a> <span> [<a href="https://arxiv.org/pdf/1906.03063">pdf</a>, <a href="https://arxiv.org/ps/1906.03063">ps</a>, <a href="https://arxiv.org/format/1906.03063">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Classical Policy Gradient: Preserving Bellman's Principle of Optimality </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Thomas%2C+P+S">Philip S. Thomas</a>, <a href="/search/cs?searchtype=author&query=Jordan%2C+S+M">Scott M. Jordan</a>, <a href="/search/cs?searchtype=author&query=Chandak%2C+Y">Yash Chandak</a>, <a href="/search/cs?searchtype=author&query=Nota%2C+C">Chris Nota</a>, <a href="/search/cs?searchtype=author&query=Kostas%2C+J">James Kostas</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1906.03063v1-abstract-short" style="display: inline;"> We propose a new objective function for finite-horizon episodic Markov decision processes that better captures Bellman's principle of optimality, and provide an expression for the gradient of the objective. </span> <span class="abstract-full has-text-grey-dark mathjax" id="1906.03063v1-abstract-full" style="display: none;"> We propose a new objective function for finite-horizon episodic Markov decision processes that better captures Bellman's principle of optimality, and provide an expression for the gradient of the objective. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1906.03063v1-abstract-full').style.display = 'none'; document.getElementById('1906.03063v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 June, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">1 page, 0 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1906.01772">arXiv:1906.01772</a> <span> [<a href="https://arxiv.org/pdf/1906.01772">pdf</a>, <a href="https://arxiv.org/format/1906.01772">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Reinforcement Learning When All Actions are Not Always Available </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chandak%2C+Y">Yash Chandak</a>, <a href="/search/cs?searchtype=author&query=Theocharous%2C+G">Georgios Theocharous</a>, <a href="/search/cs?searchtype=author&query=Metevier%2C+B">Blossom Metevier</a>, <a href="/search/cs?searchtype=author&query=Thomas%2C+P+S">Philip S. Thomas</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1906.01772v2-abstract-short" style="display: inline;"> The Markov decision process (MDP) formulation used to model many real-world sequential decision making problems does not efficiently capture the setting where the set of available decisions (actions) at each time step is stochastic. Recently, the stochastic action set Markov decision process (SAS-MDP) formulation has been proposed, which better captures the concept of a stochastic action set. In t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1906.01772v2-abstract-full').style.display = 'inline'; document.getElementById('1906.01772v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1906.01772v2-abstract-full" style="display: none;"> The Markov decision process (MDP) formulation used to model many real-world sequential decision making problems does not efficiently capture the setting where the set of available decisions (actions) at each time step is stochastic. Recently, the stochastic action set Markov decision process (SAS-MDP) formulation has been proposed, which better captures the concept of a stochastic action set. In this paper we argue that existing RL algorithms for SAS-MDPs can suffer from potential divergence issues, and present new policy gradient algorithms for SAS-MDPs that incorporate variance reduction techniques unique to this setting, and provide conditions for their convergence. We conclude with experiments that demonstrate the practicality of our approaches on tasks inspired by real-life use cases wherein the action set is stochastic. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1906.01772v2-abstract-full').style.display = 'none'; document.getElementById('1906.01772v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 January, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 June, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Thirty-fourth Conference on Artificial Intelligence (AAAI 2020)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1906.01770">arXiv:1906.01770</a> <span> [<a href="https://arxiv.org/pdf/1906.01770">pdf</a>, <a href="https://arxiv.org/format/1906.01770">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Lifelong Learning with a Changing Action Set </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chandak%2C+Y">Yash Chandak</a>, <a href="/search/cs?searchtype=author&query=Theocharous%2C+G">Georgios Theocharous</a>, <a href="/search/cs?searchtype=author&query=Nota%2C+C">Chris Nota</a>, <a href="/search/cs?searchtype=author&query=Thomas%2C+P+S">Philip S. Thomas</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1906.01770v3-abstract-short" style="display: inline;"> In many real-world sequential decision making problems, the number of available actions (decisions) can vary over time. While problems like catastrophic forgetting, changing transition dynamics, changing rewards functions, etc. have been well-studied in the lifelong learning literature, the setting where the action set changes remains unaddressed. In this paper, we present an algorithm that autono… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1906.01770v3-abstract-full').style.display = 'inline'; document.getElementById('1906.01770v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1906.01770v3-abstract-full" style="display: none;"> In many real-world sequential decision making problems, the number of available actions (decisions) can vary over time. While problems like catastrophic forgetting, changing transition dynamics, changing rewards functions, etc. have been well-studied in the lifelong learning literature, the setting where the action set changes remains unaddressed. In this paper, we present an algorithm that autonomously adapts to an action set whose size changes over time. To tackle this open problem, we break it into two problems that can be solved iteratively: inferring the underlying, unknown, structure in the space of actions and optimizing a policy that leverages this structure. We demonstrate the efficiency of this approach on large-scale real-world lifelong learning problems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1906.01770v3-abstract-full').style.display = 'none'; document.getElementById('1906.01770v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 May, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 June, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Thirty-fourth Conference on Artificial Intelligence (AAAI 2020) [Outstanding Student Paper Honorable Mention. ]</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1902.00183">arXiv:1902.00183</a> <span> [<a href="https://arxiv.org/pdf/1902.00183">pdf</a>, <a href="https://arxiv.org/format/1902.00183">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Learning Action Representations for Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chandak%2C+Y">Yash Chandak</a>, <a href="/search/cs?searchtype=author&query=Theocharous%2C+G">Georgios Theocharous</a>, <a href="/search/cs?searchtype=author&query=Kostas%2C+J">James Kostas</a>, <a href="/search/cs?searchtype=author&query=Jordan%2C+S">Scott Jordan</a>, <a href="/search/cs?searchtype=author&query=Thomas%2C+P+S">Philip S. Thomas</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1902.00183v2-abstract-short" style="display: inline;"> Most model-free reinforcement learning methods leverage state representations (embeddings) for generalization, but either ignore structure in the space of actions or assume the structure is provided a priori. We show how a policy can be decomposed into a component that acts in a low-dimensional space of action representations and a component that transforms these representations into actual action… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1902.00183v2-abstract-full').style.display = 'inline'; document.getElementById('1902.00183v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1902.00183v2-abstract-full" style="display: none;"> Most model-free reinforcement learning methods leverage state representations (embeddings) for generalization, but either ignore structure in the space of actions or assume the structure is provided a priori. We show how a policy can be decomposed into a component that acts in a low-dimensional space of action representations and a component that transforms these representations into actual actions. These representations improve generalization over large, finite action sets by allowing the agent to infer the outcomes of actions similar to actions already taken. We provide an algorithm to both learn and use action representations and provide conditions for its convergence. The efficacy of the proposed method is demonstrated on large-scale real-world problems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1902.00183v2-abstract-full').style.display = 'none'; document.getElementById('1902.00183v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 May, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 January, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">In Proceedings of the 36th International Conference on Machine Learning (ICML 2019)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1805.12528">arXiv:1805.12528</a> <span> [<a href="https://arxiv.org/pdf/1805.12528">pdf</a>, <a href="https://arxiv.org/format/1805.12528">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Fusion Graph Convolutional Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Vijayan%2C+P">Priyesh Vijayan</a>, <a href="/search/cs?searchtype=author&query=Chandak%2C+Y">Yash Chandak</a>, <a href="/search/cs?searchtype=author&query=Khapra%2C+M+M">Mitesh M. Khapra</a>, <a href="/search/cs?searchtype=author&query=Parthasarathy%2C+S">Srinivasan Parthasarathy</a>, <a href="/search/cs?searchtype=author&query=Ravindran%2C+B">Balaraman Ravindran</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1805.12528v5-abstract-short" style="display: inline;"> Semi-supervised node classification in attributed graphs, i.e., graphs with node features, involves learning to classify unlabeled nodes given a partially labeled graph. Label predictions are made by jointly modeling the node and its' neighborhood features. State-of-the-art models for node classification on such attributed graphs use differentiable recursive functions that enable aggregation and f… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1805.12528v5-abstract-full').style.display = 'inline'; document.getElementById('1805.12528v5-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1805.12528v5-abstract-full" style="display: none;"> Semi-supervised node classification in attributed graphs, i.e., graphs with node features, involves learning to classify unlabeled nodes given a partially labeled graph. Label predictions are made by jointly modeling the node and its' neighborhood features. State-of-the-art models for node classification on such attributed graphs use differentiable recursive functions that enable aggregation and filtering of neighborhood information from multiple hops. In this work, we analyze the representation capacity of these models to regulate information from multiple hops independently. From our analysis, we conclude that these models despite being powerful, have limited representation capacity to capture multi-hop neighborhood information effectively. Further, we also propose a mathematically motivated, yet simple extension to existing graph convolutional networks (GCNs) which has improved representation capacity. We extensively evaluate the proposed model, F-GCN on eight popular datasets from different domains. F-GCN outperforms the state-of-the-art models for semi-supervised learning on six datasets while being extremely competitive on the other two. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1805.12528v5-abstract-full').style.display = 'none'; document.getElementById('1805.12528v5-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 September, 2018; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 May, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2018. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.6 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1805.12421">arXiv:1805.12421</a> <span> [<a href="https://arxiv.org/pdf/1805.12421">pdf</a>, <a href="https://arxiv.org/format/1805.12421">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> HOPF: Higher Order Propagation Framework for Deep Collective Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Vijayan%2C+P">Priyesh Vijayan</a>, <a href="/search/cs?searchtype=author&query=Chandak%2C+Y">Yash Chandak</a>, <a href="/search/cs?searchtype=author&query=Khapra%2C+M+M">Mitesh M. Khapra</a>, <a href="/search/cs?searchtype=author&query=Parthasarathy%2C+S">Srinivasan Parthasarathy</a>, <a href="/search/cs?searchtype=author&query=Ravindran%2C+B">Balaraman Ravindran</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1805.12421v6-abstract-short" style="display: inline;"> Given a graph where every node has certain attributes associated with it and some nodes have labels associated with them, Collective Classification (CC) is the task of assigning labels to every unlabeled node using information from the node as well as its neighbors. It is often the case that a node is not only influenced by its immediate neighbors but also by higher order neighbors, multiple hops… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1805.12421v6-abstract-full').style.display = 'inline'; document.getElementById('1805.12421v6-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1805.12421v6-abstract-full" style="display: none;"> Given a graph where every node has certain attributes associated with it and some nodes have labels associated with them, Collective Classification (CC) is the task of assigning labels to every unlabeled node using information from the node as well as its neighbors. It is often the case that a node is not only influenced by its immediate neighbors but also by higher order neighbors, multiple hops away. Recent state-of-the-art models for CC learn end-to-end differentiable variations of Weisfeiler-Lehman (WL) kernels to aggregate multi-hop neighborhood information. In this work, we propose a Higher Order Propagation Framework, HOPF, which provides an iterative inference mechanism for these powerful differentiable kernels. Such a combination of classical iterative inference mechanism with recent differentiable kernels allows the framework to learn graph convolutional filters that simultaneously exploit the attribute and label information available in the neighborhood. Further, these iterative differentiable kernels can scale to larger hops beyond the memory limitations of existing differentiable kernels. We also show that existing WL kernel-based models suffer from the problem of Node Information Morphing where the information of the node is morphed or overwhelmed by the information of its neighbors when considering multiple hops. To address this, we propose a specific instantiation of HOPF, called the NIP models, which preserves the node information at every propagation step. The iterative formulation of NIP models further helps in incorporating distant hop information concisely as summaries of the inferred labels. We do an extensive evaluation across 11 datasets from different domains. We show that existing CC models do not provide consistent performance across datasets, while the proposed NIP model with iterative inference is more robust. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1805.12421v6-abstract-full').style.display = 'none'; document.getElementById('1805.12421v6-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 November, 2018; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 May, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2018. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.6 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1509.07543">arXiv:1509.07543</a> <span> [<a href="https://arxiv.org/pdf/1509.07543">pdf</a>, <a href="https://arxiv.org/format/1509.07543">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> On Optimizing Human-Machine Task Assignments </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Veit%2C+A">Andreas Veit</a>, <a href="/search/cs?searchtype=author&query=Wilber%2C+M">Michael Wilber</a>, <a href="/search/cs?searchtype=author&query=Vaish%2C+R">Rajan Vaish</a>, <a href="/search/cs?searchtype=author&query=Belongie%2C+S">Serge Belongie</a>, <a href="/search/cs?searchtype=author&query=Davis%2C+J">James Davis</a>, <a href="/search/cs?searchtype=author&query=Anand%2C+V">Vishal Anand</a>, <a href="/search/cs?searchtype=author&query=Aviral%2C+A">Anshu Aviral</a>, <a href="/search/cs?searchtype=author&query=Chakrabarty%2C+P">Prithvijit Chakrabarty</a>, <a href="/search/cs?searchtype=author&query=Chandak%2C+Y">Yash Chandak</a>, <a href="/search/cs?searchtype=author&query=Chaturvedi%2C+S">Sidharth Chaturvedi</a>, <a href="/search/cs?searchtype=author&query=Devaraj%2C+C">Chinmaya Devaraj</a>, <a href="/search/cs?searchtype=author&query=Dhall%2C+A">Ankit Dhall</a>, <a href="/search/cs?searchtype=author&query=Dwivedi%2C+U">Utkarsh Dwivedi</a>, <a href="/search/cs?searchtype=author&query=Gupte%2C+S">Sanket Gupte</a>, <a href="/search/cs?searchtype=author&query=Sridhar%2C+S+N">Sharath N. Sridhar</a>, <a href="/search/cs?searchtype=author&query=Paga%2C+K">Karthik Paga</a>, <a href="/search/cs?searchtype=author&query=Pahuja%2C+A">Anuj Pahuja</a>, <a href="/search/cs?searchtype=author&query=Raisinghani%2C+A">Aditya Raisinghani</a>, <a href="/search/cs?searchtype=author&query=Sharma%2C+A">Ayush Sharma</a>, <a href="/search/cs?searchtype=author&query=Sharma%2C+S">Shweta Sharma</a>, <a href="/search/cs?searchtype=author&query=Sinha%2C+D">Darpana Sinha</a>, <a href="/search/cs?searchtype=author&query=Thakkar%2C+N">Nisarg Thakkar</a>, <a href="/search/cs?searchtype=author&query=Vignesh%2C+K+B">K. Bala Vignesh</a>, <a href="/search/cs?searchtype=author&query=Verma%2C+U">Utkarsh Verma</a>, <a href="/search/cs?searchtype=author&query=Abhishek%2C+K">Kanniganti Abhishek</a> , et al. (26 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1509.07543v1-abstract-short" style="display: inline;"> When crowdsourcing systems are used in combination with machine inference systems in the real world, they benefit the most when the machine system is deeply integrated with the crowd workers. However, if researchers wish to integrate the crowd with "off-the-shelf" machine classifiers, this deep integration is not always possible. This work explores two strategies to increase accuracy and decrease… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1509.07543v1-abstract-full').style.display = 'inline'; document.getElementById('1509.07543v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1509.07543v1-abstract-full" style="display: none;"> When crowdsourcing systems are used in combination with machine inference systems in the real world, they benefit the most when the machine system is deeply integrated with the crowd workers. However, if researchers wish to integrate the crowd with "off-the-shelf" machine classifiers, this deep integration is not always possible. This work explores two strategies to increase accuracy and decrease cost under this setting. First, we show that reordering tasks presented to the human can create a significant accuracy improvement. Further, we show that greedily choosing parameters to maximize machine accuracy is sub-optimal, and joint optimization of the combined system improves performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1509.07543v1-abstract-full').style.display = 'none'; document.getElementById('1509.07543v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 September, 2015; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2015. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">HCOMP 2015 Work in Progress</span> </p> </li> </ol> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository