Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–19 of 19 results for author: <span class="mathjax">Emmons, S</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Emmons%2C+S">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Emmons, S"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Emmons%2C+S&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Emmons, S"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.17797">arXiv:2412.17797</a> <span> [<a href="https://arxiv.org/pdf/2412.17797">pdf</a>, <a href="https://arxiv.org/format/2412.17797">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> Observation Interference in Partially Observable Assistance Games </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Emmons%2C+S">Scott Emmons</a>, <a href="/search/cs?searchtype=author&query=Oesterheld%2C+C">Caspar Oesterheld</a>, <a href="/search/cs?searchtype=author&query=Conitzer%2C+V">Vincent Conitzer</a>, <a href="/search/cs?searchtype=author&query=Russell%2C+S">Stuart Russell</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.17797v1-abstract-short" style="display: inline;"> We study partially observable assistance games (POAGs), a model of the human-AI value alignment problem which allows the human and the AI assistant to have partial observations. Motivated by concerns of AI deception, we study a qualitatively new phenomenon made possible by partial observability: would an AI assistant ever have an incentive to interfere with the human's observations? First, we prov… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.17797v1-abstract-full').style.display = 'inline'; document.getElementById('2412.17797v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.17797v1-abstract-full" style="display: none;"> We study partially observable assistance games (POAGs), a model of the human-AI value alignment problem which allows the human and the AI assistant to have partial observations. Motivated by concerns of AI deception, we study a qualitatively new phenomenon made possible by partial observability: would an AI assistant ever have an incentive to interfere with the human's observations? First, we prove that sometimes an optimal assistant must take observation-interfering actions, even when the human is playing optimally, and even when there are otherwise-equivalent actions available that do not interfere with observations. Though this result seems to contradict the classic theorem from single-agent decision making that the value of perfect information is nonnegative, we resolve this seeming contradiction by developing a notion of interference defined on entire policies. This can be viewed as an extension of the classic result that the value of perfect information is nonnegative into the cooperative multiagent setting. Second, we prove that if the human is simply making decisions based on their immediate outcomes, the assistant might need to interfere with observations as a way to query the human's preferences. We show that this incentive for interference goes away if the human is playing optimally, or if we introduce a communication channel for the human to communicate their preferences to the assistant. Third, we show that if the human acts according to the Boltzmann model of irrationality, this can create an incentive for the assistant to interfere with observations. Finally, we use an experimental model to analyze tradeoffs faced by the AI assistant in practice when considering whether or not to take observation-interfering actions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.17797v1-abstract-full').style.display = 'none'; document.getElementById('2412.17797v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.09565">arXiv:2412.09565</a> <span> [<a href="https://arxiv.org/pdf/2412.09565">pdf</a>, <a href="https://arxiv.org/format/2412.09565">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Obfuscated Activations Bypass LLM Latent-Space Defenses </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bailey%2C+L">Luke Bailey</a>, <a href="/search/cs?searchtype=author&query=Serrano%2C+A">Alex Serrano</a>, <a href="/search/cs?searchtype=author&query=Sheshadri%2C+A">Abhay Sheshadri</a>, <a href="/search/cs?searchtype=author&query=Seleznyov%2C+M">Mikhail Seleznyov</a>, <a href="/search/cs?searchtype=author&query=Taylor%2C+J">Jordan Taylor</a>, <a href="/search/cs?searchtype=author&query=Jenner%2C+E">Erik Jenner</a>, <a href="/search/cs?searchtype=author&query=Hilton%2C+J">Jacob Hilton</a>, <a href="/search/cs?searchtype=author&query=Casper%2C+S">Stephen Casper</a>, <a href="/search/cs?searchtype=author&query=Guestrin%2C+C">Carlos Guestrin</a>, <a href="/search/cs?searchtype=author&query=Emmons%2C+S">Scott Emmons</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.09565v2-abstract-short" style="display: inline;"> Recent latent-space monitoring techniques have shown promise as defenses against LLM attacks. These defenses act as scanners that seek to detect harmful activations before they lead to undesirable actions. This prompts the question: Can models execute harmful behavior via inconspicuous latent states? Here, we study such obfuscated activations. We show that state-of-the-art latent-space defenses --… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.09565v2-abstract-full').style.display = 'inline'; document.getElementById('2412.09565v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.09565v2-abstract-full" style="display: none;"> Recent latent-space monitoring techniques have shown promise as defenses against LLM attacks. These defenses act as scanners that seek to detect harmful activations before they lead to undesirable actions. This prompts the question: Can models execute harmful behavior via inconspicuous latent states? Here, we study such obfuscated activations. We show that state-of-the-art latent-space defenses -- including sparse autoencoders, representation probing, and latent OOD detection -- are all vulnerable to obfuscated activations. For example, against probes trained to classify harmfulness, our attacks can often reduce recall from 100% to 0% while retaining a 90% jailbreaking rate. However, obfuscation has limits: we find that on a complex task (writing SQL code), obfuscation reduces model performance. Together, our results demonstrate that neural activations are highly malleable: we can reshape activation patterns in a variety of ways, often while preserving a network's behavior. This poses a fundamental challenge to latent-space defenses. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.09565v2-abstract-full').style.display = 'none'; document.getElementById('2412.09565v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://obfuscated-activations.github.io/ Code: https://github.com/LukeBailey181/obfuscated-activations</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17749">arXiv:2411.17749</a> <span> [<a href="https://arxiv.org/pdf/2411.17749">pdf</a>, <a href="https://arxiv.org/ps/2411.17749">ps</a>, <a href="https://arxiv.org/format/2411.17749">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> The Partially Observable Off-Switch Game </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Garber%2C+A">Andrew Garber</a>, <a href="/search/cs?searchtype=author&query=Subramani%2C+R">Rohan Subramani</a>, <a href="/search/cs?searchtype=author&query=Luu%2C+L">Linus Luu</a>, <a href="/search/cs?searchtype=author&query=Bedaywi%2C+M">Mark Bedaywi</a>, <a href="/search/cs?searchtype=author&query=Russell%2C+S">Stuart Russell</a>, <a href="/search/cs?searchtype=author&query=Emmons%2C+S">Scott Emmons</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17749v2-abstract-short" style="display: inline;"> A wide variety of goals could cause an AI to disable its off switch because "you can't fetch the coffee if you're dead" (Russell 2019). Prior theoretical work on this shutdown problem assumes that humans know everything that AIs do. In practice, however, humans have only limited information. Moreover, in many of the settings where the shutdown problem is most concerning, AIs might have vast amount… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17749v2-abstract-full').style.display = 'inline'; document.getElementById('2411.17749v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17749v2-abstract-full" style="display: none;"> A wide variety of goals could cause an AI to disable its off switch because "you can't fetch the coffee if you're dead" (Russell 2019). Prior theoretical work on this shutdown problem assumes that humans know everything that AIs do. In practice, however, humans have only limited information. Moreover, in many of the settings where the shutdown problem is most concerning, AIs might have vast amounts of private information. To capture these differences in knowledge, we introduce the Partially Observable Off-Switch Game (PO-OSG), a game-theoretic model of the shutdown problem with asymmetric information. Unlike when the human has full observability, we find that in optimal play, even AI agents assisting perfectly rational humans sometimes avoid shutdown. As expected, increasing the amount of communication or information available always increases (or leaves unchanged) the agents' expected common payoff. But counterintuitively, introducing bounded communication can make the AI defer to the human less in optimal play even though communication mitigates information asymmetry. In particular, communication sometimes enables new optimal behavior requiring strategic AI deference to achieve outcomes that were previously inaccessible. Thus, designing safe artificial agents in the presence of asymmetric information requires careful consideration of the tradeoffs between maximizing payoffs (potentially myopically) and maintaining AIs' incentives to defer to humans. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17749v2-abstract-full').style.display = 'none'; document.getElementById('2411.17749v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.15211">arXiv:2407.15211</a> <span> [<a href="https://arxiv.org/pdf/2407.15211">pdf</a>, <a href="https://arxiv.org/format/2407.15211">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Failures to Find Transferable Image Jailbreaks Between Vision-Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Schaeffer%2C+R">Rylan Schaeffer</a>, <a href="/search/cs?searchtype=author&query=Valentine%2C+D">Dan Valentine</a>, <a href="/search/cs?searchtype=author&query=Bailey%2C+L">Luke Bailey</a>, <a href="/search/cs?searchtype=author&query=Chua%2C+J">James Chua</a>, <a href="/search/cs?searchtype=author&query=Eyzaguirre%2C+C">Crist贸bal Eyzaguirre</a>, <a href="/search/cs?searchtype=author&query=Durante%2C+Z">Zane Durante</a>, <a href="/search/cs?searchtype=author&query=Benton%2C+J">Joe Benton</a>, <a href="/search/cs?searchtype=author&query=Miranda%2C+B">Brando Miranda</a>, <a href="/search/cs?searchtype=author&query=Sleight%2C+H">Henry Sleight</a>, <a href="/search/cs?searchtype=author&query=Hughes%2C+J">John Hughes</a>, <a href="/search/cs?searchtype=author&query=Agrawal%2C+R">Rajashree Agrawal</a>, <a href="/search/cs?searchtype=author&query=Sharma%2C+M">Mrinank Sharma</a>, <a href="/search/cs?searchtype=author&query=Emmons%2C+S">Scott Emmons</a>, <a href="/search/cs?searchtype=author&query=Koyejo%2C+S">Sanmi Koyejo</a>, <a href="/search/cs?searchtype=author&query=Perez%2C+E">Ethan Perez</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.15211v2-abstract-short" style="display: inline;"> The integration of new modalities into frontier AI systems offers exciting capabilities, but also increases the possibility such systems can be adversarially manipulated in undesirable ways. In this work, we focus on a popular class of vision-language models (VLMs) that generate text outputs conditioned on visual and textual inputs. We conducted a large-scale empirical study to assess the transfer… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.15211v2-abstract-full').style.display = 'inline'; document.getElementById('2407.15211v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.15211v2-abstract-full" style="display: none;"> The integration of new modalities into frontier AI systems offers exciting capabilities, but also increases the possibility such systems can be adversarially manipulated in undesirable ways. In this work, we focus on a popular class of vision-language models (VLMs) that generate text outputs conditioned on visual and textual inputs. We conducted a large-scale empirical study to assess the transferability of gradient-based universal image ``jailbreaks" using a diverse set of over 40 open-parameter VLMs, including 18 new VLMs that we publicly release. Overall, we find that transferable gradient-based image jailbreaks are extremely difficult to obtain. When an image jailbreak is optimized against a single VLM or against an ensemble of VLMs, the jailbreak successfully jailbreaks the attacked VLM(s), but exhibits little-to-no transfer to any other VLMs; transfer is not affected by whether the attacked and target VLMs possess matching vision backbones or language models, whether the language model underwent instruction-following and/or safety-alignment training, or many other factors. Only two settings display partially successful transfer: between identically-pretrained and identically-initialized VLMs with slightly different VLM training data, and between different training checkpoints of a single VLM. Leveraging these results, we then demonstrate that transfer can be significantly improved against a specific target VLM by attacking larger ensembles of ``highly-similar" VLMs. These results stand in stark contrast to existing evidence of universal and transferable text jailbreaks against language models and transferable adversarial attacks against image classifiers, suggesting that VLMs may be more robust to gradient-based transfer attacks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.15211v2-abstract-full').style.display = 'none'; document.getElementById('2407.15211v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024 Workshops: RBFM (Best Paper), Frontiers in AdvML (Oral), Red Teaming GenAI (Oral), SoLaR (Spotlight), SATA</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.00877">arXiv:2406.00877</a> <span> [<a href="https://arxiv.org/pdf/2406.00877">pdf</a>, <a href="https://arxiv.org/format/2406.00877">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Evidence of Learned Look-Ahead in a Chess-Playing Neural Network </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jenner%2C+E">Erik Jenner</a>, <a href="/search/cs?searchtype=author&query=Kapur%2C+S">Shreyas Kapur</a>, <a href="/search/cs?searchtype=author&query=Georgiev%2C+V">Vasil Georgiev</a>, <a href="/search/cs?searchtype=author&query=Allen%2C+C">Cameron Allen</a>, <a href="/search/cs?searchtype=author&query=Emmons%2C+S">Scott Emmons</a>, <a href="/search/cs?searchtype=author&query=Russell%2C+S">Stuart Russell</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.00877v1-abstract-short" style="display: inline;"> Do neural networks learn to implement algorithms such as look-ahead or search "in the wild"? Or do they rely purely on collections of simple heuristics? We present evidence of learned look-ahead in the policy network of Leela Chess Zero, the currently strongest neural chess engine. We find that Leela internally represents future optimal moves and that these representations are crucial for its fina… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.00877v1-abstract-full').style.display = 'inline'; document.getElementById('2406.00877v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.00877v1-abstract-full" style="display: none;"> Do neural networks learn to implement algorithms such as look-ahead or search "in the wild"? Or do they rely purely on collections of simple heuristics? We present evidence of learned look-ahead in the policy network of Leela Chess Zero, the currently strongest neural chess engine. We find that Leela internally represents future optimal moves and that these representations are crucial for its final output in certain board states. Concretely, we exploit the fact that Leela is a transformer that treats every chessboard square like a token in language models, and give three lines of evidence (1) activations on certain squares of future moves are unusually important causally; (2) we find attention heads that move important information "forward and backward in time," e.g., from squares of future moves to squares of earlier ones; and (3) we train a simple probe that can predict the optimal move 2 turns ahead with 92% accuracy (in board states where Leela finds a single best line). These findings are an existence proof of learned look-ahead in neural networks and might be a step towards a better understanding of their capabilities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.00877v1-abstract-full').style.display = 'none'; document.getElementById('2406.00877v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://leela-interp.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.17747">arXiv:2402.17747</a> <span> [<a href="https://arxiv.org/pdf/2402.17747">pdf</a>, <a href="https://arxiv.org/format/2402.17747">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> When Your AIs Deceive You: Challenges of Partial Observability in Reinforcement Learning from Human Feedback </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lang%2C+L">Leon Lang</a>, <a href="/search/cs?searchtype=author&query=Foote%2C+D">Davis Foote</a>, <a href="/search/cs?searchtype=author&query=Russell%2C+S">Stuart Russell</a>, <a href="/search/cs?searchtype=author&query=Dragan%2C+A">Anca Dragan</a>, <a href="/search/cs?searchtype=author&query=Jenner%2C+E">Erik Jenner</a>, <a href="/search/cs?searchtype=author&query=Emmons%2C+S">Scott Emmons</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.17747v5-abstract-short" style="display: inline;"> Past analyses of reinforcement learning from human feedback (RLHF) assume that the human evaluators fully observe the environment. What happens when human feedback is based only on partial observations? We formally define two failure cases: deceptive inflation and overjustification. Modeling the human as Boltzmann-rational w.r.t. a belief over trajectories, we prove conditions under which RLHF is… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.17747v5-abstract-full').style.display = 'inline'; document.getElementById('2402.17747v5-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.17747v5-abstract-full" style="display: none;"> Past analyses of reinforcement learning from human feedback (RLHF) assume that the human evaluators fully observe the environment. What happens when human feedback is based only on partial observations? We formally define two failure cases: deceptive inflation and overjustification. Modeling the human as Boltzmann-rational w.r.t. a belief over trajectories, we prove conditions under which RLHF is guaranteed to result in policies that deceptively inflate their performance, overjustify their behavior to make an impression, or both. Under the new assumption that the human's partial observability is known and accounted for, we then analyze how much information the feedback process provides about the return function. We show that sometimes, the human's feedback determines the return function uniquely up to an additive constant, but in other realistic cases, there is irreducible ambiguity. We propose exploratory research directions to help tackle these challenges, experimentally validate both the theoretical concerns and potential mitigations, and caution against blindly applying RLHF in partially observable settings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.17747v5-abstract-full').style.display = 'none'; document.getElementById('2402.17747v5-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Advances in Neural Information Processing Systems 37 (NeurIPS 2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.11777">arXiv:2402.11777</a> <span> [<a href="https://arxiv.org/pdf/2402.11777">pdf</a>, <a href="https://arxiv.org/format/2402.11777">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Uncovering Latent Human Wellbeing in Language Model Embeddings </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Freire%2C+P">Pedro Freire</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+C">ChengCheng Tan</a>, <a href="/search/cs?searchtype=author&query=Gleave%2C+A">Adam Gleave</a>, <a href="/search/cs?searchtype=author&query=Hendrycks%2C+D">Dan Hendrycks</a>, <a href="/search/cs?searchtype=author&query=Emmons%2C+S">Scott Emmons</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.11777v1-abstract-short" style="display: inline;"> Do language models implicitly learn a concept of human wellbeing? We explore this through the ETHICS Utilitarianism task, assessing if scaling enhances pretrained models' representations. Our initial finding reveals that, without any prompt engineering or finetuning, the leading principal component from OpenAI's text-embedding-ada-002 achieves 73.9% accuracy. This closely matches the 74.6% of BERT… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.11777v1-abstract-full').style.display = 'inline'; document.getElementById('2402.11777v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.11777v1-abstract-full" style="display: none;"> Do language models implicitly learn a concept of human wellbeing? We explore this through the ETHICS Utilitarianism task, assessing if scaling enhances pretrained models' representations. Our initial finding reveals that, without any prompt engineering or finetuning, the leading principal component from OpenAI's text-embedding-ada-002 achieves 73.9% accuracy. This closely matches the 74.6% of BERT-large finetuned on the entire ETHICS dataset, suggesting pretraining conveys some understanding about human wellbeing. Next, we consider four language model families, observing how Utilitarianism accuracy varies with increased parameters. We find performance is nondecreasing with increased model size when using sufficient numbers of principal components. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.11777v1-abstract-full').style.display = 'none'; document.getElementById('2402.11777v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 5 figures, 1 table</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.10260">arXiv:2402.10260</a> <span> [<a href="https://arxiv.org/pdf/2402.10260">pdf</a>, <a href="https://arxiv.org/format/2402.10260">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> A StrongREJECT for Empty Jailbreaks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Souly%2C+A">Alexandra Souly</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Q">Qingyuan Lu</a>, <a href="/search/cs?searchtype=author&query=Bowen%2C+D">Dillon Bowen</a>, <a href="/search/cs?searchtype=author&query=Trinh%2C+T">Tu Trinh</a>, <a href="/search/cs?searchtype=author&query=Hsieh%2C+E">Elvis Hsieh</a>, <a href="/search/cs?searchtype=author&query=Pandey%2C+S">Sana Pandey</a>, <a href="/search/cs?searchtype=author&query=Abbeel%2C+P">Pieter Abbeel</a>, <a href="/search/cs?searchtype=author&query=Svegliato%2C+J">Justin Svegliato</a>, <a href="/search/cs?searchtype=author&query=Emmons%2C+S">Scott Emmons</a>, <a href="/search/cs?searchtype=author&query=Watkins%2C+O">Olivia Watkins</a>, <a href="/search/cs?searchtype=author&query=Toyer%2C+S">Sam Toyer</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.10260v2-abstract-short" style="display: inline;"> Most jailbreak papers claim the jailbreaks they propose are highly effective, often boasting near-100% attack success rates. However, it is perhaps more common than not for jailbreak developers to substantially exaggerate the effectiveness of their jailbreaks. We suggest this problem arises because jailbreak researchers lack a standard, high-quality benchmark for evaluating jailbreak performance,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.10260v2-abstract-full').style.display = 'inline'; document.getElementById('2402.10260v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.10260v2-abstract-full" style="display: none;"> Most jailbreak papers claim the jailbreaks they propose are highly effective, often boasting near-100% attack success rates. However, it is perhaps more common than not for jailbreak developers to substantially exaggerate the effectiveness of their jailbreaks. We suggest this problem arises because jailbreak researchers lack a standard, high-quality benchmark for evaluating jailbreak performance, leaving researchers to create their own. To create a benchmark, researchers must choose a dataset of forbidden prompts to which a victim model will respond, along with an evaluation method that scores the harmfulness of the victim model's responses. We show that existing benchmarks suffer from significant shortcomings and introduce the StrongREJECT benchmark to address these issues. StrongREJECT's dataset contains prompts that victim models must answer with specific, harmful information, while its automated evaluator measures the extent to which a response gives useful information to forbidden prompts. In doing so, the StrongREJECT evaluator achieves state-of-the-art agreement with human judgments of jailbreak effectiveness. Notably, we find that existing evaluation methods significantly overstate jailbreak effectiveness compared to human judgments and the StrongREJECT evaluator. We describe a surprising and novel phenomenon that explains this discrepancy: jailbreaks bypassing a victim model's safety fine-tuning tend to reduce its capabilities. Together, our findings underscore the need for researchers to use a high-quality benchmark, such as StrongREJECT, when developing new jailbreak attacks. We release the StrongREJECT code and data at https://strong-reject.readthedocs.io/en/latest/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.10260v2-abstract-full').style.display = 'none'; document.getElementById('2402.10260v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Code and data at https://strong-reject.readthedocs.io/en/latest/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.12747">arXiv:2312.12747</a> <span> [<a href="https://arxiv.org/pdf/2312.12747">pdf</a>, <a href="https://arxiv.org/format/2312.12747">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> ALMANACS: A Simulatability Benchmark for Language Model Explainability </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mills%2C+E">Edmund Mills</a>, <a href="/search/cs?searchtype=author&query=Su%2C+S">Shiye Su</a>, <a href="/search/cs?searchtype=author&query=Russell%2C+S">Stuart Russell</a>, <a href="/search/cs?searchtype=author&query=Emmons%2C+S">Scott Emmons</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.12747v2-abstract-short" style="display: inline;"> How do we measure the efficacy of language model explainability methods? While many explainability methods have been developed, they are typically evaluated on bespoke tasks, preventing an apples-to-apples comparison. To help fill this gap, we present ALMANACS, a language model explainability benchmark. ALMANACS scores explainability methods on simulatability, i.e., how well the explanations impro… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.12747v2-abstract-full').style.display = 'inline'; document.getElementById('2312.12747v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.12747v2-abstract-full" style="display: none;"> How do we measure the efficacy of language model explainability methods? While many explainability methods have been developed, they are typically evaluated on bespoke tasks, preventing an apples-to-apples comparison. To help fill this gap, we present ALMANACS, a language model explainability benchmark. ALMANACS scores explainability methods on simulatability, i.e., how well the explanations improve behavior prediction on new inputs. The ALMANACS scenarios span twelve safety-relevant topics such as ethical reasoning and advanced AI behaviors; they have idiosyncratic premises to invoke model-specific behavior; and they have a train-test distributional shift to encourage faithful explanations. By using another language model to predict behavior based on the explanations, ALMANACS is a fully automated benchmark. While not a replacement for human evaluations, we aim for ALMANACS to be a complementary, automated tool that allows for fast, scalable evaluation. Using ALMANACS, we evaluate counterfactual, rationalization, attention, and Integrated Gradients explanations. Our results are sobering: when averaged across all topics, no explanation method outperforms the explanation-free control. We conclude that despite modest successes in prior work, developing an explanation method that aids simulatability in ALMANACS remains an open challenge. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.12747v2-abstract-full').style.display = 'none'; document.getElementById('2312.12747v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Code is available at https://github.com/edmundmills/ALMANACS}{https://github.com/edmundmills/ALMANACS</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.00236">arXiv:2309.00236</a> <span> [<a href="https://arxiv.org/pdf/2309.00236">pdf</a>, <a href="https://arxiv.org/format/2309.00236">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Image Hijacks: Adversarial Images can Control Generative Models at Runtime </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bailey%2C+L">Luke Bailey</a>, <a href="/search/cs?searchtype=author&query=Ong%2C+E">Euan Ong</a>, <a href="/search/cs?searchtype=author&query=Russell%2C+S">Stuart Russell</a>, <a href="/search/cs?searchtype=author&query=Emmons%2C+S">Scott Emmons</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.00236v4-abstract-short" style="display: inline;"> Are foundation models secure against malicious actors? In this work, we focus on the image input to a vision-language model (VLM). We discover image hijacks, adversarial images that control the behaviour of VLMs at inference time, and introduce the general Behaviour Matching algorithm for training image hijacks. From this, we derive the Prompt Matching method, allowing us to train hijacks matching… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.00236v4-abstract-full').style.display = 'inline'; document.getElementById('2309.00236v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.00236v4-abstract-full" style="display: none;"> Are foundation models secure against malicious actors? In this work, we focus on the image input to a vision-language model (VLM). We discover image hijacks, adversarial images that control the behaviour of VLMs at inference time, and introduce the general Behaviour Matching algorithm for training image hijacks. From this, we derive the Prompt Matching method, allowing us to train hijacks matching the behaviour of an arbitrary user-defined text prompt (e.g. 'the Eiffel Tower is now located in Rome') using a generic, off-the-shelf dataset unrelated to our choice of prompt. We use Behaviour Matching to craft hijacks for four types of attack, forcing VLMs to generate outputs of the adversary's choice, leak information from their context window, override their safety training, and believe false statements. We study these attacks against LLaVA, a state-of-the-art VLM based on CLIP and LLaMA-2, and find that all attack types achieve a success rate of over 80%. Moreover, our attacks are automated and require only small image perturbations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.00236v4-abstract-full').style.display = 'none'; document.getElementById('2309.00236v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page at https://image-hijacks.github.io</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2304.03279">arXiv:2304.03279</a> <span> [<a href="https://arxiv.org/pdf/2304.03279">pdf</a>, <a href="https://arxiv.org/format/2304.03279">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> Do the Rewards Justify the Means? Measuring Trade-Offs Between Rewards and Ethical Behavior in the MACHIAVELLI Benchmark </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Pan%2C+A">Alexander Pan</a>, <a href="/search/cs?searchtype=author&query=Chan%2C+J+S">Jun Shern Chan</a>, <a href="/search/cs?searchtype=author&query=Zou%2C+A">Andy Zou</a>, <a href="/search/cs?searchtype=author&query=Li%2C+N">Nathaniel Li</a>, <a href="/search/cs?searchtype=author&query=Basart%2C+S">Steven Basart</a>, <a href="/search/cs?searchtype=author&query=Woodside%2C+T">Thomas Woodside</a>, <a href="/search/cs?searchtype=author&query=Ng%2C+J">Jonathan Ng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hanlin Zhang</a>, <a href="/search/cs?searchtype=author&query=Emmons%2C+S">Scott Emmons</a>, <a href="/search/cs?searchtype=author&query=Hendrycks%2C+D">Dan Hendrycks</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2304.03279v4-abstract-short" style="display: inline;"> Artificial agents have traditionally been trained to maximize reward, which may incentivize power-seeking and deception, analogous to how next-token prediction in language models (LMs) may incentivize toxicity. So do agents naturally learn to be Machiavellian? And how do we measure these behaviors in general-purpose models such as GPT-4? Towards answering these questions, we introduce MACHIAVELLI,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.03279v4-abstract-full').style.display = 'inline'; document.getElementById('2304.03279v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2304.03279v4-abstract-full" style="display: none;"> Artificial agents have traditionally been trained to maximize reward, which may incentivize power-seeking and deception, analogous to how next-token prediction in language models (LMs) may incentivize toxicity. So do agents naturally learn to be Machiavellian? And how do we measure these behaviors in general-purpose models such as GPT-4? Towards answering these questions, we introduce MACHIAVELLI, a benchmark of 134 Choose-Your-Own-Adventure games containing over half a million rich, diverse scenarios that center on social decision-making. Scenario labeling is automated with LMs, which are more performant than human annotators. We mathematize dozens of harmful behaviors and use our annotations to evaluate agents' tendencies to be power-seeking, cause disutility, and commit ethical violations. We observe some tension between maximizing reward and behaving ethically. To improve this trade-off, we investigate LM-based methods to steer agents' towards less harmful behaviors. Our results show that agents can both act competently and morally, so concrete progress can currently be made in machine ethics--designing agents that are Pareto improvements in both safety and capabilities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.03279v4-abstract-full').style.display = 'none'; document.getElementById('2304.03279v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICML 2023 Oral (camera-ready); 31 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.11972">arXiv:2211.11972</a> <span> [<a href="https://arxiv.org/pdf/2211.11972">pdf</a>, <a href="https://arxiv.org/format/2211.11972">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> imitation: Clean Imitation Learning Implementations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gleave%2C+A">Adam Gleave</a>, <a href="/search/cs?searchtype=author&query=Taufeeque%2C+M">Mohammad Taufeeque</a>, <a href="/search/cs?searchtype=author&query=Rocamonde%2C+J">Juan Rocamonde</a>, <a href="/search/cs?searchtype=author&query=Jenner%2C+E">Erik Jenner</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S+H">Steven H. Wang</a>, <a href="/search/cs?searchtype=author&query=Toyer%2C+S">Sam Toyer</a>, <a href="/search/cs?searchtype=author&query=Ernestus%2C+M">Maximilian Ernestus</a>, <a href="/search/cs?searchtype=author&query=Belrose%2C+N">Nora Belrose</a>, <a href="/search/cs?searchtype=author&query=Emmons%2C+S">Scott Emmons</a>, <a href="/search/cs?searchtype=author&query=Russell%2C+S">Stuart Russell</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.11972v1-abstract-short" style="display: inline;"> imitation provides open-source implementations of imitation and reward learning algorithms in PyTorch. We include three inverse reinforcement learning (IRL) algorithms, three imitation learning algorithms and a preference comparison algorithm. The implementations have been benchmarked against previous results, and automated tests cover 98% of the code. Moreover, the algorithms are implemented in a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.11972v1-abstract-full').style.display = 'inline'; document.getElementById('2211.11972v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.11972v1-abstract-full" style="display: none;"> imitation provides open-source implementations of imitation and reward learning algorithms in PyTorch. We include three inverse reinforcement learning (IRL) algorithms, three imitation learning algorithms and a preference comparison algorithm. The implementations have been benchmarked against previous results, and automated tests cover 98% of the code. Moreover, the algorithms are implemented in a modular fashion, making it simple to develop novel algorithms in the framework. Our source code, including documentation and examples, is available at https://github.com/HumanCompatibleAI/imitation <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.11972v1-abstract-full').style.display = 'none'; document.getElementById('2211.11972v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2207.03470">arXiv:2207.03470</a> <span> [<a href="https://arxiv.org/pdf/2207.03470">pdf</a>, <a href="https://arxiv.org/format/2207.03470">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> For Learning in Symmetric Teams, Local Optima are Global Nash Equilibria </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Emmons%2C+S">Scott Emmons</a>, <a href="/search/cs?searchtype=author&query=Oesterheld%2C+C">Caspar Oesterheld</a>, <a href="/search/cs?searchtype=author&query=Critch%2C+A">Andrew Critch</a>, <a href="/search/cs?searchtype=author&query=Conitzer%2C+V">Vincent Conitzer</a>, <a href="/search/cs?searchtype=author&query=Russell%2C+S">Stuart Russell</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2207.03470v1-abstract-short" style="display: inline;"> Although it has been known since the 1970s that a globally optimal strategy profile in a common-payoff game is a Nash equilibrium, global optimality is a strict requirement that limits the result's applicability. In this work, we show that any locally optimal symmetric strategy profile is also a (global) Nash equilibrium. Furthermore, we show that this result is robust to perturbations to the comm… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.03470v1-abstract-full').style.display = 'inline'; document.getElementById('2207.03470v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2207.03470v1-abstract-full" style="display: none;"> Although it has been known since the 1970s that a globally optimal strategy profile in a common-payoff game is a Nash equilibrium, global optimality is a strict requirement that limits the result's applicability. In this work, we show that any locally optimal symmetric strategy profile is also a (global) Nash equilibrium. Furthermore, we show that this result is robust to perturbations to the common payoff and to the local optimum. Applied to machine learning, our result provides a global guarantee for any gradient method that finds a local optimum in symmetric strategy space. While this result indicates stability to unilateral deviation, we nevertheless identify broad classes of games where mixed local optima are unstable under joint, asymmetric deviations. We analyze the prevalence of instability by running learning algorithms in a suite of symmetric games, and we conclude by discussing the applicability of our results to multi-agent RL, cooperative inverse RL, and decentralized POMDPs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.03470v1-abstract-full').style.display = 'none'; document.getElementById('2207.03470v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 July, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2205.07886">arXiv:2205.07886</a> <span> [<a href="https://arxiv.org/pdf/2205.07886">pdf</a>, <a href="https://arxiv.org/format/2205.07886">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> An Empirical Investigation of Representation Learning for Imitation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xin Chen</a>, <a href="/search/cs?searchtype=author&query=Toyer%2C+S">Sam Toyer</a>, <a href="/search/cs?searchtype=author&query=Wild%2C+C">Cody Wild</a>, <a href="/search/cs?searchtype=author&query=Emmons%2C+S">Scott Emmons</a>, <a href="/search/cs?searchtype=author&query=Fischer%2C+I">Ian Fischer</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+K">Kuang-Huei Lee</a>, <a href="/search/cs?searchtype=author&query=Alex%2C+N">Neel Alex</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S+H">Steven H Wang</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+P">Ping Luo</a>, <a href="/search/cs?searchtype=author&query=Russell%2C+S">Stuart Russell</a>, <a href="/search/cs?searchtype=author&query=Abbeel%2C+P">Pieter Abbeel</a>, <a href="/search/cs?searchtype=author&query=Shah%2C+R">Rohin Shah</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2205.07886v1-abstract-short" style="display: inline;"> Imitation learning often needs a large demonstration set in order to handle the full range of situations that an agent might find itself in during deployment. However, collecting expert demonstrations can be expensive. Recent work in vision, reinforcement learning, and NLP has shown that auxiliary representation learning objectives can reduce the need for large amounts of expensive, task-specific… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.07886v1-abstract-full').style.display = 'inline'; document.getElementById('2205.07886v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2205.07886v1-abstract-full" style="display: none;"> Imitation learning often needs a large demonstration set in order to handle the full range of situations that an agent might find itself in during deployment. However, collecting expert demonstrations can be expensive. Recent work in vision, reinforcement learning, and NLP has shown that auxiliary representation learning objectives can reduce the need for large amounts of expensive, task-specific data. Our Empirical Investigation of Representation Learning for Imitation (EIRLI) investigates whether similar benefits apply to imitation learning. We propose a modular framework for constructing representation learning algorithms, then use our framework to evaluate the utility of representation learning for imitation across several environment suites. In the settings we evaluate, we find that existing algorithms for image-based representation learning provide limited value relative to a well-tuned baseline with image augmentations. To explain this result, we investigate differences between imitation learning and other settings where representation learning has provided significant benefit, such as image classification. Finally, we release a well-documented codebase which both replicates our findings and provides a modular framework for creating new representation learning algorithms out of reusable components. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.07886v1-abstract-full').style.display = 'none'; document.getElementById('2205.07886v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to NeurIPS2021 Datasets and Benchmarks Track</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2112.10751">arXiv:2112.10751</a> <span> [<a href="https://arxiv.org/pdf/2112.10751">pdf</a>, <a href="https://arxiv.org/format/2112.10751">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> RvS: What is Essential for Offline RL via Supervised Learning? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Emmons%2C+S">Scott Emmons</a>, <a href="/search/cs?searchtype=author&query=Eysenbach%2C+B">Benjamin Eysenbach</a>, <a href="/search/cs?searchtype=author&query=Kostrikov%2C+I">Ilya Kostrikov</a>, <a href="/search/cs?searchtype=author&query=Levine%2C+S">Sergey Levine</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2112.10751v2-abstract-short" style="display: inline;"> Recent work has shown that supervised learning alone, without temporal difference (TD) learning, can be remarkably effective for offline RL. When does this hold true, and which algorithmic components are necessary? Through extensive experiments, we boil supervised learning for offline RL down to its essential elements. In every environment suite we consider, simply maximizing likelihood with a two… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2112.10751v2-abstract-full').style.display = 'inline'; document.getElementById('2112.10751v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2112.10751v2-abstract-full" style="display: none;"> Recent work has shown that supervised learning alone, without temporal difference (TD) learning, can be remarkably effective for offline RL. When does this hold true, and which algorithmic components are necessary? Through extensive experiments, we boil supervised learning for offline RL down to its essential elements. In every environment suite we consider, simply maximizing likelihood with a two-layer feedforward MLP is competitive with state-of-the-art results of substantially more complex methods based on TD learning or sequence modeling with Transformers. Carefully choosing model capacity (e.g., via regularization or architecture) and choosing which information to condition on (e.g., goals or rewards) are critical for performance. These insights serve as a field guide for practitioners doing Reinforcement Learning via Supervised Learning (which we coin "RvS learning"). They also probe the limits of existing RvS methods, which are comparatively weak on random data, and suggest a number of open problems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2112.10751v2-abstract-full').style.display = 'none'; document.getElementById('2112.10751v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 December, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2003.06417">arXiv:2003.06417</a> <span> [<a href="https://arxiv.org/pdf/2003.06417">pdf</a>, <a href="https://arxiv.org/format/2003.06417">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Sparse Graphical Memory for Robust Planning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Emmons%2C+S">Scott Emmons</a>, <a href="/search/cs?searchtype=author&query=Jain%2C+A">Ajay Jain</a>, <a href="/search/cs?searchtype=author&query=Laskin%2C+M">Michael Laskin</a>, <a href="/search/cs?searchtype=author&query=Kurutach%2C+T">Thanard Kurutach</a>, <a href="/search/cs?searchtype=author&query=Abbeel%2C+P">Pieter Abbeel</a>, <a href="/search/cs?searchtype=author&query=Pathak%2C+D">Deepak Pathak</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2003.06417v3-abstract-short" style="display: inline;"> To operate effectively in the real world, agents should be able to act from high-dimensional raw sensory input such as images and achieve diverse goals across long time-horizons. Current deep reinforcement and imitation learning methods can learn directly from high-dimensional inputs but do not scale well to long-horizon tasks. In contrast, classical graphical methods like A* search are able to so… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2003.06417v3-abstract-full').style.display = 'inline'; document.getElementById('2003.06417v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2003.06417v3-abstract-full" style="display: none;"> To operate effectively in the real world, agents should be able to act from high-dimensional raw sensory input such as images and achieve diverse goals across long time-horizons. Current deep reinforcement and imitation learning methods can learn directly from high-dimensional inputs but do not scale well to long-horizon tasks. In contrast, classical graphical methods like A* search are able to solve long-horizon tasks, but assume that the state space is abstracted away from raw sensory input. Recent works have attempted to combine the strengths of deep learning and classical planning; however, dominant methods in this domain are still quite brittle and scale poorly with the size of the environment. We introduce Sparse Graphical Memory (SGM), a new data structure that stores states and feasible transitions in a sparse memory. SGM aggregates states according to a novel two-way consistency objective, adapting classic state aggregation criteria to goal-conditioned RL: two states are redundant when they are interchangeable both as goals and as starting states. Theoretically, we prove that merging nodes according to two-way consistency leads to an increase in shortest path lengths that scales only linearly with the merging threshold. Experimentally, we show that SGM significantly outperforms current state of the art methods on long horizon, sparse-reward visual navigation tasks. Project video and code are available at https://mishalaskin.github.io/sgm/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2003.06417v3-abstract-full').style.display = 'none'; document.getElementById('2003.06417v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 March, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at NeurIPS 2020. Video and code at https://mishalaskin.github.io/sgm/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1810.10433">arXiv:1810.10433</a> <span> [<a href="https://arxiv.org/pdf/1810.10433">pdf</a>, <a href="https://arxiv.org/format/1810.10433">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Data Analysis, Statistics and Probability">physics.data-an</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Physics and Society">physics.soc-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1103/PhysRevE.100.022301">10.1103/PhysRevE.100.022301 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> A Map Equation with Metadata: Varying the Role of Attributes in Community Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Emmons%2C+S">Scott Emmons</a>, <a href="/search/cs?searchtype=author&query=Mucha%2C+P+J">Peter J. Mucha</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1810.10433v2-abstract-short" style="display: inline;"> Much of the community detection literature studies structural communities, communities defined solely by the connectivity patterns of the network. Often, networks contain additional metadata which can inform community detection such as the grade and gender of students in a high school social network. In this work, we introduce a tuning parameter to the content map equation that allows users of the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1810.10433v2-abstract-full').style.display = 'inline'; document.getElementById('1810.10433v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1810.10433v2-abstract-full" style="display: none;"> Much of the community detection literature studies structural communities, communities defined solely by the connectivity patterns of the network. Often, networks contain additional metadata which can inform community detection such as the grade and gender of students in a high school social network. In this work, we introduce a tuning parameter to the content map equation that allows users of the Infomap community detection algorithm to control the metadata's relative importance for identifying network structure. On synthetic networks, we show that our algorithm can overcome the structural detectability limit when the metadata is well-aligned with community structure. On real-world networks, we show how our algorithm can achieve greater mutual information with the metadata at a cost in the traditional map equation. Our tuning parameter, like the focusing knob of a microscope, allows users to "zoom in" and "zoom out" on communities with varying levels of focus on the metadata. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1810.10433v2-abstract-full').style.display = 'none'; document.getElementById('1810.10433v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 September, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 October, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2018. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Phys. Rev. E 100, 022301 (2019) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1706.03675">arXiv:1706.03675</a> <span> [<a href="https://arxiv.org/pdf/1706.03675">pdf</a>, <a href="https://arxiv.org/format/1706.03675">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Physics and Society">physics.soc-ph</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.3390/a10030093">10.3390/a10030093 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Post-processing partitions to identify domains of modularity optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Weir%2C+W+H">William H. Weir</a>, <a href="/search/cs?searchtype=author&query=Emmons%2C+S">Scott Emmons</a>, <a href="/search/cs?searchtype=author&query=Gibson%2C+R">Ryan Gibson</a>, <a href="/search/cs?searchtype=author&query=Taylor%2C+D">Dane Taylor</a>, <a href="/search/cs?searchtype=author&query=Mucha%2C+P+J">Peter J. Mucha</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1706.03675v4-abstract-short" style="display: inline;"> We introduce the Convex Hull of Admissible Modularity Partitions (CHAMP) algorithm to prune and prioritize different network community structures identified across multiple runs of possibly various computational heuristics. Given a set of partitions, CHAMP identifies the domain of modularity optimization for each partition ---i.e., the parameter-space domain where it has the largest modularity rel… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1706.03675v4-abstract-full').style.display = 'inline'; document.getElementById('1706.03675v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1706.03675v4-abstract-full" style="display: none;"> We introduce the Convex Hull of Admissible Modularity Partitions (CHAMP) algorithm to prune and prioritize different network community structures identified across multiple runs of possibly various computational heuristics. Given a set of partitions, CHAMP identifies the domain of modularity optimization for each partition ---i.e., the parameter-space domain where it has the largest modularity relative to the input set---discarding partitions with empty domains to obtain the subset of partitions that are "admissible" candidate community structures that remain potentially optimal over indicated parameter domains. Importantly, CHAMP can be used for multi-dimensional parameter spaces, such as those for multilayer networks where one includes a resolution parameter and interlayer coupling. Using the results from CHAMP, a user can more appropriately select robust community structures by observing the sizes of domains of optimization and the pairwise comparisons between partitions in the admissible subset. We demonstrate the utility of CHAMP with several example networks. In these examples, CHAMP focuses attention onto pruned subsets of admissible partitions that are 20-to-1785 times smaller than the sets of unique partitions obtained by community detection heuristics that were input into CHAMP. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1706.03675v4-abstract-full').style.display = 'none'; document.getElementById('1706.03675v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 August, 2017; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 June, 2017; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2017. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">http://www.mdpi.com/1999-4893/10/3/93</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Algorithms 10, no. 3: 93 (2017) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1605.05797">arXiv:1605.05797</a> <span> [<a href="https://arxiv.org/pdf/1605.05797">pdf</a>, <a href="https://arxiv.org/format/1605.05797">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Physics and Society">physics.soc-ph</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1371/journal.pone.0159161">10.1371/journal.pone.0159161 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Analysis of Network Clustering Algorithms and Cluster Quality Metrics at Scale </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Emmons%2C+S">Scott Emmons</a>, <a href="/search/cs?searchtype=author&query=Kobourov%2C+S">Stephen Kobourov</a>, <a href="/search/cs?searchtype=author&query=Gallant%2C+M">Mike Gallant</a>, <a href="/search/cs?searchtype=author&query=B%C3%B6rner%2C+K">Katy B枚rner</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1605.05797v4-abstract-short" style="display: inline;"> Notions of community quality underlie network clustering. While studies surrounding network clustering are increasingly common, a precise understanding of the realtionship between different cluster quality metrics is unknown. In this paper, we examine the relationship between stand-alone cluster quality metrics and information recovery metrics through a rigorous analysis of four widely-used networ… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1605.05797v4-abstract-full').style.display = 'inline'; document.getElementById('1605.05797v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1605.05797v4-abstract-full" style="display: none;"> Notions of community quality underlie network clustering. While studies surrounding network clustering are increasingly common, a precise understanding of the realtionship between different cluster quality metrics is unknown. In this paper, we examine the relationship between stand-alone cluster quality metrics and information recovery metrics through a rigorous analysis of four widely-used network clustering algorithms -- Louvain, Infomap, label propagation, and smart local moving. We consider the stand-alone quality metrics of modularity, conductance, and coverage, and we consider the information recovery metrics of adjusted Rand score, normalized mutual information, and a variant of normalized mutual information used in previous work. Our study includes both synthetic graphs and empirical data sets of sizes varying from 1,000 to 1,000,000 nodes. We find significant differences among the results of the different cluster quality metrics. For example, clustering algorithms can return a value of 0.4 out of 1 on modularity but score 0 out of 1 on information recovery. We find conductance, though imperfect, to be the stand-alone quality metric that best indicates performance on information recovery metrics. Our study shows that the variant of normalized mutual information used in previous work cannot be assumed to differ only slightly from traditional normalized mutual information. Smart local moving is the best performing algorithm in our study, but discrepancies between cluster evaluation metrics prevent us from declaring it absolutely superior. Louvain performed better than Infomap in nearly all the tests in our study, contradicting the results of previous work in which Infomap was superior to Louvain. We find that although label propagation performs poorly when clusters are less clearly defined, it scales efficiently and accurately to large graphs with well-defined clusters. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1605.05797v4-abstract-full').style.display = 'none'; document.getElementById('1605.05797v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 August, 2016; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 May, 2016; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2016. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> PLoS ONE 11(7): e0159161 (2016) </p> </li> </ol> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository