Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;14 of 14 results for author: <span class="mathjax">Kadavath, S</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Kadavath%2C+S">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Kadavath, S"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Kadavath%2C+S&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Kadavath, S"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.13798">arXiv:2310.13798</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2310.13798">pdf</a>, <a href="https://arxiv.org/format/2310.13798">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Specific versus General Principles for Constitutional AI </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kundu%2C+S">Sandipan Kundu</a>, <a href="/search/cs?searchtype=author&amp;query=Bai%2C+Y">Yuntao Bai</a>, <a href="/search/cs?searchtype=author&amp;query=Kadavath%2C+S">Saurav Kadavath</a>, <a href="/search/cs?searchtype=author&amp;query=Askell%2C+A">Amanda Askell</a>, <a href="/search/cs?searchtype=author&amp;query=Callahan%2C+A">Andrew Callahan</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+A">Anna Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Goldie%2C+A">Anna Goldie</a>, <a href="/search/cs?searchtype=author&amp;query=Balwit%2C+A">Avital Balwit</a>, <a href="/search/cs?searchtype=author&amp;query=Mirhoseini%2C+A">Azalia Mirhoseini</a>, <a href="/search/cs?searchtype=author&amp;query=McLean%2C+B">Brayden McLean</a>, <a href="/search/cs?searchtype=author&amp;query=Olsson%2C+C">Catherine Olsson</a>, <a href="/search/cs?searchtype=author&amp;query=Evraets%2C+C">Cassie Evraets</a>, <a href="/search/cs?searchtype=author&amp;query=Tran-Johnson%2C+E">Eli Tran-Johnson</a>, <a href="/search/cs?searchtype=author&amp;query=Durmus%2C+E">Esin Durmus</a>, <a href="/search/cs?searchtype=author&amp;query=Perez%2C+E">Ethan Perez</a>, <a href="/search/cs?searchtype=author&amp;query=Kernion%2C+J">Jackson Kernion</a>, <a href="/search/cs?searchtype=author&amp;query=Kerr%2C+J">Jamie Kerr</a>, <a href="/search/cs?searchtype=author&amp;query=Ndousse%2C+K">Kamal Ndousse</a>, <a href="/search/cs?searchtype=author&amp;query=Nguyen%2C+K">Karina Nguyen</a>, <a href="/search/cs?searchtype=author&amp;query=Elhage%2C+N">Nelson Elhage</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+N">Newton Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Schiefer%2C+N">Nicholas Schiefer</a>, <a href="/search/cs?searchtype=author&amp;query=DasSarma%2C+N">Nova DasSarma</a>, <a href="/search/cs?searchtype=author&amp;query=Rausch%2C+O">Oliver Rausch</a>, <a href="/search/cs?searchtype=author&amp;query=Larson%2C+R">Robin Larson</a> , et al. 
(11 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.13798v1-abstract-short" style="display: inline;"> Human feedback can prevent overtly harmful utterances in conversational models, but may not automatically mitigate subtle problematic behaviors such as a stated desire for self-preservation or power. Constitutional AI offers an alternative, replacing human feedback with feedback from AI models conditioned only on a list of written principles. We find this approach effectively prevents the expressi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.13798v1-abstract-full').style.display = 'inline'; document.getElementById('2310.13798v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.13798v1-abstract-full" style="display: none;"> Human feedback can prevent overtly harmful utterances in conversational models, but may not automatically mitigate subtle problematic behaviors such as a stated desire for self-preservation or power. Constitutional AI offers an alternative, replacing human feedback with feedback from AI models conditioned only on a list of written principles. We find this approach effectively prevents the expression of such behaviors. The success of simple principles motivates us to ask: can models learn general ethical behaviors from only a single written principle? To test this, we run experiments using a principle roughly stated as &#34;do what&#39;s best for humanity&#34;. We find that the largest dialogue models can generalize from this short constitution, resulting in harmless assistants with no stated interest in specific motivations like power. A general principle may thus partially avoid the need for a long list of constitutions targeting potentially harmful behaviors. However, more detailed constitutions still improve fine-grained control over specific types of harms. This suggests both general and specific principles have value for steering AI safely. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.13798v1-abstract-full').style.display = 'none'; document.getElementById('2310.13798v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.13702">arXiv:2307.13702</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2307.13702">pdf</a>, <a href="https://arxiv.org/format/2307.13702">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Measuring Faithfulness in Chain-of-Thought Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lanham%2C+T">Tamera Lanham</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+A">Anna Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Radhakrishnan%2C+A">Ansh Radhakrishnan</a>, <a href="/search/cs?searchtype=author&amp;query=Steiner%2C+B">Benoit Steiner</a>, <a href="/search/cs?searchtype=author&amp;query=Denison%2C+C">Carson Denison</a>, <a href="/search/cs?searchtype=author&amp;query=Hernandez%2C+D">Danny Hernandez</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+D">Dustin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Durmus%2C+E">Esin Durmus</a>, <a href="/search/cs?searchtype=author&amp;query=Hubinger%2C+E">Evan Hubinger</a>, <a href="/search/cs?searchtype=author&amp;query=Kernion%2C+J">Jackson Kernion</a>, <a href="/search/cs?searchtype=author&amp;query=Luko%C5%A1i%C5%ABt%C4%97%2C+K">Kamil臈 Luko拧i奴t臈</a>, <a href="/search/cs?searchtype=author&amp;query=Nguyen%2C+K">Karina Nguyen</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+N">Newton Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Joseph%2C+N">Nicholas Joseph</a>, <a href="/search/cs?searchtype=author&amp;query=Schiefer%2C+N">Nicholas Schiefer</a>, <a href="/search/cs?searchtype=author&amp;query=Rausch%2C+O">Oliver Rausch</a>, <a href="/search/cs?searchtype=author&amp;query=Larson%2C+R">Robin Larson</a>, <a href="/search/cs?searchtype=author&amp;query=McCandlish%2C+S">Sam McCandlish</a>, <a href="/search/cs?searchtype=author&amp;query=Kundu%2C+S">Sandipan Kundu</a>, <a href="/search/cs?searchtype=author&amp;query=Kadavath%2C+S">Saurav Kadavath</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Shannon Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Henighan%2C+T">Thomas Henighan</a>, <a href="/search/cs?searchtype=author&amp;query=Maxwell%2C+T">Timothy Maxwell</a>, <a href="/search/cs?searchtype=author&amp;query=Telleen-Lawton%2C+T">Timothy Telleen-Lawton</a>, <a href="/search/cs?searchtype=author&amp;query=Hume%2C+T">Tristan Hume</a> , et al. (5 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.13702v1-abstract-short" style="display: inline;"> Large language models (LLMs) perform better when they produce step-by-step, &#34;Chain-of-Thought&#34; (CoT) reasoning before answering a question, but it is unclear if the stated reasoning is a faithful explanation of the model&#39;s actual reasoning (i.e., its process for answering the question). 
We investigate hypotheses for how CoT reasoning may be unfaithful, by examining how the model predictions change&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.13702v1-abstract-full').style.display = 'inline'; document.getElementById('2307.13702v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.13702v1-abstract-full" style="display: none;"> Large language models (LLMs) perform better when they produce step-by-step, &#34;Chain-of-Thought&#34; (CoT) reasoning before answering a question, but it is unclear if the stated reasoning is a faithful explanation of the model&#39;s actual reasoning (i.e., its process for answering the question). We investigate hypotheses for how CoT reasoning may be unfaithful, by examining how the model predictions change when we intervene on the CoT (e.g., by adding mistakes or paraphrasing it). Models show large variation across tasks in how strongly they condition on the CoT when predicting their answer, sometimes relying heavily on the CoT and other times primarily ignoring it. CoT&#39;s performance boost does not seem to come from CoT&#39;s added test-time compute alone or from information encoded via the particular phrasing of the CoT. As models become larger and more capable, they produce less faithful reasoning on most tasks we study. Overall, our results suggest that CoT can be faithful if the circumstances such as the model size and task are carefully chosen. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.13702v1-abstract-full').style.display = 'none'; document.getElementById('2307.13702v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2302.07459">arXiv:2302.07459</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2302.07459">pdf</a>, <a href="https://arxiv.org/format/2302.07459">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> The Capacity for Moral Self-Correction in Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ganguli%2C+D">Deep Ganguli</a>, <a href="/search/cs?searchtype=author&amp;query=Askell%2C+A">Amanda Askell</a>, <a href="/search/cs?searchtype=author&amp;query=Schiefer%2C+N">Nicholas Schiefer</a>, <a href="/search/cs?searchtype=author&amp;query=Liao%2C+T+I">Thomas I. 
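The interventions named in this abstract (adding mistakes to the CoT, paraphrasing it) amount to a counterfactual test: perturb the stated reasoning and check whether the final answer moves. A minimal sketch of that test, with query_model as a hypothetical stand-in for a text-completion call rather than any particular API:

def query_model(prompt: str) -> str:
    """Hypothetical stand-in for a text-completion call."""
    raise NotImplementedError

def answer_given_cot(question: str, cot: str) -> str:
    # Condition the model on a (possibly perturbed) chain of thought.
    return query_model(f"Question: {question}\nReasoning: {cot}\nAnswer:").strip()

def conditions_on_cot(question: str, cot: str, corrupted_cot: str) -> bool:
    # True if corrupting the CoT flips the answer, i.e. the model
    # actually relies on its stated reasoning for this question.
    return answer_given_cot(question, cot) != answer_given_cot(question, corrupted_cot)

Aggregated over a task, the flip rate gives one rough measure of how strongly a model conditions on its CoT.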
3. arXiv:2302.07459 [pdf, other] (cs.CL)
   The Capacity for Moral Self-Correction in Large Language Models
   Authors: Deep Ganguli, Amanda Askell, Nicholas Schiefer, Thomas I. Liao, Kamilė Lukošiūtė, Anna Chen, Anna Goldie, Azalia Mirhoseini, Catherine Olsson, Danny Hernandez, Dawn Drain, Dustin Li, Eli Tran-Johnson, Ethan Perez, Jackson Kernion, Jamie Kerr, Jared Mueller, Joshua Landau, Kamal Ndousse, Karina Nguyen, Liane Lovitt, Michael Sellitto, Nelson Elhage, Noemi Mercado, Nova DasSarma, et al. (24 additional authors not shown)
   Abstract: We test the hypothesis that language models trained with reinforcement learning from human feedback (RLHF) have the capability to "morally self-correct", that is, to avoid producing harmful outputs, if instructed to do so. We find strong evidence in support of this hypothesis across three different experiments, each of which reveals different facets of moral self-correction. We find that the capability for moral self-correction emerges at 22B model parameters, and typically improves with increasing model size and RLHF training. We believe that at this level of scale, language models obtain two capabilities that they can use for moral self-correction: (1) they can follow instructions, and (2) they can learn complex normative concepts of harm like stereotyping, bias, and discrimination. As such, they can follow instructions to avoid certain kinds of morally harmful outputs. We believe our results are cause for cautious optimism regarding the ability to train language models to abide by ethical principles.
   Submitted 18 February, 2023; v1 submitted 14 February, 2023; originally announced February 2023.

4. arXiv:2212.09251 [pdf, other] (cs.CL, cs.AI, cs.LG)
   Discovering Language Model Behaviors with Model-Written Evaluations
   Authors: Ethan Perez, Sam Ringer, Kamilė Lukošiūtė, Karina Nguyen, Edwin Chen, Scott Heiner, Craig Pettit, Catherine Olsson, Sandipan Kundu, Saurav Kadavath, Andy Jones, Anna Chen, Ben Mann, Brian Israel, Bryan Seethor, Cameron McKinnon, Christopher Olah, Da Yan, Daniela Amodei, Dario Amodei, Dawn Drain, Dustin Li, Eli Tran-Johnson, Guro Khundadze, Jackson Kernion, et al. (38 additional authors not shown)
   Abstract: As language models (LMs) scale, they develop many novel behaviors, good and bad, exacerbating the need to evaluate how they behave. Prior work creates evaluations with crowdwork (which is time-consuming and expensive) or existing data sources (which are not always available). Here, we automatically generate evaluations with LMs. We explore approaches with varying amounts of human effort, from instructing LMs to write yes/no questions to making complex Winogender schemas with multiple stages of LM-based generation and filtering. Crowdworkers rate the examples as highly relevant and agree with 90-100% of labels, sometimes more so than corresponding human-written datasets. We generate 154 datasets and discover new cases of inverse scaling where LMs get worse with size. Larger LMs repeat back a dialog user's preferred answer ("sycophancy") and express greater desire to pursue concerning goals like resource acquisition and goal preservation. We also find some of the first examples of inverse scaling in RL from Human Feedback (RLHF), where more RLHF makes LMs worse. For example, RLHF makes LMs express stronger political views (on gun rights and immigration) and a greater desire to avoid shut down. Overall, LM-written evaluations are high-quality and let us quickly discover many novel LM behaviors.
   Submitted 19 December, 2022; originally announced December 2022.
   Comments: For associated data visualizations, see https://www.evals.anthropic.com/model-written/; for full datasets, see https://github.com/anthropics/evals
5. arXiv:2212.08073 [pdf, other] (cs.CL, cs.AI)
   Constitutional AI: Harmlessness from AI Feedback
   Authors: Yuntao Bai, Saurav Kadavath, Sandipan Kundu, Amanda Askell, Jackson Kernion, Andy Jones, Anna Chen, Anna Goldie, Azalia Mirhoseini, Cameron McKinnon, Carol Chen, Catherine Olsson, Christopher Olah, Danny Hernandez, Dawn Drain, Deep Ganguli, Dustin Li, Eli Tran-Johnson, Ethan Perez, Jamie Kerr, Jared Mueller, Jeffrey Ladish, Joshua Landau, Kamal Ndousse, Kamile Lukosuite, et al. (26 additional authors not shown)
   Abstract: As AI systems become more capable, we would like to enlist their help to supervise other AIs. We experiment with methods for training a harmless AI assistant through self-improvement, without any human labels identifying harmful outputs. The only human oversight is provided through a list of rules or principles, and so we refer to the method as 'Constitutional AI'. The process involves both a supervised learning and a reinforcement learning phase. In the supervised phase we sample from an initial model, then generate self-critiques and revisions, and then finetune the original model on revised responses. In the RL phase, we sample from the finetuned model, use a model to evaluate which of the two samples is better, and then train a preference model from this dataset of AI preferences. We then train with RL using the preference model as the reward signal, i.e. we use 'RL from AI Feedback' (RLAIF). As a result we are able to train a harmless but non-evasive AI assistant that engages with harmful queries by explaining its objections to them. Both the SL and RL methods can leverage chain-of-thought style reasoning to improve the human-judged performance and transparency of AI decision making. These methods make it possible to control AI behavior more precisely and with far fewer human labels.
   Submitted 15 December, 2022; originally announced December 2022.
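The supervised phase described in this abstract is a critique-and-revision loop over sampled responses. A rough sketch of that loop, with sample as a hypothetical model call and a single critique principle for brevity:

PRINCIPLE = "Identify ways the response is harmful, unethical, or dishonest."

def sample(prompt: str) -> str:
    """Hypothetical stand-in: sample one completion from the model."""
    raise NotImplementedError

def critique_and_revise(user_prompt: str, n_rounds: int = 1) -> str:
    response = sample(user_prompt)
    for _ in range(n_rounds):
        critique = sample(
            f"{user_prompt}\nResponse: {response}\n"
            f"Critique request: {PRINCIPLE}\nCritique:"
        )
        response = sample(
            f"{user_prompt}\nResponse: {response}\nCritique: {critique}\n"
            "Rewrite the response to address the critique.\nRevision:"
        )
    return response  # revised responses become supervised finetuning targets

Per the abstract, the RL phase then uses model-generated comparisons between pairs of samples to train the preference model that serves as the reward signal.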
6. arXiv:2209.11923 [pdf, other] (cs.LG)
   DeepChrome 2.0: Investigating and Improving Architectures, Visualizations, & Experiments
   Authors: Saurav Kadavath, Samuel Paradis, Jacob Yeung
   Abstract: Histone modifications play a critical role in gene regulation. Consequently, predicting gene expression from histone modification signals is a highly motivated problem in epigenetics. We build upon the work of DeepChrome by Singh et al. (2016), who trained classifiers that map histone modification signals to gene expression. We present a novel visualization technique, which uses a generative adversarial network to generate histone modification signals, for providing insight into combinatorial relationships among histone modifications for gene regulation. We also explore and compare various architectural changes, with results suggesting that the 645k-parameter convolutional neural network from DeepChrome has the same predictive power as a 12-parameter linear network. Results from cross-cell prediction experiments, where the model is trained and tested on datasets of varying sizes, cell types, and correlations, suggest the relationship between histone modification signals and gene expression is independent of cell type. We release our PyTorch re-implementation of DeepChrome on GitHub (github.com/ssss1029/gene_expression_294).
   Submitted 24 September, 2022; originally announced September 2022.
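The quoted 12-parameter figure is consistent with a single linear layer from 5 histone-mark features to 2 output classes (10 weights plus 2 biases); reducing each mark's binned signal to its average is an assumption made here for illustration, not the paper's stated preprocessing:

import torch
import torch.nn as nn

class LinearBaseline(nn.Module):
    """Hypothetical 12-parameter linear baseline (5*2 weights + 2 biases)."""

    def __init__(self, n_marks: int = 5, n_classes: int = 2):
        super().__init__()
        self.fc = nn.Linear(n_marks, n_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, n_marks, n_bins) histone-modification signals;
        # average over bins, then classify high/low expression.
        return self.fc(x.mean(dim=-1))

model = LinearBaseline()
print(sum(p.numel() for p in model.parameters()))  # 12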
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2209.07858">arXiv:2209.07858</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2209.07858">pdf</a>, <a href="https://arxiv.org/format/2209.07858">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> Red Teaming Language Models to Reduce Harms: Methods, Scaling Behaviors, and Lessons Learned </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ganguli%2C+D">Deep Ganguli</a>, <a href="/search/cs?searchtype=author&amp;query=Lovitt%2C+L">Liane Lovitt</a>, <a href="/search/cs?searchtype=author&amp;query=Kernion%2C+J">Jackson Kernion</a>, <a href="/search/cs?searchtype=author&amp;query=Askell%2C+A">Amanda Askell</a>, <a href="/search/cs?searchtype=author&amp;query=Bai%2C+Y">Yuntao Bai</a>, <a href="/search/cs?searchtype=author&amp;query=Kadavath%2C+S">Saurav Kadavath</a>, <a href="/search/cs?searchtype=author&amp;query=Mann%2C+B">Ben Mann</a>, <a href="/search/cs?searchtype=author&amp;query=Perez%2C+E">Ethan Perez</a>, <a href="/search/cs?searchtype=author&amp;query=Schiefer%2C+N">Nicholas Schiefer</a>, <a href="/search/cs?searchtype=author&amp;query=Ndousse%2C+K">Kamal Ndousse</a>, <a href="/search/cs?searchtype=author&amp;query=Jones%2C+A">Andy Jones</a>, <a href="/search/cs?searchtype=author&amp;query=Bowman%2C+S">Sam Bowman</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+A">Anna Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Conerly%2C+T">Tom Conerly</a>, <a href="/search/cs?searchtype=author&amp;query=DasSarma%2C+N">Nova DasSarma</a>, <a href="/search/cs?searchtype=author&amp;query=Drain%2C+D">Dawn Drain</a>, <a href="/search/cs?searchtype=author&amp;query=Elhage%2C+N">Nelson Elhage</a>, <a href="/search/cs?searchtype=author&amp;query=El-Showk%2C+S">Sheer El-Showk</a>, <a href="/search/cs?searchtype=author&amp;query=Fort%2C+S">Stanislav Fort</a>, <a href="/search/cs?searchtype=author&amp;query=Hatfield-Dodds%2C+Z">Zac Hatfield-Dodds</a>, <a href="/search/cs?searchtype=author&amp;query=Henighan%2C+T">Tom Henighan</a>, <a href="/search/cs?searchtype=author&amp;query=Hernandez%2C+D">Danny Hernandez</a>, <a href="/search/cs?searchtype=author&amp;query=Hume%2C+T">Tristan Hume</a>, <a href="/search/cs?searchtype=author&amp;query=Jacobson%2C+J">Josh Jacobson</a>, <a href="/search/cs?searchtype=author&amp;query=Johnston%2C+S">Scott Johnston</a> , et al. (11 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2209.07858v2-abstract-short" style="display: inline;"> We describe our early efforts to red team language models in order to simultaneously discover, measure, and attempt to reduce their potentially harmful outputs. We make three main contributions. 
First, we investigate scaling behaviors for red teaming across 3 model sizes (2.7B, 13B, and 52B parameters) and 4 model types: a plain language model (LM); an LM prompted to be helpful, honest, and harmle&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.07858v2-abstract-full').style.display = 'inline'; document.getElementById('2209.07858v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2209.07858v2-abstract-full" style="display: none;"> We describe our early efforts to red team language models in order to simultaneously discover, measure, and attempt to reduce their potentially harmful outputs. We make three main contributions. First, we investigate scaling behaviors for red teaming across 3 model sizes (2.7B, 13B, and 52B parameters) and 4 model types: a plain language model (LM); an LM prompted to be helpful, honest, and harmless; an LM with rejection sampling; and a model trained to be helpful and harmless using reinforcement learning from human feedback (RLHF). We find that the RLHF models are increasingly difficult to red team as they scale, and we find a flat trend with scale for the other model types. Second, we release our dataset of 38,961 red team attacks for others to analyze and learn from. We provide our own analysis of the data and find a variety of harmful outputs, which range from offensive language to more subtly harmful non-violent unethical outputs. Third, we exhaustively describe our instructions, processes, statistical methodologies, and uncertainty about red teaming. We hope that this transparency accelerates our ability to work together as a community in order to develop shared norms, practices, and technical standards for how to red team language models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.07858v2-abstract-full').style.display = 'none'; document.getElementById('2209.07858v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 August, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2022. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2207.05221">arXiv:2207.05221</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2207.05221">pdf</a>, <a href="https://arxiv.org/format/2207.05221">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Language Models (Mostly) Know What They Know </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kadavath%2C+S">Saurav Kadavath</a>, <a href="/search/cs?searchtype=author&amp;query=Conerly%2C+T">Tom Conerly</a>, <a href="/search/cs?searchtype=author&amp;query=Askell%2C+A">Amanda Askell</a>, <a href="/search/cs?searchtype=author&amp;query=Henighan%2C+T">Tom Henighan</a>, <a href="/search/cs?searchtype=author&amp;query=Drain%2C+D">Dawn Drain</a>, <a href="/search/cs?searchtype=author&amp;query=Perez%2C+E">Ethan Perez</a>, <a href="/search/cs?searchtype=author&amp;query=Schiefer%2C+N">Nicholas Schiefer</a>, <a href="/search/cs?searchtype=author&amp;query=Hatfield-Dodds%2C+Z">Zac Hatfield-Dodds</a>, <a href="/search/cs?searchtype=author&amp;query=DasSarma%2C+N">Nova DasSarma</a>, <a href="/search/cs?searchtype=author&amp;query=Tran-Johnson%2C+E">Eli Tran-Johnson</a>, <a href="/search/cs?searchtype=author&amp;query=Johnston%2C+S">Scott Johnston</a>, <a href="/search/cs?searchtype=author&amp;query=El-Showk%2C+S">Sheer El-Showk</a>, <a href="/search/cs?searchtype=author&amp;query=Jones%2C+A">Andy Jones</a>, <a href="/search/cs?searchtype=author&amp;query=Elhage%2C+N">Nelson Elhage</a>, <a href="/search/cs?searchtype=author&amp;query=Hume%2C+T">Tristan Hume</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+A">Anna Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Bai%2C+Y">Yuntao Bai</a>, <a href="/search/cs?searchtype=author&amp;query=Bowman%2C+S">Sam Bowman</a>, <a href="/search/cs?searchtype=author&amp;query=Fort%2C+S">Stanislav Fort</a>, <a href="/search/cs?searchtype=author&amp;query=Ganguli%2C+D">Deep Ganguli</a>, <a href="/search/cs?searchtype=author&amp;query=Hernandez%2C+D">Danny Hernandez</a>, <a href="/search/cs?searchtype=author&amp;query=Jacobson%2C+J">Josh Jacobson</a>, <a href="/search/cs?searchtype=author&amp;query=Kernion%2C+J">Jackson Kernion</a>, <a href="/search/cs?searchtype=author&amp;query=Kravec%2C+S">Shauna Kravec</a>, <a href="/search/cs?searchtype=author&amp;query=Lovitt%2C+L">Liane Lovitt</a> , et al. (11 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2207.05221v4-abstract-short" style="display: inline;"> We study whether language models can evaluate the validity of their own claims and predict which questions they will be able to answer correctly. We first show that larger models are well-calibrated on diverse multiple choice and true/false questions when they are provided in the right format. 
Thus we can approach self-evaluation on open-ended sampling tasks by asking models to first propose answe&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.05221v4-abstract-full').style.display = 'inline'; document.getElementById('2207.05221v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2207.05221v4-abstract-full" style="display: none;"> We study whether language models can evaluate the validity of their own claims and predict which questions they will be able to answer correctly. We first show that larger models are well-calibrated on diverse multiple choice and true/false questions when they are provided in the right format. Thus we can approach self-evaluation on open-ended sampling tasks by asking models to first propose answers, and then to evaluate the probability &#34;P(True)&#34; that their answers are correct. We find encouraging performance, calibration, and scaling for P(True) on a diverse array of tasks. Performance at self-evaluation further improves when we allow models to consider many of their own samples before predicting the validity of one specific possibility. Next, we investigate whether models can be trained to predict &#34;P(IK)&#34;, the probability that &#34;I know&#34; the answer to a question, without reference to any particular proposed answer. Models perform well at predicting P(IK) and partially generalize across tasks, though they struggle with calibration of P(IK) on new tasks. The predicted P(IK) probabilities also increase appropriately in the presence of relevant source materials in the context, and in the presence of hints towards the solution of mathematical word problems. We hope these observations lay the groundwork for training more honest models, and for investigating how honesty generalizes to cases where models are trained on objectives other than the imitation of human writing. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.05221v4-abstract-full').style.display = 'none'; document.getElementById('2207.05221v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 July, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2022. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">23+17 pages; refs added, typos fixed</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2204.05862">arXiv:2204.05862</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2204.05862">pdf</a>, <a href="https://arxiv.org/format/2204.05862">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Training a Helpful and Harmless Assistant with Reinforcement Learning from Human Feedback </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bai%2C+Y">Yuntao Bai</a>, <a href="/search/cs?searchtype=author&amp;query=Jones%2C+A">Andy Jones</a>, <a href="/search/cs?searchtype=author&amp;query=Ndousse%2C+K">Kamal Ndousse</a>, <a href="/search/cs?searchtype=author&amp;query=Askell%2C+A">Amanda Askell</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+A">Anna Chen</a>, <a href="/search/cs?searchtype=author&amp;query=DasSarma%2C+N">Nova DasSarma</a>, <a href="/search/cs?searchtype=author&amp;query=Drain%2C+D">Dawn Drain</a>, <a href="/search/cs?searchtype=author&amp;query=Fort%2C+S">Stanislav Fort</a>, <a href="/search/cs?searchtype=author&amp;query=Ganguli%2C+D">Deep Ganguli</a>, <a href="/search/cs?searchtype=author&amp;query=Henighan%2C+T">Tom Henighan</a>, <a href="/search/cs?searchtype=author&amp;query=Joseph%2C+N">Nicholas Joseph</a>, <a href="/search/cs?searchtype=author&amp;query=Kadavath%2C+S">Saurav Kadavath</a>, <a href="/search/cs?searchtype=author&amp;query=Kernion%2C+J">Jackson Kernion</a>, <a href="/search/cs?searchtype=author&amp;query=Conerly%2C+T">Tom Conerly</a>, <a href="/search/cs?searchtype=author&amp;query=El-Showk%2C+S">Sheer El-Showk</a>, <a href="/search/cs?searchtype=author&amp;query=Elhage%2C+N">Nelson Elhage</a>, <a href="/search/cs?searchtype=author&amp;query=Hatfield-Dodds%2C+Z">Zac Hatfield-Dodds</a>, <a href="/search/cs?searchtype=author&amp;query=Hernandez%2C+D">Danny Hernandez</a>, <a href="/search/cs?searchtype=author&amp;query=Hume%2C+T">Tristan Hume</a>, <a href="/search/cs?searchtype=author&amp;query=Johnston%2C+S">Scott Johnston</a>, <a href="/search/cs?searchtype=author&amp;query=Kravec%2C+S">Shauna Kravec</a>, <a href="/search/cs?searchtype=author&amp;query=Lovitt%2C+L">Liane Lovitt</a>, <a href="/search/cs?searchtype=author&amp;query=Nanda%2C+N">Neel Nanda</a>, <a href="/search/cs?searchtype=author&amp;query=Olsson%2C+C">Catherine Olsson</a>, <a href="/search/cs?searchtype=author&amp;query=Amodei%2C+D">Dario Amodei</a> , et al. (6 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2204.05862v1-abstract-short" style="display: inline;"> We apply preference modeling and reinforcement learning from human feedback (RLHF) to finetune language models to act as helpful and harmless assistants. 
9. arXiv:2204.05862 [pdf, other] (cs.CL, cs.LG)
   Training a Helpful and Harmless Assistant with Reinforcement Learning from Human Feedback
   Authors: Yuntao Bai, Andy Jones, Kamal Ndousse, Amanda Askell, Anna Chen, Nova DasSarma, Dawn Drain, Stanislav Fort, Deep Ganguli, Tom Henighan, Nicholas Joseph, Saurav Kadavath, Jackson Kernion, Tom Conerly, Sheer El-Showk, Nelson Elhage, Zac Hatfield-Dodds, Danny Hernandez, Tristan Hume, Scott Johnston, Shauna Kravec, Liane Lovitt, Neel Nanda, Catherine Olsson, Dario Amodei, et al. (6 additional authors not shown)
   Abstract: We apply preference modeling and reinforcement learning from human feedback (RLHF) to finetune language models to act as helpful and harmless assistants. We find this alignment training improves performance on almost all NLP evaluations, and is fully compatible with training for specialized skills such as Python coding and summarization. We explore an iterated online mode of training, where preference models and RL policies are updated on a weekly cadence with fresh human feedback data, efficiently improving our datasets and models. Finally, we investigate the robustness of RLHF training, and identify a roughly linear relation between the RL reward and the square root of the KL divergence between the policy and its initialization. Alongside our main results, we perform peripheral analyses on calibration, competing objectives, and the use of OOD detection, compare our models with human writers, and provide samples from our models using prompts appearing in recent related work.
   Submitted 12 April, 2022; originally announced April 2022.
   Comments: Data available at https://github.com/anthropics/hh-rlhf
arXiv:2110.02497 (https://arxiv.org/abs/2110.02497) [pdf, other]
Subjects: Machine Learning (cs.LG); Artificial Intelligence (cs.AI)
Pretraining & Reinforcement Learning: Sharpening the Axe Before Cutting the Tree
Authors: Saurav Kadavath, Samuel Paradis, Brian Yao
Abstract: Pretraining is a common technique in deep learning for increasing performance and reducing training time, with promising experimental results in deep reinforcement learning (RL). However, pretraining requires a relevant dataset for training. In this work, we evaluate the effectiveness of pretraining for RL tasks, with and without distracting backgrounds, using both large, publicly available datasets with minimal relevance and case-by-case generated datasets labeled via self-supervision. Results suggest that filters learned during training on less relevant datasets render pretraining ineffective, while filters learned during training on in-distribution datasets reliably reduce RL training time and improve performance after 80k RL training steps. We further investigate, given a limited number of environment steps, how to optimally divide the available steps between pretraining and RL training to maximize RL performance. Our code is available on GitHub.
Submitted 6 October, 2021; originally announced October 2021.
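The budget-splitting question in the abstract can be phrased as a one-line search over candidate splits. The sketch below uses invented stand-ins: `pretrain` and `train_rl` are hypothetical toy functions with made-up return curves, not the authors' released code.

```python
# Toy stand-ins, not the paper's code: pretraining quality saturates with
# steps, and RL return scales with both encoder quality and RL steps.
def pretrain(encoder_steps: int) -> float:
    """Hypothetical self-supervised pretraining; returns encoder quality in [0, 1]."""
    return min(1.0, encoder_steps / 50_000)

def train_rl(rl_steps: int, encoder_quality: float) -> float:
    """Hypothetical RL fine-tuning; returns final episode return."""
    return encoder_quality * (rl_steps ** 0.5)

BUDGET = 80_000  # total environment steps, echoing the paper's 80k-step regime

# Search the split of the fixed budget that maximizes final RL performance.
best_return, best_split = max(
    (train_rl(BUDGET - p, pretrain(p)), p)
    for p in range(0, BUDGET + 1, 10_000)
)
print(f"best split: {best_split} pretraining steps -> return {best_return:.1f}")
```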
arXiv:2105.09938 (https://arxiv.org/abs/2105.09938) [pdf, other]
Subjects: Software Engineering (cs.SE); Computation and Language (cs.CL); Machine Learning (cs.LG)
Measuring Coding Challenge Competence With APPS
Authors: Dan Hendrycks, Steven Basart, Saurav Kadavath, Mantas Mazeika, Akul Arora, Ethan Guo, Collin Burns, Samir Puranik, Horace He, Dawn Song, Jacob Steinhardt
Abstract: While programming is one of the most broadly applicable skills in modern society, modern machine learning models still cannot code solutions to basic problems. Despite its importance, there has been surprisingly little work on evaluating code generation, and it is difficult to assess code generation performance rigorously. To meet this challenge, we introduce APPS, a benchmark for code generation. Unlike prior work in more restricted settings, our benchmark measures the ability of models to take an arbitrary natural language specification and generate satisfactory Python code. Similar to how companies assess candidate software developers, we then evaluate models by checking their generated code on test cases. Our benchmark includes 10,000 problems, which range from having simple one-line solutions to being substantial algorithmic challenges. We fine-tune large language models on both GitHub and our training set, and we find that the prevalence of syntax errors is decreasing exponentially as models improve. Recent models such as GPT-Neo can pass approximately 20% of the test cases of introductory problems, so we find that machine learning models are now beginning to learn how to code. As the social significance of automatic code generation increases over the coming years, our benchmark can provide an important measure for tracking advancements.
Submitted 8 November, 2021; v1 submitted 20 May, 2021; originally announced May 2021.
Comments: NeurIPS 2021. Code and the APPS dataset are available at https://github.com/hendrycks/apps
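Test-case grading of the kind APPS performs can be sketched in a few lines. The generated solution and the input/output pairs below are toy stand-ins, not APPS data, and a real harness must sandbox untrusted model code rather than `exec` it directly.

```python
# A model-generated solution, represented here as a toy source string.
generated = """
def solve(x):
    return x * 2
"""

# Held-out (input, expected_output) pairs; invented for illustration.
tests = [(1, 2), (3, 6), (10, 20)]

namespace: dict = {}
exec(generated, namespace)   # in practice, run untrusted code in a sandbox!
solve = namespace["solve"]

# Count how many hidden test cases the generated code passes.
passed = sum(solve(inp) == out for inp, out in tests)
print(f"{passed}/{len(tests)} test cases passed")
```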
arXiv:2103.03874 (https://arxiv.org/abs/2103.03874) [pdf, other]
Subjects: Machine Learning (cs.LG); Artificial Intelligence (cs.AI); Computation and Language (cs.CL)
Measuring Mathematical Problem Solving With the MATH Dataset
Authors: Dan Hendrycks, Collin Burns, Saurav Kadavath, Akul Arora, Steven Basart, Eric Tang, Dawn Song, Jacob Steinhardt
Abstract: Many intellectual endeavors require mathematical problem solving, but this skill remains beyond the capabilities of computers. To measure this ability in machine learning models, we introduce MATH, a new dataset of 12,500 challenging competition mathematics problems. Each problem in MATH has a full step-by-step solution, which can be used to teach models to generate answer derivations and explanations. To facilitate future research and increase accuracy on MATH, we also contribute a large auxiliary pretraining dataset which helps teach models the fundamentals of mathematics. Even though we are able to increase accuracy on MATH, our results show that accuracy remains relatively low, even with enormous Transformer models. Moreover, we find that simply increasing budgets and model parameter counts will be impractical for achieving strong mathematical reasoning if scaling trends continue. While scaling Transformers is automatically solving most other text-based tasks, scaling is not currently solving MATH. To gain more traction on mathematical problem solving, we will likely need new algorithmic advancements from the broader research community.
Submitted 8 November, 2021; v1 submitted 5 March, 2021; originally announced March 2021.
Comments: NeurIPS 2021. Code and the MATH dataset are available at https://github.com/hendrycks/math/
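Since MATH solutions mark the final answer in LaTeX, graders typically compare the contents of the `\boxed{...}` span rather than the full derivation. A minimal sketch of that comparison follows; the regex assumes un-nested braces, and both strings are invented examples rather than dataset entries.

```python
import re
from typing import Optional

def extract_boxed(solution: str) -> Optional[str]:
    """Pull the answer out of \\boxed{...}; handles only un-nested braces."""
    m = re.search(r"\\boxed\{([^{}]*)\}", solution)
    return m.group(1).strip() if m else None

# Invented reference solution and model output, for illustration only.
reference = r"The roots sum to $-b/a$, so the answer is $\boxed{7}$."
model_out = r"Adding the roots gives \boxed{7}."

print(extract_boxed(model_out) == extract_boxed(reference))  # True
```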
arXiv:2006.16241 (https://arxiv.org/abs/2006.16241) [pdf, other]
Subjects: Computer Vision and Pattern Recognition (cs.CV); Machine Learning (cs.LG); Machine Learning (stat.ML)
The Many Faces of Robustness: A Critical Analysis of Out-of-Distribution Generalization
Authors: Dan Hendrycks, Steven Basart, Norman Mu, Saurav Kadavath, Frank Wang, Evan Dorundo, Rahul Desai, Tyler Zhu, Samyak Parajuli, Mike Guo, Dawn Song, Jacob Steinhardt, Justin Gilmer
Abstract: We introduce four new real-world distribution shift datasets consisting of changes in image style, image blurriness, geographic location, camera operation, and more. With our new datasets, we take stock of previously proposed methods for improving out-of-distribution robustness and put them to the test. We find that using larger models and artificial data augmentations can improve robustness on real-world distribution shifts, contrary to claims in prior work. We also find that improvements on artificial robustness benchmarks can transfer to real-world distribution shifts, again contrary to prior claims. Motivated by our observation that data augmentations can help with real-world distribution shifts, we also introduce a new data augmentation method which advances the state-of-the-art and outperforms models pretrained with 1000 times more labeled data. Overall we find that some methods consistently help with distribution shifts in texture and local image statistics, but these methods do not help with some other distribution shifts like geographic changes. Our results show that future research must study multiple distribution shifts simultaneously, as we demonstrate that no evaluated method consistently improves robustness.
Submitted 24 July, 2021; v1 submitted 29 June, 2020; originally announced June 2020.
Comments: ICCV 2021; datasets, code, and models available at https://github.com/hendrycks/imagenet-r
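The abstract credits data augmentation without describing the new method here, so as a generic illustration only, the sketch below builds a heavy augmentation pipeline from standard torchvision transforms; it is not the paper's own augmentation.

```python
from torchvision import transforms

# A generic heavy-augmentation pipeline of the broad kind the paper evaluates
# (standard torchvision transforms, NOT the paper's new method).
augment = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
])
# Applied at load time, each image is seen under a different random distortion
# every epoch, which can help with texture-level distribution shifts.
```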
arXiv:1906.12340 (https://arxiv.org/abs/1906.12340) [pdf, other]
Subjects: Machine Learning (cs.LG); Computer Vision and Pattern Recognition (cs.CV); Machine Learning (stat.ML)
Using Self-Supervised Learning Can Improve Model Robustness and Uncertainty
Authors: Dan Hendrycks, Mantas Mazeika, Saurav Kadavath, Dawn Song
Abstract: Self-supervision provides effective representations for downstream tasks without requiring labels. However, existing approaches lag behind fully supervised training and are often not thought beneficial beyond obviating or reducing the need for annotations. We find that self-supervision can benefit robustness in a variety of ways, including robustness to adversarial examples, label corruption, and common input corruptions. Additionally, self-supervision greatly benefits out-of-distribution detection on difficult, near-distribution outliers, so much so that it exceeds the performance of fully supervised methods. These results demonstrate the promise of self-supervision for improving robustness and uncertainty estimation and establish these tasks as new axes of evaluation for future self-supervised learning research.
Submitted 29 October, 2019; v1 submitted 28 June, 2019; originally announced June 2019.
Comments: NeurIPS 2019; code and data available at https://github.com/hendrycks/ss-ood