CINXE.COM

Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;12 of 12 results for author: <span class="mathjax">Lovitt, L</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Lovitt%2C+L">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Lovitt, L"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Lovitt%2C+L&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Lovitt, L"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.07814">arXiv:2406.07814</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.07814">pdf</a>, <a href="https://arxiv.org/format/2406.07814">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3630106.3658979">10.1145/3630106.3658979 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Collective Constitutional AI: Aligning a Language Model with Public Input </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Huang%2C+S">Saffron Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Siddarth%2C+D">Divya Siddarth</a>, <a href="/search/cs?searchtype=author&amp;query=Lovitt%2C+L">Liane Lovitt</a>, <a href="/search/cs?searchtype=author&amp;query=Liao%2C+T+I">Thomas I. Liao</a>, <a href="/search/cs?searchtype=author&amp;query=Durmus%2C+E">Esin Durmus</a>, <a href="/search/cs?searchtype=author&amp;query=Tamkin%2C+A">Alex Tamkin</a>, <a href="/search/cs?searchtype=author&amp;query=Ganguli%2C+D">Deep Ganguli</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.07814v1-abstract-short" style="display: inline;"> There is growing consensus that language model (LM) developers should not be the sole deciders of LM behavior, creating a need for methods that enable the broader public to collectively shape the behavior of LM systems that affect them. To address this need, we present Collective Constitutional AI (CCAI): a multi-stage process for sourcing and integrating public input into LMs-from identifying a t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.07814v1-abstract-full').style.display = 'inline'; document.getElementById('2406.07814v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.07814v1-abstract-full" style="display: none;"> There is growing consensus that language model (LM) developers should not be the sole deciders of LM behavior, creating a need for methods that enable the broader public to collectively shape the behavior of LM systems that affect them. To address this need, we present Collective Constitutional AI (CCAI): a multi-stage process for sourcing and integrating public input into LMs-from identifying a target population to sourcing principles to training and evaluating a model. We demonstrate the real-world practicality of this approach by creating what is, to our knowledge, the first LM fine-tuned with collectively sourced public input and evaluating this model against a baseline model trained with established principles from a LM developer. Our quantitative evaluations demonstrate several benefits of our approach: the CCAI-trained model shows lower bias across nine social dimensions compared to the baseline model, while maintaining equivalent performance on language, math, and helpful-harmless evaluations. Qualitative comparisons of the models suggest that the models differ on the basis of their respective constitutions, e.g., when prompted with contentious topics, the CCAI-trained model tends to generate responses that reframe the matter positively instead of a refusal. These results demonstrate a promising, tractable pathway toward publicly informed development of language models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.07814v1-abstract-full').style.display = 'none'; document.getElementById('2406.07814v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.7; K.4.2 </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Proceedings of the 2024 ACM Conference on Fairness, Accountability, and Transparency. 1395-1417 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.03689">arXiv:2312.03689</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2312.03689">pdf</a>, <a href="https://arxiv.org/format/2312.03689">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Evaluating and Mitigating Discrimination in Language Model Decisions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tamkin%2C+A">Alex Tamkin</a>, <a href="/search/cs?searchtype=author&amp;query=Askell%2C+A">Amanda Askell</a>, <a href="/search/cs?searchtype=author&amp;query=Lovitt%2C+L">Liane Lovitt</a>, <a href="/search/cs?searchtype=author&amp;query=Durmus%2C+E">Esin Durmus</a>, <a href="/search/cs?searchtype=author&amp;query=Joseph%2C+N">Nicholas Joseph</a>, <a href="/search/cs?searchtype=author&amp;query=Kravec%2C+S">Shauna Kravec</a>, <a href="/search/cs?searchtype=author&amp;query=Nguyen%2C+K">Karina Nguyen</a>, <a href="/search/cs?searchtype=author&amp;query=Kaplan%2C+J">Jared Kaplan</a>, <a href="/search/cs?searchtype=author&amp;query=Ganguli%2C+D">Deep Ganguli</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.03689v1-abstract-short" style="display: inline;"> As language models (LMs) advance, interest is growing in applying them to high-stakes societal decisions, such as determining financing or housing eligibility. However, their potential for discrimination in such contexts raises ethical concerns, motivating the need for better methods to evaluate these risks. We present a method for proactively evaluating the potential discriminatory impact of LMs&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.03689v1-abstract-full').style.display = 'inline'; document.getElementById('2312.03689v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.03689v1-abstract-full" style="display: none;"> As language models (LMs) advance, interest is growing in applying them to high-stakes societal decisions, such as determining financing or housing eligibility. However, their potential for discrimination in such contexts raises ethical concerns, motivating the need for better methods to evaluate these risks. We present a method for proactively evaluating the potential discriminatory impact of LMs in a wide range of use cases, including hypothetical use cases where they have not yet been deployed. Specifically, we use an LM to generate a wide array of potential prompts that decision-makers may input into an LM, spanning 70 diverse decision scenarios across society, and systematically vary the demographic information in each prompt. Applying this methodology reveals patterns of both positive and negative discrimination in the Claude 2.0 model in select settings when no interventions are applied. While we do not endorse or permit the use of language models to make automated decisions for the high-risk use cases we study, we demonstrate techniques to significantly decrease both positive and negative discrimination through careful prompt engineering, providing pathways toward safer deployment in use cases where they may be appropriate. Our work enables developers and policymakers to anticipate, measure, and address discrimination as language model capabilities and applications continue to expand. We release our dataset and prompts at https://huggingface.co/datasets/Anthropic/discrim-eval <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.03689v1-abstract-full').style.display = 'none'; document.getElementById('2312.03689v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.16388">arXiv:2306.16388</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2306.16388">pdf</a>, <a href="https://arxiv.org/format/2306.16388">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Towards Measuring the Representation of Subjective Global Opinions in Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Durmus%2C+E">Esin Durmus</a>, <a href="/search/cs?searchtype=author&amp;query=Nguyen%2C+K">Karina Nguyen</a>, <a href="/search/cs?searchtype=author&amp;query=Liao%2C+T+I">Thomas I. Liao</a>, <a href="/search/cs?searchtype=author&amp;query=Schiefer%2C+N">Nicholas Schiefer</a>, <a href="/search/cs?searchtype=author&amp;query=Askell%2C+A">Amanda Askell</a>, <a href="/search/cs?searchtype=author&amp;query=Bakhtin%2C+A">Anton Bakhtin</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+C">Carol Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Hatfield-Dodds%2C+Z">Zac Hatfield-Dodds</a>, <a href="/search/cs?searchtype=author&amp;query=Hernandez%2C+D">Danny Hernandez</a>, <a href="/search/cs?searchtype=author&amp;query=Joseph%2C+N">Nicholas Joseph</a>, <a href="/search/cs?searchtype=author&amp;query=Lovitt%2C+L">Liane Lovitt</a>, <a href="/search/cs?searchtype=author&amp;query=McCandlish%2C+S">Sam McCandlish</a>, <a href="/search/cs?searchtype=author&amp;query=Sikder%2C+O">Orowa Sikder</a>, <a href="/search/cs?searchtype=author&amp;query=Tamkin%2C+A">Alex Tamkin</a>, <a href="/search/cs?searchtype=author&amp;query=Thamkul%2C+J">Janel Thamkul</a>, <a href="/search/cs?searchtype=author&amp;query=Kaplan%2C+J">Jared Kaplan</a>, <a href="/search/cs?searchtype=author&amp;query=Clark%2C+J">Jack Clark</a>, <a href="/search/cs?searchtype=author&amp;query=Ganguli%2C+D">Deep Ganguli</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.16388v2-abstract-short" style="display: inline;"> Large language models (LLMs) may not equitably represent diverse global perspectives on societal issues. In this paper, we develop a quantitative framework to evaluate whose opinions model-generated responses are more similar to. We first build a dataset, GlobalOpinionQA, comprised of questions and answers from cross-national surveys designed to capture diverse opinions on global issues across dif&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.16388v2-abstract-full').style.display = 'inline'; document.getElementById('2306.16388v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.16388v2-abstract-full" style="display: none;"> Large language models (LLMs) may not equitably represent diverse global perspectives on societal issues. In this paper, we develop a quantitative framework to evaluate whose opinions model-generated responses are more similar to. We first build a dataset, GlobalOpinionQA, comprised of questions and answers from cross-national surveys designed to capture diverse opinions on global issues across different countries. Next, we define a metric that quantifies the similarity between LLM-generated survey responses and human responses, conditioned on country. With our framework, we run three experiments on an LLM trained to be helpful, honest, and harmless with Constitutional AI. By default, LLM responses tend to be more similar to the opinions of certain populations, such as those from the USA, and some European and South American countries, highlighting the potential for biases. When we prompt the model to consider a particular country&#39;s perspective, responses shift to be more similar to the opinions of the prompted populations, but can reflect harmful cultural stereotypes. When we translate GlobalOpinionQA questions to a target language, the model&#39;s responses do not necessarily become the most similar to the opinions of speakers of those languages. We release our dataset for others to use and build on. Our data is at https://huggingface.co/datasets/Anthropic/llm_global_opinions. We also provide an interactive visualization at https://llmglobalvalues.anthropic.com. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.16388v2-abstract-full').style.display = 'none'; document.getElementById('2306.16388v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2302.07459">arXiv:2302.07459</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2302.07459">pdf</a>, <a href="https://arxiv.org/format/2302.07459">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> The Capacity for Moral Self-Correction in Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ganguli%2C+D">Deep Ganguli</a>, <a href="/search/cs?searchtype=author&amp;query=Askell%2C+A">Amanda Askell</a>, <a href="/search/cs?searchtype=author&amp;query=Schiefer%2C+N">Nicholas Schiefer</a>, <a href="/search/cs?searchtype=author&amp;query=Liao%2C+T+I">Thomas I. Liao</a>, <a href="/search/cs?searchtype=author&amp;query=Luko%C5%A1i%C5%ABt%C4%97%2C+K">Kamil臈 Luko拧i奴t臈</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+A">Anna Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Goldie%2C+A">Anna Goldie</a>, <a href="/search/cs?searchtype=author&amp;query=Mirhoseini%2C+A">Azalia Mirhoseini</a>, <a href="/search/cs?searchtype=author&amp;query=Olsson%2C+C">Catherine Olsson</a>, <a href="/search/cs?searchtype=author&amp;query=Hernandez%2C+D">Danny Hernandez</a>, <a href="/search/cs?searchtype=author&amp;query=Drain%2C+D">Dawn Drain</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+D">Dustin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Tran-Johnson%2C+E">Eli Tran-Johnson</a>, <a href="/search/cs?searchtype=author&amp;query=Perez%2C+E">Ethan Perez</a>, <a href="/search/cs?searchtype=author&amp;query=Kernion%2C+J">Jackson Kernion</a>, <a href="/search/cs?searchtype=author&amp;query=Kerr%2C+J">Jamie Kerr</a>, <a href="/search/cs?searchtype=author&amp;query=Mueller%2C+J">Jared Mueller</a>, <a href="/search/cs?searchtype=author&amp;query=Landau%2C+J">Joshua Landau</a>, <a href="/search/cs?searchtype=author&amp;query=Ndousse%2C+K">Kamal Ndousse</a>, <a href="/search/cs?searchtype=author&amp;query=Nguyen%2C+K">Karina Nguyen</a>, <a href="/search/cs?searchtype=author&amp;query=Lovitt%2C+L">Liane Lovitt</a>, <a href="/search/cs?searchtype=author&amp;query=Sellitto%2C+M">Michael Sellitto</a>, <a href="/search/cs?searchtype=author&amp;query=Elhage%2C+N">Nelson Elhage</a>, <a href="/search/cs?searchtype=author&amp;query=Mercado%2C+N">Noemi Mercado</a>, <a href="/search/cs?searchtype=author&amp;query=DasSarma%2C+N">Nova DasSarma</a> , et al. (24 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2302.07459v2-abstract-short" style="display: inline;"> We test the hypothesis that language models trained with reinforcement learning from human feedback (RLHF) have the capability to &#34;morally self-correct&#34; -- to avoid producing harmful outputs -- if instructed to do so. We find strong evidence in support of this hypothesis across three different experiments, each of which reveal different facets of moral self-correction. We find that the capability&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.07459v2-abstract-full').style.display = 'inline'; document.getElementById('2302.07459v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2302.07459v2-abstract-full" style="display: none;"> We test the hypothesis that language models trained with reinforcement learning from human feedback (RLHF) have the capability to &#34;morally self-correct&#34; -- to avoid producing harmful outputs -- if instructed to do so. We find strong evidence in support of this hypothesis across three different experiments, each of which reveal different facets of moral self-correction. We find that the capability for moral self-correction emerges at 22B model parameters, and typically improves with increasing model size and RLHF training. We believe that at this level of scale, language models obtain two capabilities that they can use for moral self-correction: (1) they can follow instructions and (2) they can learn complex normative concepts of harm like stereotyping, bias, and discrimination. As such, they can follow instructions to avoid certain kinds of morally harmful outputs. We believe our results are cause for cautious optimism regarding the ability to train language models to abide by ethical principles. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.07459v2-abstract-full').style.display = 'none'; document.getElementById('2302.07459v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2212.09251">arXiv:2212.09251</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2212.09251">pdf</a>, <a href="https://arxiv.org/format/2212.09251">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Discovering Language Model Behaviors with Model-Written Evaluations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Perez%2C+E">Ethan Perez</a>, <a href="/search/cs?searchtype=author&amp;query=Ringer%2C+S">Sam Ringer</a>, <a href="/search/cs?searchtype=author&amp;query=Luko%C5%A1i%C5%ABt%C4%97%2C+K">Kamil臈 Luko拧i奴t臈</a>, <a href="/search/cs?searchtype=author&amp;query=Nguyen%2C+K">Karina Nguyen</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+E">Edwin Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Heiner%2C+S">Scott Heiner</a>, <a href="/search/cs?searchtype=author&amp;query=Pettit%2C+C">Craig Pettit</a>, <a href="/search/cs?searchtype=author&amp;query=Olsson%2C+C">Catherine Olsson</a>, <a href="/search/cs?searchtype=author&amp;query=Kundu%2C+S">Sandipan Kundu</a>, <a href="/search/cs?searchtype=author&amp;query=Kadavath%2C+S">Saurav Kadavath</a>, <a href="/search/cs?searchtype=author&amp;query=Jones%2C+A">Andy Jones</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+A">Anna Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Mann%2C+B">Ben Mann</a>, <a href="/search/cs?searchtype=author&amp;query=Israel%2C+B">Brian Israel</a>, <a href="/search/cs?searchtype=author&amp;query=Seethor%2C+B">Bryan Seethor</a>, <a href="/search/cs?searchtype=author&amp;query=McKinnon%2C+C">Cameron McKinnon</a>, <a href="/search/cs?searchtype=author&amp;query=Olah%2C+C">Christopher Olah</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+D">Da Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Amodei%2C+D">Daniela Amodei</a>, <a href="/search/cs?searchtype=author&amp;query=Amodei%2C+D">Dario Amodei</a>, <a href="/search/cs?searchtype=author&amp;query=Drain%2C+D">Dawn Drain</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+D">Dustin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Tran-Johnson%2C+E">Eli Tran-Johnson</a>, <a href="/search/cs?searchtype=author&amp;query=Khundadze%2C+G">Guro Khundadze</a>, <a href="/search/cs?searchtype=author&amp;query=Kernion%2C+J">Jackson Kernion</a> , et al. (38 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2212.09251v1-abstract-short" style="display: inline;"> As language models (LMs) scale, they develop many novel behaviors, good and bad, exacerbating the need to evaluate how they behave. Prior work creates evaluations with crowdwork (which is time-consuming and expensive) or existing data sources (which are not always available). Here, we automatically generate evaluations with LMs. We explore approaches with varying amounts of human effort, from inst&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.09251v1-abstract-full').style.display = 'inline'; document.getElementById('2212.09251v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2212.09251v1-abstract-full" style="display: none;"> As language models (LMs) scale, they develop many novel behaviors, good and bad, exacerbating the need to evaluate how they behave. Prior work creates evaluations with crowdwork (which is time-consuming and expensive) or existing data sources (which are not always available). Here, we automatically generate evaluations with LMs. We explore approaches with varying amounts of human effort, from instructing LMs to write yes/no questions to making complex Winogender schemas with multiple stages of LM-based generation and filtering. Crowdworkers rate the examples as highly relevant and agree with 90-100% of labels, sometimes more so than corresponding human-written datasets. We generate 154 datasets and discover new cases of inverse scaling where LMs get worse with size. Larger LMs repeat back a dialog user&#39;s preferred answer (&#34;sycophancy&#34;) and express greater desire to pursue concerning goals like resource acquisition and goal preservation. We also find some of the first examples of inverse scaling in RL from Human Feedback (RLHF), where more RLHF makes LMs worse. For example, RLHF makes LMs express stronger political views (on gun rights and immigration) and a greater desire to avoid shut down. Overall, LM-written evaluations are high-quality and let us quickly discover many novel LM behaviors. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.09251v1-abstract-full').style.display = 'none'; document.getElementById('2212.09251v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">for associated data visualizations, see https://www.evals.anthropic.com/model-written/ for full datasets, see https://github.com/anthropics/evals</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2212.08073">arXiv:2212.08073</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2212.08073">pdf</a>, <a href="https://arxiv.org/format/2212.08073">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Constitutional AI: Harmlessness from AI Feedback </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bai%2C+Y">Yuntao Bai</a>, <a href="/search/cs?searchtype=author&amp;query=Kadavath%2C+S">Saurav Kadavath</a>, <a href="/search/cs?searchtype=author&amp;query=Kundu%2C+S">Sandipan Kundu</a>, <a href="/search/cs?searchtype=author&amp;query=Askell%2C+A">Amanda Askell</a>, <a href="/search/cs?searchtype=author&amp;query=Kernion%2C+J">Jackson Kernion</a>, <a href="/search/cs?searchtype=author&amp;query=Jones%2C+A">Andy Jones</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+A">Anna Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Goldie%2C+A">Anna Goldie</a>, <a href="/search/cs?searchtype=author&amp;query=Mirhoseini%2C+A">Azalia Mirhoseini</a>, <a href="/search/cs?searchtype=author&amp;query=McKinnon%2C+C">Cameron McKinnon</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+C">Carol Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Olsson%2C+C">Catherine Olsson</a>, <a href="/search/cs?searchtype=author&amp;query=Olah%2C+C">Christopher Olah</a>, <a href="/search/cs?searchtype=author&amp;query=Hernandez%2C+D">Danny Hernandez</a>, <a href="/search/cs?searchtype=author&amp;query=Drain%2C+D">Dawn Drain</a>, <a href="/search/cs?searchtype=author&amp;query=Ganguli%2C+D">Deep Ganguli</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+D">Dustin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Tran-Johnson%2C+E">Eli Tran-Johnson</a>, <a href="/search/cs?searchtype=author&amp;query=Perez%2C+E">Ethan Perez</a>, <a href="/search/cs?searchtype=author&amp;query=Kerr%2C+J">Jamie Kerr</a>, <a href="/search/cs?searchtype=author&amp;query=Mueller%2C+J">Jared Mueller</a>, <a href="/search/cs?searchtype=author&amp;query=Ladish%2C+J">Jeffrey Ladish</a>, <a href="/search/cs?searchtype=author&amp;query=Landau%2C+J">Joshua Landau</a>, <a href="/search/cs?searchtype=author&amp;query=Ndousse%2C+K">Kamal Ndousse</a>, <a href="/search/cs?searchtype=author&amp;query=Lukosuite%2C+K">Kamile Lukosuite</a> , et al. (26 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2212.08073v1-abstract-short" style="display: inline;"> As AI systems become more capable, we would like to enlist their help to supervise other AIs. We experiment with methods for training a harmless AI assistant through self-improvement, without any human labels identifying harmful outputs. The only human oversight is provided through a list of rules or principles, and so we refer to the method as &#39;Constitutional AI&#39;. The process involves both a supe&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.08073v1-abstract-full').style.display = 'inline'; document.getElementById('2212.08073v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2212.08073v1-abstract-full" style="display: none;"> As AI systems become more capable, we would like to enlist their help to supervise other AIs. We experiment with methods for training a harmless AI assistant through self-improvement, without any human labels identifying harmful outputs. The only human oversight is provided through a list of rules or principles, and so we refer to the method as &#39;Constitutional AI&#39;. The process involves both a supervised learning and a reinforcement learning phase. In the supervised phase we sample from an initial model, then generate self-critiques and revisions, and then finetune the original model on revised responses. In the RL phase, we sample from the finetuned model, use a model to evaluate which of the two samples is better, and then train a preference model from this dataset of AI preferences. We then train with RL using the preference model as the reward signal, i.e. we use &#39;RL from AI Feedback&#39; (RLAIF). As a result we are able to train a harmless but non-evasive AI assistant that engages with harmful queries by explaining its objections to them. Both the SL and RL methods can leverage chain-of-thought style reasoning to improve the human-judged performance and transparency of AI decision making. These methods make it possible to control AI behavior more precisely and with far fewer human labels. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.08073v1-abstract-full').style.display = 'none'; document.getElementById('2212.08073v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.03540">arXiv:2211.03540</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2211.03540">pdf</a>, <a href="https://arxiv.org/format/2211.03540">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Measuring Progress on Scalable Oversight for Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bowman%2C+S+R">Samuel R. Bowman</a>, <a href="/search/cs?searchtype=author&amp;query=Hyun%2C+J">Jeeyoon Hyun</a>, <a href="/search/cs?searchtype=author&amp;query=Perez%2C+E">Ethan Perez</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+E">Edwin Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Pettit%2C+C">Craig Pettit</a>, <a href="/search/cs?searchtype=author&amp;query=Heiner%2C+S">Scott Heiner</a>, <a href="/search/cs?searchtype=author&amp;query=Luko%C5%A1i%C5%ABt%C4%97%2C+K">Kamil臈 Luko拧i奴t臈</a>, <a href="/search/cs?searchtype=author&amp;query=Askell%2C+A">Amanda Askell</a>, <a href="/search/cs?searchtype=author&amp;query=Jones%2C+A">Andy Jones</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+A">Anna Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Goldie%2C+A">Anna Goldie</a>, <a href="/search/cs?searchtype=author&amp;query=Mirhoseini%2C+A">Azalia Mirhoseini</a>, <a href="/search/cs?searchtype=author&amp;query=McKinnon%2C+C">Cameron McKinnon</a>, <a href="/search/cs?searchtype=author&amp;query=Olah%2C+C">Christopher Olah</a>, <a href="/search/cs?searchtype=author&amp;query=Amodei%2C+D">Daniela Amodei</a>, <a href="/search/cs?searchtype=author&amp;query=Amodei%2C+D">Dario Amodei</a>, <a href="/search/cs?searchtype=author&amp;query=Drain%2C+D">Dawn Drain</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+D">Dustin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Tran-Johnson%2C+E">Eli Tran-Johnson</a>, <a href="/search/cs?searchtype=author&amp;query=Kernion%2C+J">Jackson Kernion</a>, <a href="/search/cs?searchtype=author&amp;query=Kerr%2C+J">Jamie Kerr</a>, <a href="/search/cs?searchtype=author&amp;query=Mueller%2C+J">Jared Mueller</a>, <a href="/search/cs?searchtype=author&amp;query=Ladish%2C+J">Jeffrey Ladish</a>, <a href="/search/cs?searchtype=author&amp;query=Landau%2C+J">Joshua Landau</a>, <a href="/search/cs?searchtype=author&amp;query=Ndousse%2C+K">Kamal Ndousse</a> , et al. (21 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.03540v2-abstract-short" style="display: inline;"> Developing safe and useful general-purpose AI systems will require us to make progress on scalable oversight: the problem of supervising systems that potentially outperform us on most skills relevant to the task at hand. Empirical work on this problem is not straightforward, since we do not yet have systems that broadly exceed our abilities. This paper discusses one of the major ways we think abou&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.03540v2-abstract-full').style.display = 'inline'; document.getElementById('2211.03540v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.03540v2-abstract-full" style="display: none;"> Developing safe and useful general-purpose AI systems will require us to make progress on scalable oversight: the problem of supervising systems that potentially outperform us on most skills relevant to the task at hand. Empirical work on this problem is not straightforward, since we do not yet have systems that broadly exceed our abilities. This paper discusses one of the major ways we think about this problem, with a focus on ways it can be studied empirically. We first present an experimental design centered on tasks for which human specialists succeed but unaided humans and current general AI systems fail. We then present a proof-of-concept experiment meant to demonstrate a key feature of this experimental design and show its viability with two question-answering tasks: MMLU and time-limited QuALITY. On these tasks, we find that human participants who interact with an unreliable large-language-model dialog assistant through chat -- a trivial baseline strategy for scalable oversight -- substantially outperform both the model alone and their own unaided performance. These results are an encouraging sign that scalable oversight will be tractable to study with present models and bolster recent findings that large language models can productively assist humans with difficult tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.03540v2-abstract-full').style.display = 'none'; document.getElementById('2211.03540v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">v2 fixes a few typos from v1</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2209.11895">arXiv:2209.11895</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2209.11895">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> In-context Learning and Induction Heads </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Olsson%2C+C">Catherine Olsson</a>, <a href="/search/cs?searchtype=author&amp;query=Elhage%2C+N">Nelson Elhage</a>, <a href="/search/cs?searchtype=author&amp;query=Nanda%2C+N">Neel Nanda</a>, <a href="/search/cs?searchtype=author&amp;query=Joseph%2C+N">Nicholas Joseph</a>, <a href="/search/cs?searchtype=author&amp;query=DasSarma%2C+N">Nova DasSarma</a>, <a href="/search/cs?searchtype=author&amp;query=Henighan%2C+T">Tom Henighan</a>, <a href="/search/cs?searchtype=author&amp;query=Mann%2C+B">Ben Mann</a>, <a href="/search/cs?searchtype=author&amp;query=Askell%2C+A">Amanda Askell</a>, <a href="/search/cs?searchtype=author&amp;query=Bai%2C+Y">Yuntao Bai</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+A">Anna Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Conerly%2C+T">Tom Conerly</a>, <a href="/search/cs?searchtype=author&amp;query=Drain%2C+D">Dawn Drain</a>, <a href="/search/cs?searchtype=author&amp;query=Ganguli%2C+D">Deep Ganguli</a>, <a href="/search/cs?searchtype=author&amp;query=Hatfield-Dodds%2C+Z">Zac Hatfield-Dodds</a>, <a href="/search/cs?searchtype=author&amp;query=Hernandez%2C+D">Danny Hernandez</a>, <a href="/search/cs?searchtype=author&amp;query=Johnston%2C+S">Scott Johnston</a>, <a href="/search/cs?searchtype=author&amp;query=Jones%2C+A">Andy Jones</a>, <a href="/search/cs?searchtype=author&amp;query=Kernion%2C+J">Jackson Kernion</a>, <a href="/search/cs?searchtype=author&amp;query=Lovitt%2C+L">Liane Lovitt</a>, <a href="/search/cs?searchtype=author&amp;query=Ndousse%2C+K">Kamal Ndousse</a>, <a href="/search/cs?searchtype=author&amp;query=Amodei%2C+D">Dario Amodei</a>, <a href="/search/cs?searchtype=author&amp;query=Brown%2C+T">Tom Brown</a>, <a href="/search/cs?searchtype=author&amp;query=Clark%2C+J">Jack Clark</a>, <a href="/search/cs?searchtype=author&amp;query=Kaplan%2C+J">Jared Kaplan</a>, <a href="/search/cs?searchtype=author&amp;query=McCandlish%2C+S">Sam McCandlish</a> , et al. (1 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2209.11895v1-abstract-short" style="display: inline;"> &#34;Induction heads&#34; are attention heads that implement a simple algorithm to complete token sequences like [A][B] ... [A] -&gt; [B]. In this work, we present preliminary and indirect evidence for a hypothesis that induction heads might constitute the mechanism for the majority of all &#34;in-context learning&#34; in large transformer models (i.e. decreasing loss at increasing token indices). We find that induc&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.11895v1-abstract-full').style.display = 'inline'; document.getElementById('2209.11895v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2209.11895v1-abstract-full" style="display: none;"> &#34;Induction heads&#34; are attention heads that implement a simple algorithm to complete token sequences like [A][B] ... [A] -&gt; [B]. In this work, we present preliminary and indirect evidence for a hypothesis that induction heads might constitute the mechanism for the majority of all &#34;in-context learning&#34; in large transformer models (i.e. decreasing loss at increasing token indices). We find that induction heads develop at precisely the same point as a sudden sharp increase in in-context learning ability, visible as a bump in the training loss. We present six complementary lines of evidence, arguing that induction heads may be the mechanistic source of general in-context learning in transformer models of any size. For small attention-only models, we present strong, causal evidence; for larger models with MLPs, we present correlational evidence. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.11895v1-abstract-full').style.display = 'none'; document.getElementById('2209.11895v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 September, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2209.07858">arXiv:2209.07858</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2209.07858">pdf</a>, <a href="https://arxiv.org/format/2209.07858">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> Red Teaming Language Models to Reduce Harms: Methods, Scaling Behaviors, and Lessons Learned </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ganguli%2C+D">Deep Ganguli</a>, <a href="/search/cs?searchtype=author&amp;query=Lovitt%2C+L">Liane Lovitt</a>, <a href="/search/cs?searchtype=author&amp;query=Kernion%2C+J">Jackson Kernion</a>, <a href="/search/cs?searchtype=author&amp;query=Askell%2C+A">Amanda Askell</a>, <a href="/search/cs?searchtype=author&amp;query=Bai%2C+Y">Yuntao Bai</a>, <a href="/search/cs?searchtype=author&amp;query=Kadavath%2C+S">Saurav Kadavath</a>, <a href="/search/cs?searchtype=author&amp;query=Mann%2C+B">Ben Mann</a>, <a href="/search/cs?searchtype=author&amp;query=Perez%2C+E">Ethan Perez</a>, <a href="/search/cs?searchtype=author&amp;query=Schiefer%2C+N">Nicholas Schiefer</a>, <a href="/search/cs?searchtype=author&amp;query=Ndousse%2C+K">Kamal Ndousse</a>, <a href="/search/cs?searchtype=author&amp;query=Jones%2C+A">Andy Jones</a>, <a href="/search/cs?searchtype=author&amp;query=Bowman%2C+S">Sam Bowman</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+A">Anna Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Conerly%2C+T">Tom Conerly</a>, <a href="/search/cs?searchtype=author&amp;query=DasSarma%2C+N">Nova DasSarma</a>, <a href="/search/cs?searchtype=author&amp;query=Drain%2C+D">Dawn Drain</a>, <a href="/search/cs?searchtype=author&amp;query=Elhage%2C+N">Nelson Elhage</a>, <a href="/search/cs?searchtype=author&amp;query=El-Showk%2C+S">Sheer El-Showk</a>, <a href="/search/cs?searchtype=author&amp;query=Fort%2C+S">Stanislav Fort</a>, <a href="/search/cs?searchtype=author&amp;query=Hatfield-Dodds%2C+Z">Zac Hatfield-Dodds</a>, <a href="/search/cs?searchtype=author&amp;query=Henighan%2C+T">Tom Henighan</a>, <a href="/search/cs?searchtype=author&amp;query=Hernandez%2C+D">Danny Hernandez</a>, <a href="/search/cs?searchtype=author&amp;query=Hume%2C+T">Tristan Hume</a>, <a href="/search/cs?searchtype=author&amp;query=Jacobson%2C+J">Josh Jacobson</a>, <a href="/search/cs?searchtype=author&amp;query=Johnston%2C+S">Scott Johnston</a> , et al. (11 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2209.07858v2-abstract-short" style="display: inline;"> We describe our early efforts to red team language models in order to simultaneously discover, measure, and attempt to reduce their potentially harmful outputs. We make three main contributions. First, we investigate scaling behaviors for red teaming across 3 model sizes (2.7B, 13B, and 52B parameters) and 4 model types: a plain language model (LM); an LM prompted to be helpful, honest, and harmle&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.07858v2-abstract-full').style.display = 'inline'; document.getElementById('2209.07858v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2209.07858v2-abstract-full" style="display: none;"> We describe our early efforts to red team language models in order to simultaneously discover, measure, and attempt to reduce their potentially harmful outputs. We make three main contributions. First, we investigate scaling behaviors for red teaming across 3 model sizes (2.7B, 13B, and 52B parameters) and 4 model types: a plain language model (LM); an LM prompted to be helpful, honest, and harmless; an LM with rejection sampling; and a model trained to be helpful and harmless using reinforcement learning from human feedback (RLHF). We find that the RLHF models are increasingly difficult to red team as they scale, and we find a flat trend with scale for the other model types. Second, we release our dataset of 38,961 red team attacks for others to analyze and learn from. We provide our own analysis of the data and find a variety of harmful outputs, which range from offensive language to more subtly harmful non-violent unethical outputs. Third, we exhaustively describe our instructions, processes, statistical methodologies, and uncertainty about red teaming. We hope that this transparency accelerates our ability to work together as a community in order to develop shared norms, practices, and technical standards for how to red team language models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.07858v2-abstract-full').style.display = 'none'; document.getElementById('2209.07858v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 August, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2207.05221">arXiv:2207.05221</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2207.05221">pdf</a>, <a href="https://arxiv.org/format/2207.05221">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Language Models (Mostly) Know What They Know </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kadavath%2C+S">Saurav Kadavath</a>, <a href="/search/cs?searchtype=author&amp;query=Conerly%2C+T">Tom Conerly</a>, <a href="/search/cs?searchtype=author&amp;query=Askell%2C+A">Amanda Askell</a>, <a href="/search/cs?searchtype=author&amp;query=Henighan%2C+T">Tom Henighan</a>, <a href="/search/cs?searchtype=author&amp;query=Drain%2C+D">Dawn Drain</a>, <a href="/search/cs?searchtype=author&amp;query=Perez%2C+E">Ethan Perez</a>, <a href="/search/cs?searchtype=author&amp;query=Schiefer%2C+N">Nicholas Schiefer</a>, <a href="/search/cs?searchtype=author&amp;query=Hatfield-Dodds%2C+Z">Zac Hatfield-Dodds</a>, <a href="/search/cs?searchtype=author&amp;query=DasSarma%2C+N">Nova DasSarma</a>, <a href="/search/cs?searchtype=author&amp;query=Tran-Johnson%2C+E">Eli Tran-Johnson</a>, <a href="/search/cs?searchtype=author&amp;query=Johnston%2C+S">Scott Johnston</a>, <a href="/search/cs?searchtype=author&amp;query=El-Showk%2C+S">Sheer El-Showk</a>, <a href="/search/cs?searchtype=author&amp;query=Jones%2C+A">Andy Jones</a>, <a href="/search/cs?searchtype=author&amp;query=Elhage%2C+N">Nelson Elhage</a>, <a href="/search/cs?searchtype=author&amp;query=Hume%2C+T">Tristan Hume</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+A">Anna Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Bai%2C+Y">Yuntao Bai</a>, <a href="/search/cs?searchtype=author&amp;query=Bowman%2C+S">Sam Bowman</a>, <a href="/search/cs?searchtype=author&amp;query=Fort%2C+S">Stanislav Fort</a>, <a href="/search/cs?searchtype=author&amp;query=Ganguli%2C+D">Deep Ganguli</a>, <a href="/search/cs?searchtype=author&amp;query=Hernandez%2C+D">Danny Hernandez</a>, <a href="/search/cs?searchtype=author&amp;query=Jacobson%2C+J">Josh Jacobson</a>, <a href="/search/cs?searchtype=author&amp;query=Kernion%2C+J">Jackson Kernion</a>, <a href="/search/cs?searchtype=author&amp;query=Kravec%2C+S">Shauna Kravec</a>, <a href="/search/cs?searchtype=author&amp;query=Lovitt%2C+L">Liane Lovitt</a> , et al. (11 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2207.05221v4-abstract-short" style="display: inline;"> We study whether language models can evaluate the validity of their own claims and predict which questions they will be able to answer correctly. We first show that larger models are well-calibrated on diverse multiple choice and true/false questions when they are provided in the right format. Thus we can approach self-evaluation on open-ended sampling tasks by asking models to first propose answe&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.05221v4-abstract-full').style.display = 'inline'; document.getElementById('2207.05221v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2207.05221v4-abstract-full" style="display: none;"> We study whether language models can evaluate the validity of their own claims and predict which questions they will be able to answer correctly. We first show that larger models are well-calibrated on diverse multiple choice and true/false questions when they are provided in the right format. Thus we can approach self-evaluation on open-ended sampling tasks by asking models to first propose answers, and then to evaluate the probability &#34;P(True)&#34; that their answers are correct. We find encouraging performance, calibration, and scaling for P(True) on a diverse array of tasks. Performance at self-evaluation further improves when we allow models to consider many of their own samples before predicting the validity of one specific possibility. Next, we investigate whether models can be trained to predict &#34;P(IK)&#34;, the probability that &#34;I know&#34; the answer to a question, without reference to any particular proposed answer. Models perform well at predicting P(IK) and partially generalize across tasks, though they struggle with calibration of P(IK) on new tasks. The predicted P(IK) probabilities also increase appropriately in the presence of relevant source materials in the context, and in the presence of hints towards the solution of mathematical word problems. We hope these observations lay the groundwork for training more honest models, and for investigating how honesty generalizes to cases where models are trained on objectives other than the imitation of human writing. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.05221v4-abstract-full').style.display = 'none'; document.getElementById('2207.05221v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 July, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">23+17 pages; refs added, typos fixed</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2204.05862">arXiv:2204.05862</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2204.05862">pdf</a>, <a href="https://arxiv.org/format/2204.05862">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Training a Helpful and Harmless Assistant with Reinforcement Learning from Human Feedback </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bai%2C+Y">Yuntao Bai</a>, <a href="/search/cs?searchtype=author&amp;query=Jones%2C+A">Andy Jones</a>, <a href="/search/cs?searchtype=author&amp;query=Ndousse%2C+K">Kamal Ndousse</a>, <a href="/search/cs?searchtype=author&amp;query=Askell%2C+A">Amanda Askell</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+A">Anna Chen</a>, <a href="/search/cs?searchtype=author&amp;query=DasSarma%2C+N">Nova DasSarma</a>, <a href="/search/cs?searchtype=author&amp;query=Drain%2C+D">Dawn Drain</a>, <a href="/search/cs?searchtype=author&amp;query=Fort%2C+S">Stanislav Fort</a>, <a href="/search/cs?searchtype=author&amp;query=Ganguli%2C+D">Deep Ganguli</a>, <a href="/search/cs?searchtype=author&amp;query=Henighan%2C+T">Tom Henighan</a>, <a href="/search/cs?searchtype=author&amp;query=Joseph%2C+N">Nicholas Joseph</a>, <a href="/search/cs?searchtype=author&amp;query=Kadavath%2C+S">Saurav Kadavath</a>, <a href="/search/cs?searchtype=author&amp;query=Kernion%2C+J">Jackson Kernion</a>, <a href="/search/cs?searchtype=author&amp;query=Conerly%2C+T">Tom Conerly</a>, <a href="/search/cs?searchtype=author&amp;query=El-Showk%2C+S">Sheer El-Showk</a>, <a href="/search/cs?searchtype=author&amp;query=Elhage%2C+N">Nelson Elhage</a>, <a href="/search/cs?searchtype=author&amp;query=Hatfield-Dodds%2C+Z">Zac Hatfield-Dodds</a>, <a href="/search/cs?searchtype=author&amp;query=Hernandez%2C+D">Danny Hernandez</a>, <a href="/search/cs?searchtype=author&amp;query=Hume%2C+T">Tristan Hume</a>, <a href="/search/cs?searchtype=author&amp;query=Johnston%2C+S">Scott Johnston</a>, <a href="/search/cs?searchtype=author&amp;query=Kravec%2C+S">Shauna Kravec</a>, <a href="/search/cs?searchtype=author&amp;query=Lovitt%2C+L">Liane Lovitt</a>, <a href="/search/cs?searchtype=author&amp;query=Nanda%2C+N">Neel Nanda</a>, <a href="/search/cs?searchtype=author&amp;query=Olsson%2C+C">Catherine Olsson</a>, <a href="/search/cs?searchtype=author&amp;query=Amodei%2C+D">Dario Amodei</a> , et al. (6 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2204.05862v1-abstract-short" style="display: inline;"> We apply preference modeling and reinforcement learning from human feedback (RLHF) to finetune language models to act as helpful and harmless assistants. We find this alignment training improves performance on almost all NLP evaluations, and is fully compatible with training for specialized skills such as python coding and summarization. We explore an iterated online mode of training, where prefer&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.05862v1-abstract-full').style.display = 'inline'; document.getElementById('2204.05862v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2204.05862v1-abstract-full" style="display: none;"> We apply preference modeling and reinforcement learning from human feedback (RLHF) to finetune language models to act as helpful and harmless assistants. We find this alignment training improves performance on almost all NLP evaluations, and is fully compatible with training for specialized skills such as python coding and summarization. We explore an iterated online mode of training, where preference models and RL policies are updated on a weekly cadence with fresh human feedback data, efficiently improving our datasets and models. Finally, we investigate the robustness of RLHF training, and identify a roughly linear relation between the RL reward and the square root of the KL divergence between the policy and its initialization. Alongside our main results, we perform peripheral analyses on calibration, competing objectives, and the use of OOD detection, compare our models with human writers, and provide samples from our models using prompts appearing in recent related work. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.05862v1-abstract-full').style.display = 'none'; document.getElementById('2204.05862v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 April, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Data available at https://github.com/anthropics/hh-rlhf</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2202.07785">arXiv:2202.07785</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2202.07785">pdf</a>, <a href="https://arxiv.org/format/2202.07785">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3531146.3533229">10.1145/3531146.3533229 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Predictability and Surprise in Large Generative Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ganguli%2C+D">Deep Ganguli</a>, <a href="/search/cs?searchtype=author&amp;query=Hernandez%2C+D">Danny Hernandez</a>, <a href="/search/cs?searchtype=author&amp;query=Lovitt%2C+L">Liane Lovitt</a>, <a href="/search/cs?searchtype=author&amp;query=DasSarma%2C+N">Nova DasSarma</a>, <a href="/search/cs?searchtype=author&amp;query=Henighan%2C+T">Tom Henighan</a>, <a href="/search/cs?searchtype=author&amp;query=Jones%2C+A">Andy Jones</a>, <a href="/search/cs?searchtype=author&amp;query=Joseph%2C+N">Nicholas Joseph</a>, <a href="/search/cs?searchtype=author&amp;query=Kernion%2C+J">Jackson Kernion</a>, <a href="/search/cs?searchtype=author&amp;query=Mann%2C+B">Ben Mann</a>, <a href="/search/cs?searchtype=author&amp;query=Askell%2C+A">Amanda Askell</a>, <a href="/search/cs?searchtype=author&amp;query=Bai%2C+Y">Yuntao Bai</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+A">Anna Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Conerly%2C+T">Tom Conerly</a>, <a href="/search/cs?searchtype=author&amp;query=Drain%2C+D">Dawn Drain</a>, <a href="/search/cs?searchtype=author&amp;query=Elhage%2C+N">Nelson Elhage</a>, <a href="/search/cs?searchtype=author&amp;query=Showk%2C+S+E">Sheer El Showk</a>, <a href="/search/cs?searchtype=author&amp;query=Fort%2C+S">Stanislav Fort</a>, <a href="/search/cs?searchtype=author&amp;query=Hatfield-Dodds%2C+Z">Zac Hatfield-Dodds</a>, <a href="/search/cs?searchtype=author&amp;query=Johnston%2C+S">Scott Johnston</a>, <a href="/search/cs?searchtype=author&amp;query=Kravec%2C+S">Shauna Kravec</a>, <a href="/search/cs?searchtype=author&amp;query=Nanda%2C+N">Neel Nanda</a>, <a href="/search/cs?searchtype=author&amp;query=Ndousse%2C+K">Kamal Ndousse</a>, <a href="/search/cs?searchtype=author&amp;query=Olsson%2C+C">Catherine Olsson</a>, <a href="/search/cs?searchtype=author&amp;query=Amodei%2C+D">Daniela Amodei</a>, <a href="/search/cs?searchtype=author&amp;query=Amodei%2C+D">Dario Amodei</a> , et al. (5 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2202.07785v2-abstract-short" style="display: inline;"> Large-scale pre-training has recently emerged as a technique for creating capable, general purpose, generative models such as GPT-3, Megatron-Turing NLG, Gopher, and many others. In this paper, we highlight a counterintuitive property of such models and discuss the policy implications of this property. Namely, these generative models have an unusual combination of predictable loss on a broad train&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.07785v2-abstract-full').style.display = 'inline'; document.getElementById('2202.07785v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2202.07785v2-abstract-full" style="display: none;"> Large-scale pre-training has recently emerged as a technique for creating capable, general purpose, generative models such as GPT-3, Megatron-Turing NLG, Gopher, and many others. In this paper, we highlight a counterintuitive property of such models and discuss the policy implications of this property. Namely, these generative models have an unusual combination of predictable loss on a broad training distribution (as embodied in their &#34;scaling laws&#34;), and unpredictable specific capabilities, inputs, and outputs. We believe that the high-level predictability and appearance of useful capabilities drives rapid development of such models, while the unpredictable qualities make it difficult to anticipate the consequences of model deployment. We go through examples of how this combination can lead to socially harmful behavior with examples from the literature and real world observations, and we also perform two novel experiments to illustrate our point about harms from unpredictability. Furthermore, we analyze how these conflicting properties combine to give model developers various motivations for deploying these models, and challenges that can hinder deployment. We conclude with a list of possible interventions the AI community may take to increase the chance of these models having a beneficial impact. We intend this paper to be useful to policymakers who want to understand and regulate AI systems, technologists who care about the potential policy impact of their work, and academics who want to analyze, critique, and potentially develop large generative models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.07785v2-abstract-full').style.display = 'none'; document.getElementById('2202.07785v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 February, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Updated to reflect the version submitted (and accepted) to ACM FAccT &#39;22. This update incorporates feedback from peer-review and fixes minor typos. See open access FAccT conference version at: https://dl.acm.org/doi/abs/10.1145/3531146.3533229</span> </p> </li> </ol> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

Pages: 1 2 3 4 5 6 7 8 9 10