Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–9 of 9 results for author: <span class="mathjax">Teehan, R</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Teehan%2C+R">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Teehan, R"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Teehan%2C+R&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Teehan, R"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08324">arXiv:2411.08324</a> <span> [<a href="https://arxiv.org/pdf/2411.08324">pdf</a>, <a href="https://arxiv.org/format/2411.08324">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Are LLMs Prescient? A Continuous Evaluation using Daily News as the Oracle </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dai%2C+H">Hui Dai</a>, <a href="/search/cs?searchtype=author&query=Teehan%2C+R">Ryan Teehan</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+M">Mengye Ren</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08324v1-abstract-short" style="display: inline;"> Many existing evaluation benchmarks for Large Language Models (LLMs) quickly become outdated due to the emergence of new models and training data. These benchmarks also fall short in assessing how LLM performance changes over time, as they consist of static questions without a temporal dimension. To address these limitations, we propose using future event prediction as a continuous evaluation meth… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08324v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08324v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08324v1-abstract-full" style="display: none;"> Many existing evaluation benchmarks for Large Language Models (LLMs) quickly become outdated due to the emergence of new models and training data. These benchmarks also fall short in assessing how LLM performance changes over time, as they consist of static questions without a temporal dimension. To address these limitations, we propose using future event prediction as a continuous evaluation method to assess LLMs' temporal generalization and forecasting abilities. Our benchmark, Daily Oracle, automatically generates question-answer (QA) pairs from daily news, challenging LLMs to predict "future" event outcomes. Our findings reveal that as pre-training data becomes outdated, LLM performance degrades over time. 
While Retrieval Augmented Generation (RAG) has the potential to enhance prediction accuracy, the performance degradation pattern persists, highlighting the need for continuous model updates. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08324v1-abstract-full').style.display = 'none'; document.getElementById('2411.08324v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.02226">arXiv:2408.02226</a> <span> [<a href="https://arxiv.org/pdf/2408.02226">pdf</a>, <a href="https://arxiv.org/format/2408.02226">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ProCreate, Don't Reproduce! Propulsive Energy Diffusion for Creative Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lu%2C+J">Jack Lu</a>, <a href="/search/cs?searchtype=author&query=Teehan%2C+R">Ryan Teehan</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+M">Mengye Ren</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.02226v2-abstract-short" style="display: inline;"> In this paper, we propose ProCreate, a simple and easy-to-implement method to improve sample diversity and creativity of diffusion-based image generative models and to prevent training data reproduction. ProCreate operates on a set of reference images and actively propels the generated image embedding away from the reference embeddings during the generation process. We propose FSCG-8 (Few-Shot Cre… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.02226v2-abstract-full').style.display = 'inline'; document.getElementById('2408.02226v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.02226v2-abstract-full" style="display: none;"> In this paper, we propose ProCreate, a simple and easy-to-implement method to improve sample diversity and creativity of diffusion-based image generative models and to prevent training data reproduction. ProCreate operates on a set of reference images and actively propels the generated image embedding away from the reference embeddings during the generation process. We propose FSCG-8 (Few-Shot Creative Generation 8), a few-shot creative generation dataset on eight different categories -- encompassing different concepts, styles, and settings -- in which ProCreate achieves the highest sample diversity and fidelity. Furthermore, we show that ProCreate is effective at preventing replicating training data in a large-scale evaluation using training text prompts. Code and FSCG-8 are available at https://github.com/Agentic-Learning-AI-Lab/procreate-diffusion-public. The project page is available at https://procreate-diffusion.github.io. 
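   The continuous-evaluation idea above lends itself to a small sketch: bucket a model's accuracy on news-derived questions by the month they were asked, so temporal degradation becomes visible. This is an illustrative reconstruction, not the Daily Oracle code; `OracleQuestion` and `ask_model` are hypothetical names.

```python
from dataclasses import dataclass
from datetime import date

@dataclass
class OracleQuestion:
    asked_on: date   # date the news-derived question was generated
    question: str    # e.g. "Will X happen by <date>? Answer yes or no."
    answer: str      # ground-truth outcome, resolved from later news

def accuracy_by_month(questions, ask_model):
    """Bucket accuracy by question month to expose temporal degradation."""
    buckets: dict[tuple[int, int], tuple[int, int]] = {}
    for q in questions:
        key = (q.asked_on.year, q.asked_on.month)
        correct = ask_model(q.question).strip().lower() == q.answer.lower()
        hits, total = buckets.get(key, (0, 0))
        buckets[key] = (hits + int(correct), total + 1)
    return {k: hits / total for k, (hits, total) in sorted(buckets.items())}
```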
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.02226v2-abstract-full').style.display = 'none'; document.getElementById('2408.02226v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ECCV 2024. Project page: https://procreate-diffusion.github.io</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.15362">arXiv:2403.15362</a> <span> [<a href="https://arxiv.org/pdf/2403.15362">pdf</a>, <a href="https://arxiv.org/format/2403.15362">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> CoLLEGe: Concept Embedding Generation for Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Teehan%2C+R">Ryan Teehan</a>, <a href="/search/cs?searchtype=author&query=Lake%2C+B">Brenden Lake</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+M">Mengye Ren</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.15362v2-abstract-short" style="display: inline;"> Current language models are unable to quickly learn new concepts on the fly, often requiring a more involved finetuning process to learn robustly. Prompting in-context is not robust to context distractions, and often fails to confer much information about the new concepts. Classic methods for few-shot word learning in NLP, relying on global word vectors, are less applicable to large language model… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.15362v2-abstract-full').style.display = 'inline'; document.getElementById('2403.15362v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.15362v2-abstract-full" style="display: none;"> Current language models are unable to quickly learn new concepts on the fly, often requiring a more involved finetuning process to learn robustly. Prompting in-context is not robust to context distractions, and often fails to confer much information about the new concepts. Classic methods for few-shot word learning in NLP, relying on global word vectors, are less applicable to large language models. In this paper, we introduce a novel approach named CoLLEGe (Concept Learning with Language Embedding Generation) to modernize few-shot concept learning. CoLLEGe is a meta-learning framework capable of generating flexible embeddings for new concepts using a small number of example sentences or definitions. 
3. arXiv:2403.15362 (https://arxiv.org/abs/2403.15362) [pdf, other]
   cs.CL (Computation and Language); cs.AI (Artificial Intelligence)

   CoLLEGe: Concept Embedding Generation for Large Language Models

   Authors: Ryan Teehan, Brenden Lake, Mengye Ren

   Abstract: Current language models are unable to quickly learn new concepts on the fly, often requiring a more involved finetuning process to learn robustly. Prompting in-context is not robust to context distractions, and often fails to confer much information about the new concepts. Classic methods for few-shot word learning in NLP, relying on global word vectors, are less applicable to large language models. In this paper, we introduce a novel approach named CoLLEGe (Concept Learning with Language Embedding Generation) to modernize few-shot concept learning. CoLLEGe is a meta-learning framework capable of generating flexible embeddings for new concepts using a small number of example sentences or definitions. Our primary meta-learning objective is simply to facilitate a language model to make next word predictions in forthcoming sentences, making it compatible with language model pretraining. We design a series of tasks to test new concept learning in challenging real-world scenarios, including new word acquisition, definition inference, and verbal reasoning, and demonstrate that our method succeeds in each setting without task-specific training. Code and data for our project can be found at https://college-concept-learning.github.io/

   Submitted 16 October, 2024; v1 submitted 22 March, 2024; originally announced March 2024.
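   To make the meta-learning setup concrete, here is a minimal sketch in the spirit of the abstract: pool encodings of the support sentences for a new concept into an embedding for a placeholder token, trained so that a frozen LM predicts the next words of subsequent sentences. The pooling, shapes, and module names are illustrative assumptions, not the released implementation.

```python
import torch
import torch.nn as nn

class ConceptEmbeddingGenerator(nn.Module):
    """Map encodings of a few support sentences to a new-token embedding."""
    def __init__(self, enc_dim: int, lm_dim: int):
        super().__init__()
        self.proj = nn.Linear(enc_dim, lm_dim)

    def forward(self, support_encodings: torch.Tensor) -> torch.Tensor:
        # support_encodings: (n_examples, enc_dim), one vector per sentence
        pooled = support_encodings.mean(dim=0)
        return self.proj(pooled)  # (lm_dim,) embedding for the new token

# Training signal: splice this embedding into a frozen LM's embedding table
# at a placeholder token and minimize the LM's ordinary next-token
# cross-entropy on query sentences that use the new concept.
```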
4. arXiv:2310.03210 (https://arxiv.org/abs/2310.03210) [pdf, ps, other]
   cs.CL (Computation and Language); cs.CY (Computers and Society)

   Can Language Models Employ the Socratic Method? Experiments with Code Debugging

   Authors: Erfan Al-Hossami, Razvan Bunescu, Justin Smith, Ryan Teehan

   Abstract: When employing the Socratic method of teaching, instructors guide students toward solving a problem on their own rather than providing the solution directly. While this strategy can substantially improve learning outcomes, it is usually time-consuming and cognitively demanding. Automated Socratic conversational agents can augment human instruction and provide the necessary scale; however, their development is hampered by the lack of suitable data for training and evaluation. In this paper, we introduce a manually created dataset of multi-turn Socratic advice that is aimed at helping a novice programmer fix buggy solutions to simple computational problems. The dataset is then used for benchmarking the Socratic debugging abilities of a number of language models, ranging from fine-tuning the instruction-based text-to-text transformer Flan-T5 to zero-shot and chain of thought prompting of the much larger GPT-4. The code and datasets are made freely available for research at https://github.com/taisazero/socratic-debugging-benchmark.

   Submitted 4 October, 2023; originally announced October 2023.

   Comments: 8 pages, 2 tables. To be published in Proceedings of the 2024 Technical Symposium on Computer Science Education (SIGCSE'24)
5. arXiv:2211.05100 (https://arxiv.org/abs/2211.05100) [pdf, other]
   cs.CL (Computation and Language)

   BLOOM: A 176B-Parameter Open-Access Multilingual Language Model

   Authors: BigScience Workshop: Teven Le Scao, Angela Fan, Christopher Akiki, Ellie Pavlick, Suzana Ilić, Daniel Hesslow, Roman Castagné, Alexandra Sasha Luccioni, François Yvon, Matthias Gallé, Jonathan Tow, Alexander M. Rush, Stella Biderman, Albert Webson, Pawan Sasanka Ammanamanchi, Thomas Wang, Benoît Sagot, Niklas Muennighoff, Albert Villanova del Moral, Olatunji Ruwase, Rachel Bawden, Stas Bekman, Angelina McMillan-Major, et al. (369 additional authors not shown)

   Abstract: Large language models (LLMs) have been shown to be able to perform new tasks based on a few demonstrations or natural language instructions. While these capabilities have led to widespread adoption, most LLMs are developed by resource-rich organizations and are frequently kept from the public. As a step towards democratizing this powerful technology, we present BLOOM, a 176B-parameter open-access language model designed and built thanks to a collaboration of hundreds of researchers. BLOOM is a decoder-only Transformer language model that was trained on the ROOTS corpus, a dataset comprising hundreds of sources in 46 natural and 13 programming languages (59 in total). We find that BLOOM achieves competitive performance on a wide variety of benchmarks, with stronger results after undergoing multitask prompted finetuning. To facilitate future research and applications using LLMs, we publicly release our models and code under the Responsible AI License.

   Submitted 27 June, 2023; v1 submitted 9 November, 2022; originally announced November 2022.
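   Since the abstract notes the models are publicly released, the checkpoints can be pulled from the Hugging Face Hub with the standard transformers API. The sketch below uses the small bigscience/bloom-560m sibling checkpoint so it runs on modest hardware.

```python
# Requires: pip install transformers torch
from transformers import AutoModelForCausalLM, AutoTokenizer

name = "bigscience/bloom-560m"  # small sibling of the 176B release
tok = AutoTokenizer.from_pretrained(name)
model = AutoModelForCausalLM.from_pretrained(name)

# BLOOM is multilingual, so a non-English prompt works too.
inputs = tok("BLOOM est un modèle de langue multilingue qui", return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=30)
print(tok.decode(out[0], skip_special_tokens=True))
```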
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2206.04615">arXiv:2206.04615</a> <span> [<a href="https://arxiv.org/pdf/2206.04615">pdf</a>, <a href="https://arxiv.org/format/2206.04615">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Beyond the Imitation Game: Quantifying and extrapolating the capabilities of language models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Srivastava%2C+A">Aarohi Srivastava</a>, <a href="/search/cs?searchtype=author&query=Rastogi%2C+A">Abhinav Rastogi</a>, <a href="/search/cs?searchtype=author&query=Rao%2C+A">Abhishek Rao</a>, <a href="/search/cs?searchtype=author&query=Shoeb%2C+A+A+M">Abu Awal Md Shoeb</a>, <a href="/search/cs?searchtype=author&query=Abid%2C+A">Abubakar Abid</a>, <a href="/search/cs?searchtype=author&query=Fisch%2C+A">Adam Fisch</a>, <a href="/search/cs?searchtype=author&query=Brown%2C+A+R">Adam R. Brown</a>, <a href="/search/cs?searchtype=author&query=Santoro%2C+A">Adam Santoro</a>, <a href="/search/cs?searchtype=author&query=Gupta%2C+A">Aditya Gupta</a>, <a href="/search/cs?searchtype=author&query=Garriga-Alonso%2C+A">Adri脿 Garriga-Alonso</a>, <a href="/search/cs?searchtype=author&query=Kluska%2C+A">Agnieszka Kluska</a>, <a href="/search/cs?searchtype=author&query=Lewkowycz%2C+A">Aitor Lewkowycz</a>, <a href="/search/cs?searchtype=author&query=Agarwal%2C+A">Akshat Agarwal</a>, <a href="/search/cs?searchtype=author&query=Power%2C+A">Alethea Power</a>, <a href="/search/cs?searchtype=author&query=Ray%2C+A">Alex Ray</a>, <a href="/search/cs?searchtype=author&query=Warstadt%2C+A">Alex Warstadt</a>, <a href="/search/cs?searchtype=author&query=Kocurek%2C+A+W">Alexander W. Kocurek</a>, <a href="/search/cs?searchtype=author&query=Safaya%2C+A">Ali Safaya</a>, <a href="/search/cs?searchtype=author&query=Tazarv%2C+A">Ali Tazarv</a>, <a href="/search/cs?searchtype=author&query=Xiang%2C+A">Alice Xiang</a>, <a href="/search/cs?searchtype=author&query=Parrish%2C+A">Alicia Parrish</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+A">Allen Nie</a>, <a href="/search/cs?searchtype=author&query=Hussain%2C+A">Aman Hussain</a>, <a href="/search/cs?searchtype=author&query=Askell%2C+A">Amanda Askell</a>, <a href="/search/cs?searchtype=author&query=Dsouza%2C+A">Amanda Dsouza</a> , et al. (426 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2206.04615v3-abstract-short" style="display: inline;"> Language models demonstrate both quantitative improvement and new qualitative capabilities with increasing scale. Despite their potentially transformative impact, these new capabilities are as yet poorly characterized. 
In order to inform future research, prepare for disruptive new model capabilities, and ameliorate socially harmful effects, it is vital that we understand the present and near-futur… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.04615v3-abstract-full').style.display = 'inline'; document.getElementById('2206.04615v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2206.04615v3-abstract-full" style="display: none;"> Language models demonstrate both quantitative improvement and new qualitative capabilities with increasing scale. Despite their potentially transformative impact, these new capabilities are as yet poorly characterized. In order to inform future research, prepare for disruptive new model capabilities, and ameliorate socially harmful effects, it is vital that we understand the present and near-future capabilities and limitations of language models. To address this challenge, we introduce the Beyond the Imitation Game benchmark (BIG-bench). BIG-bench currently consists of 204 tasks, contributed by 450 authors across 132 institutions. Task topics are diverse, drawing problems from linguistics, childhood development, math, common-sense reasoning, biology, physics, social bias, software development, and beyond. BIG-bench focuses on tasks that are believed to be beyond the capabilities of current language models. We evaluate the behavior of OpenAI's GPT models, Google-internal dense transformer architectures, and Switch-style sparse transformers on BIG-bench, across model sizes spanning millions to hundreds of billions of parameters. In addition, a team of human expert raters performed all tasks in order to provide a strong baseline. Findings include: model performance and calibration both improve with scale, but are poor in absolute terms (and when compared with rater performance); performance is remarkably similar across model classes, though with benefits from sparsity; tasks that improve gradually and predictably commonly involve a large knowledge or memorization component, whereas tasks that exhibit "breakthrough" behavior at a critical scale often involve multiple steps or components, or brittle metrics; social bias typically increases with scale in settings with ambiguous context, but this can be improved with prompting. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.04615v3-abstract-full').style.display = 'none'; document.getElementById('2206.04615v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 June, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2022. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">27 pages, 17 figures + references and appendices, repo: https://github.com/google/BIG-bench</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Transactions on Machine Learning Research, May/2022, https://openreview.net/forum?id=uyTL5Bvosj </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2112.02721">arXiv:2112.02721</a> <span> [<a href="https://arxiv.org/pdf/2112.02721">pdf</a>, <a href="https://arxiv.org/format/2112.02721">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> NL-Augmenter: A Framework for Task-Sensitive Natural Language Augmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dhole%2C+K+D">Kaustubh D. Dhole</a>, <a href="/search/cs?searchtype=author&query=Gangal%2C+V">Varun Gangal</a>, <a href="/search/cs?searchtype=author&query=Gehrmann%2C+S">Sebastian Gehrmann</a>, <a href="/search/cs?searchtype=author&query=Gupta%2C+A">Aadesh Gupta</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhenhao Li</a>, <a href="/search/cs?searchtype=author&query=Mahamood%2C+S">Saad Mahamood</a>, <a href="/search/cs?searchtype=author&query=Mahendiran%2C+A">Abinaya Mahendiran</a>, <a href="/search/cs?searchtype=author&query=Mille%2C+S">Simon Mille</a>, <a href="/search/cs?searchtype=author&query=Shrivastava%2C+A">Ashish Shrivastava</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+S">Samson Tan</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+T">Tongshuang Wu</a>, <a href="/search/cs?searchtype=author&query=Sohl-Dickstein%2C+J">Jascha Sohl-Dickstein</a>, <a href="/search/cs?searchtype=author&query=Choi%2C+J+D">Jinho D. Choi</a>, <a href="/search/cs?searchtype=author&query=Hovy%2C+E">Eduard Hovy</a>, <a href="/search/cs?searchtype=author&query=Dusek%2C+O">Ondrej Dusek</a>, <a href="/search/cs?searchtype=author&query=Ruder%2C+S">Sebastian Ruder</a>, <a href="/search/cs?searchtype=author&query=Anand%2C+S">Sajant Anand</a>, <a href="/search/cs?searchtype=author&query=Aneja%2C+N">Nagender Aneja</a>, <a href="/search/cs?searchtype=author&query=Banjade%2C+R">Rabin Banjade</a>, <a href="/search/cs?searchtype=author&query=Barthe%2C+L">Lisa Barthe</a>, <a href="/search/cs?searchtype=author&query=Behnke%2C+H">Hanna Behnke</a>, <a href="/search/cs?searchtype=author&query=Berlot-Attwell%2C+I">Ian Berlot-Attwell</a>, <a href="/search/cs?searchtype=author&query=Boyle%2C+C">Connor Boyle</a>, <a href="/search/cs?searchtype=author&query=Brun%2C+C">Caroline Brun</a>, <a href="/search/cs?searchtype=author&query=Cabezudo%2C+M+A+S">Marco Antonio Sobrevilla Cabezudo</a> , et al. 
7. arXiv:2112.02721 (https://arxiv.org/abs/2112.02721) [pdf, other]
   cs.CL (Computation and Language); cs.AI (Artificial Intelligence); cs.LG (Machine Learning)

   NL-Augmenter: A Framework for Task-Sensitive Natural Language Augmentation

   Authors: Kaustubh D. Dhole, Varun Gangal, Sebastian Gehrmann, Aadesh Gupta, Zhenhao Li, Saad Mahamood, Abinaya Mahendiran, Simon Mille, Ashish Shrivastava, Samson Tan, Tongshuang Wu, Jascha Sohl-Dickstein, Jinho D. Choi, Eduard Hovy, Ondrej Dusek, Sebastian Ruder, Sajant Anand, Nagender Aneja, Rabin Banjade, Lisa Barthe, Hanna Behnke, Ian Berlot-Attwell, Connor Boyle, Caroline Brun, Marco Antonio Sobrevilla Cabezudo, et al. (101 additional authors not shown)

   Abstract: Data augmentation is an important component in the robustness evaluation of models in natural language processing (NLP) and in enhancing the diversity of the data they are trained on. In this paper, we present NL-Augmenter, a new participatory Python-based natural language augmentation framework which supports the creation of both transformations (modifications to the data) and filters (data splits according to specific features). We describe the framework and an initial set of 117 transformations and 23 filters for a variety of natural language tasks. We demonstrate the efficacy of NL-Augmenter by using several of its transformations to analyze the robustness of popular natural language models. The infrastructure, datacards and robustness analysis results are available publicly on the NL-Augmenter repository (https://github.com/GEM-benchmark/NL-Augmenter).

   Submitted 11 October, 2022; v1 submitted 5 December, 2021; originally announced December 2021.

   Comments: 39 pages, repository at https://github.com/GEM-benchmark/NL-Augmenter
8. arXiv:2110.08207 (https://arxiv.org/abs/2110.08207) [pdf, other]
   cs.LG (Machine Learning); cs.CL (Computation and Language)

   Multitask Prompted Training Enables Zero-Shot Task Generalization

   Authors: Victor Sanh, Albert Webson, Colin Raffel, Stephen H. Bach, Lintang Sutawika, Zaid Alyafeai, Antoine Chaffin, Arnaud Stiegler, Teven Le Scao, Arun Raja, Manan Dey, M Saiful Bari, Canwen Xu, Urmish Thakker, Shanya Sharma Sharma, Eliza Szczechla, Taewoon Kim, Gunjan Chhablani, Nihal Nayak, Debajyoti Datta, Jonathan Chang, Mike Tian-Jian Jiang, Han Wang, Matteo Manica, Sheng Shen, et al. (16 additional authors not shown)

   Abstract: Large language models have recently been shown to attain reasonable zero-shot generalization on a diverse set of tasks (Brown et al., 2020). It has been hypothesized that this is a consequence of implicit multitask learning in language models' pretraining (Radford et al., 2019). Can zero-shot generalization instead be directly induced by explicit multitask learning? To test this question at scale, we develop a system for easily mapping any natural language tasks into a human-readable prompted form. We convert a large set of supervised datasets, each with multiple prompts with diverse wording. These prompted datasets allow for benchmarking the ability of a model to perform completely held-out tasks. We fine-tune a pretrained encoder-decoder model (Raffel et al., 2020; Lester et al., 2021) on this multitask mixture covering a wide variety of tasks. The model attains strong zero-shot performance on several standard datasets, often outperforming models up to 16x its size. Further, our approach attains strong performance on a subset of tasks from the BIG-bench benchmark, outperforming models up to 6x its size. All trained models are available at https://github.com/bigscience-workshop/t-zero and all prompts are available at https://github.com/bigscience-workshop/promptsource.

   Submitted 17 March, 2022; v1 submitted 15 October, 2021; originally announced October 2021.

   Comments: ICLR 2022 Spotlight (with extended discussion)
9. arXiv:2110.03111 (https://arxiv.org/abs/2110.03111) [pdf, other]
   cs.CL (Computation and Language)

   Cut the CARP: Fishing for zero-shot story evaluation

   Authors: Shahbuland Matiana, JR Smith, Ryan Teehan, Louis Castricato, Stella Biderman, Leo Gao, Spencer Frazier

   Abstract: Recent advances in large-scale language models (Raffel et al., 2019; Brown et al., 2020) have brought significant qualitative and quantitative improvements in machine-driven text generation. Despite this, generation and evaluation of machine-generated narrative text remains a challenging problem. Objective evaluation of computationally-generated stories may be prohibitively expensive, require meticulously annotated datasets, or may not adequately measure the logical coherence of a generated story's narratological structure. Informed by recent advances in contrastive learning (Radford et al., 2021), we present Contrastive Authoring and Reviewing Pairing (CARP): a scalable, efficient method for performing qualitatively superior, zero-shot evaluation of stories. We show a strong correlation between human evaluation of stories and those of CARP. Model outputs more significantly correlate with corresponding human input than those language-model based methods which utilize finetuning or prompt engineering approaches. We also present and analyze the Story-Critique Dataset, a new corpus composed of 1.3 million aligned story-critique pairs derived from over 80,000 stories. We expect this corpus to be of interest to NLP researchers.

   Submitted 26 October, 2021; v1 submitted 6 October, 2021; originally announced October 2021.

   Comments: 9 pages, 4 figures
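   At inference time, the pairing this abstract describes reduces to comparing encoder outputs. Below is a minimal, hedged sketch; the paired encoders and their contrastive training on the 1.3 million story-critique pairs are the substance of the paper and are only stubbed here.

```python
import torch
import torch.nn.functional as F

def carp_scores(story_emb: torch.Tensor, critique_embs: torch.Tensor) -> torch.Tensor:
    """Cosine similarity between one story embedding and candidate critiques."""
    return F.cosine_similarity(story_emb.unsqueeze(0), critique_embs, dim=1)

# A CLIP-style training step would maximize the similarity of matched
# story/critique pairs against in-batch mismatches (cross-entropy over
# the pairwise similarity matrix).
```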