CINXE.COM

Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script 
src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. 
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" 
role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 164 results for author: <span class="mathjax">Hajishirzi, H</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Hajishirzi%2C+H">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Hajishirzi, H"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Hajishirzi%2C+H&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option 
value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Hajishirzi, H"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Hajishirzi%2C+H&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Hajishirzi%2C+H&amp;start=0" class="pagination-link is-current" aria-label="Page 1" aria-current="page">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Hajishirzi%2C+H&amp;start=50" class="pagination-link " aria-label="Goto page 2">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Hajishirzi%2C+H&amp;start=100" class="pagination-link " aria-label="Goto page 3">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Hajishirzi%2C+H&amp;start=150" class="pagination-link " aria-label="Goto page 4">4 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15124">arXiv:2411.15124</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.15124">pdf</a>, <a href="https://arxiv.org/format/2411.15124">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip 
is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> TÜLU 3: Pushing Frontiers in Open Language Model Post-Training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lambert%2C+N">Nathan Lambert</a>, <a href="/search/cs?searchtype=author&amp;query=Morrison%2C+J">Jacob Morrison</a>, <a href="/search/cs?searchtype=author&amp;query=Pyatkin%2C+V">Valentina Pyatkin</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+S">Shengyi Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Ivison%2C+H">Hamish Ivison</a>, <a href="/search/cs?searchtype=author&amp;query=Brahman%2C+F">Faeze Brahman</a>, <a href="/search/cs?searchtype=author&amp;query=Miranda%2C+L+J+V">Lester James V. Miranda</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+A">Alisa Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Dziri%2C+N">Nouha Dziri</a>, <a href="/search/cs?searchtype=author&amp;query=Lyu%2C+S">Shane Lyu</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+Y">Yuling Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Malik%2C+S">Saumya Malik</a>, <a href="/search/cs?searchtype=author&amp;query=Graf%2C+V">Victoria Graf</a>, <a href="/search/cs?searchtype=author&amp;query=Hwang%2C+J+D">Jena D. Hwang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+J">Jiangjiang Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Bras%2C+R+L">Ronan Le Bras</a>, <a href="/search/cs?searchtype=author&amp;query=Tafjord%2C+O">Oyvind Tafjord</a>, <a href="/search/cs?searchtype=author&amp;query=Wilhelm%2C+C">Chris Wilhelm</a>, <a href="/search/cs?searchtype=author&amp;query=Soldaini%2C+L">Luca Soldaini</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+N+A">Noah A. 
Smith</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yizhong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Dasigi%2C+P">Pradeep Dasigi</a>, <a href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15124v1-abstract-short" style="display: inline;"> Language model post-training is applied to refine behaviors and unlock new skills across a wide range of recent language models, but open recipes for applying these techniques lag behind proprietary ones. The underlying training data and recipes for post-training are simultaneously the most important pieces of the puzzle and the portion with the least transparency. To bridge this gap, we introduce&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15124v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15124v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15124v1-abstract-full" style="display: none;"> Language model post-training is applied to refine behaviors and unlock new skills across a wide range of recent language models, but open recipes for applying these techniques lag behind proprietary ones. The underlying training data and recipes for post-training are simultaneously the most important pieces of the puzzle and the portion with the least transparency. To bridge this gap, we introduce TÜLU 3, a family of fully-open state-of-the-art post-trained models, alongside its data, code, and training recipes, serving as a comprehensive guide for modern post-training techniques. 
TÜLU 3, which builds on Llama 3.1 base models, achieves results surpassing the instruct versions of Llama 3.1, Qwen 2.5, Mistral, and even closed models such as GPT-4o-mini and Claude 3.5-Haiku. The training algorithms for our models include supervised finetuning (SFT), Direct Preference Optimization (DPO), and a novel method we call Reinforcement Learning with Verifiable Rewards (RLVR). With TÜLU 3, we introduce a multi-task evaluation scheme for post-training recipes with development and unseen evaluations, standard benchmark implementations, and substantial decontamination of existing open datasets on said benchmarks. We conclude with analysis and discussion of training methods that did not reliably improve performance. In addition to the TÜLU 3 model weights and demo, we release the complete recipe -- including datasets for diverse core skills, a robust toolkit for data curation and evaluation, the training code and infrastructure, and, most importantly, a detailed report for reproducing and further adapting the TÜLU 3 approach to more domains. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15124v1-abstract-full').style.display = 'none'; document.getElementById('2411.15124v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14199">arXiv:2411.14199</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.14199">pdf</a>, <a href="https://arxiv.org/format/2411.14199">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Digital Libraries">cs.DL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> OpenScholar: Synthesizing Scientific Literature with Retrieval-augmented LMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Asai%2C+A">Akari Asai</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+J">Jacqueline He</a>, <a href="/search/cs?searchtype=author&amp;query=Shao%2C+R">Rulin Shao</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+W">Weijia Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Singh%2C+A">Amanpreet Singh</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+J+C">Joseph Chee Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Lo%2C+K">Kyle Lo</a>, <a href="/search/cs?searchtype=author&amp;query=Soldaini%2C+L">Luca Soldaini</a>, <a href="/search/cs?searchtype=author&amp;query=Feldman%2C+S">Sergey Feldman</a>, <a href="/search/cs?searchtype=author&amp;query=D%27arcy%2C+M">Mike D&#39;arcy</a>, <a href="/search/cs?searchtype=author&amp;query=Wadden%2C+D">David Wadden</a>, <a href="/search/cs?searchtype=author&amp;query=Latzke%2C+M">Matt 
Latzke</a>, <a href="/search/cs?searchtype=author&amp;query=Tian%2C+M">Minyang Tian</a>, <a href="/search/cs?searchtype=author&amp;query=Ji%2C+P">Pan Ji</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+S">Shengyan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Tong%2C+H">Hao Tong</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+B">Bohao Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Xiong%2C+Y">Yanyu Xiong</a>, <a href="/search/cs?searchtype=author&amp;query=Zettlemoyer%2C+L">Luke Zettlemoyer</a>, <a href="/search/cs?searchtype=author&amp;query=Neubig%2C+G">Graham Neubig</a>, <a href="/search/cs?searchtype=author&amp;query=Weld%2C+D">Dan Weld</a>, <a href="/search/cs?searchtype=author&amp;query=Downey%2C+D">Doug Downey</a>, <a href="/search/cs?searchtype=author&amp;query=Yih%2C+W">Wen-tau Yih</a>, <a href="/search/cs?searchtype=author&amp;query=Koh%2C+P+W">Pang Wei Koh</a>, <a href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14199v1-abstract-short" style="display: inline;"> Scientific progress depends on researchers&#39; ability to synthesize the growing body of literature. Can large language models (LMs) assist scientists in this task? We introduce OpenScholar, a specialized retrieval-augmented LM that answers scientific queries by identifying relevant passages from 45 million open-access papers and synthesizing citation-backed responses. 
To evaluate OpenScholar, we dev&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14199v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14199v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14199v1-abstract-full" style="display: none;"> Scientific progress depends on researchers&#39; ability to synthesize the growing body of literature. Can large language models (LMs) assist scientists in this task? We introduce OpenScholar, a specialized retrieval-augmented LM that answers scientific queries by identifying relevant passages from 45 million open-access papers and synthesizing citation-backed responses. To evaluate OpenScholar, we develop ScholarQABench, the first large-scale multi-domain benchmark for literature search, comprising 2,967 expert-written queries and 208 long-form answers across computer science, physics, neuroscience, and biomedicine. On ScholarQABench, OpenScholar-8B outperforms GPT-4o by 5% and PaperQA2 by 7% in correctness, despite being a smaller, open model. While GPT4o hallucinates citations 78 to 90% of the time, OpenScholar achieves citation accuracy on par with human experts. OpenScholar&#39;s datastore, retriever, and self-feedback inference loop also improves off-the-shelf LMs: for instance, OpenScholar-GPT4o improves GPT-4o&#39;s correctness by 12%. In human evaluations, experts preferred OpenScholar-8B and OpenScholar-GPT4o responses over expert-written ones 51% and 70% of the time, respectively, compared to GPT4o&#39;s 32%. We open-source all of our code, models, datastore, data and a public demo. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14199v1-abstract-full').style.display = 'none'; document.getElementById('2411.14199v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19133">arXiv:2410.19133</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.19133">pdf</a>, <a href="https://arxiv.org/format/2410.19133">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Hybrid Preferences: Learning to Route Instances for Human vs. AI Feedback </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Miranda%2C+L+J+V">Lester James V. Miranda</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yizhong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Elazar%2C+Y">Yanai Elazar</a>, <a href="/search/cs?searchtype=author&amp;query=Kumar%2C+S">Sachin Kumar</a>, <a href="/search/cs?searchtype=author&amp;query=Pyatkin%2C+V">Valentina Pyatkin</a>, <a href="/search/cs?searchtype=author&amp;query=Brahman%2C+F">Faeze Brahman</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+N+A">Noah A. 
Smith</a>, <a href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a>, <a href="/search/cs?searchtype=author&amp;query=Dasigi%2C+P">Pradeep Dasigi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.19133v2-abstract-short" style="display: inline;"> Learning from human feedback has enabled the alignment of language models (LMs) with human preferences. However, directly collecting human preferences can be expensive, time-consuming, and can have high variance. An appealing alternative is to distill preferences from LMs as a source of synthetic annotations as they are more consistent, cheaper, and scale better than human annotation; however, the&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19133v2-abstract-full').style.display = 'inline'; document.getElementById('2410.19133v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.19133v2-abstract-full" style="display: none;"> Learning from human feedback has enabled the alignment of language models (LMs) with human preferences. However, directly collecting human preferences can be expensive, time-consuming, and can have high variance. An appealing alternative is to distill preferences from LMs as a source of synthetic annotations as they are more consistent, cheaper, and scale better than human annotation; however, they are also prone to biases and errors. In this work, we introduce a routing framework that combines inputs from humans and LMs to achieve better annotation quality, while reducing the total cost of human annotation. The crux of our approach is to identify preference instances that will benefit from human annotations. 
We formulate this as an optimization problem: given a preference dataset and an evaluation metric, we train a performance prediction model to predict a reward model&#39;s performance on an arbitrary combination of human and LM annotations and employ a routing strategy that selects a combination that maximizes predicted performance. We train the performance prediction model on MultiPref, a new preference dataset with 10K instances paired with human and LM labels. We show that the selected hybrid mixture of LM and direct human preferences using our routing framework achieves better reward model performance compared to using either one exclusively. We simulate selective human preference collection on three other datasets and show that our method generalizes well to all three. We analyze features from the routing model to identify characteristics of instances that can benefit from human feedback, e.g., prompts with a moderate safety concern or moderate intent complexity. We release the dataset, annotation platform, and source code used in this study to foster more efficient and accurate preference collection in the future. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19133v2-abstract-full').style.display = 'none'; document.getElementById('2410.19133v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Code in https://github.com/allenai/hybrid-preferences, MultiPref dataset in https://huggingface.co/datasets/allenai/multipref, Updated related work</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16027">arXiv:2410.16027</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.16027">pdf</a>, <a href="https://arxiv.org/format/2410.16027">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> ComPO: Community Preferences for Language Model Personalization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kumar%2C+S">Sachin Kumar</a>, <a href="/search/cs?searchtype=author&amp;query=Park%2C+C+Y">Chan Young Park</a>, <a href="/search/cs?searchtype=author&amp;query=Tsvetkov%2C+Y">Yulia Tsvetkov</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+N+A">Noah A. Smith</a>, <a href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16027v1-abstract-short" style="display: inline;"> Conventional algorithms for training language models (LMs) with human feedback rely on preferences that are assumed to account for an &#34;average&#34; user, disregarding subjectivity and finer-grained variations. 
Recent studies have raised concerns that aggregating such diverse and often contradictory human feedback to finetune models results in generic models that generate outputs not preferred by many&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16027v1-abstract-full').style.display = 'inline'; document.getElementById('2410.16027v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16027v1-abstract-full" style="display: none;"> Conventional algorithms for training language models (LMs) with human feedback rely on preferences that are assumed to account for an &#34;average&#34; user, disregarding subjectivity and finer-grained variations. Recent studies have raised concerns that aggregating such diverse and often contradictory human feedback to finetune models results in generic models that generate outputs not preferred by many user groups, as they tend to average out styles and norms. To address this issue, we draw inspiration from recommendation systems and propose ComPO, a method to personalize preference optimization in LMs by contextualizing the probability distribution of model outputs with the preference provider. Focusing on group-level preferences rather than individuals, we collect and release ComPRed, a question answering dataset with community-level preferences from Reddit. This dataset facilitates studying diversity in preferences without incurring privacy concerns associated with individual feedback. Our experiments reveal that conditioning language models on a community identifier (i.e., subreddit name) during preference tuning substantially enhances model performance. Conversely, replacing this context with random subreddit identifiers significantly diminishes performance, highlighting the effectiveness of our approach in tailoring responses to communities&#39; preferences. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16027v1-abstract-full').style.display = 'none'; document.getElementById('2410.16027v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15002">arXiv:2410.15002</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.15002">pdf</a>, <a href="https://arxiv.org/format/2410.15002">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> How Many Van Goghs Does It Take to Van Gogh? 
Finding the Imitation Threshold </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Verma%2C+S">Sahil Verma</a>, <a href="/search/cs?searchtype=author&amp;query=Rassin%2C+R">Royi Rassin</a>, <a href="/search/cs?searchtype=author&amp;query=Das%2C+A">Arnav Das</a>, <a href="/search/cs?searchtype=author&amp;query=Bhatt%2C+G">Gantavya Bhatt</a>, <a href="/search/cs?searchtype=author&amp;query=Seshadri%2C+P">Preethi Seshadri</a>, <a href="/search/cs?searchtype=author&amp;query=Shah%2C+C">Chirag Shah</a>, <a href="/search/cs?searchtype=author&amp;query=Bilmes%2C+J">Jeff Bilmes</a>, <a href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a>, <a href="/search/cs?searchtype=author&amp;query=Elazar%2C+Y">Yanai Elazar</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15002v1-abstract-short" style="display: inline;"> Text-to-image models are trained using large datasets collected by scraping image-text pairs from the internet. These datasets often include private, copyrighted, and licensed material. Training models on such datasets enables them to generate images with such content, which might violate copyright laws and individual privacy. This phenomenon is termed imitation -- generation of images with conten&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15002v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15002v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15002v1-abstract-full" style="display: none;"> Text-to-image models are trained using large datasets collected by scraping image-text pairs from the internet. These datasets often include private, copyrighted, and licensed material. 
Training models on such datasets enables them to generate images with such content, which might violate copyright laws and individual privacy. This phenomenon is termed imitation -- generation of images with content that has recognizable similarity to its training images. In this work we study the relationship between a concept&#39;s frequency in the training dataset and the ability of a model to imitate it. We seek to determine the point at which a model was trained on enough instances to imitate a concept -- the imitation threshold. We posit this question as a new problem: Finding the Imitation Threshold (FIT) and propose an efficient approach that estimates the imitation threshold without incurring the colossal cost of training multiple models from scratch. We experiment with two domains -- human faces and art styles -- for which we create four datasets, and evaluate three text-to-image models which were trained on two pretraining datasets. Our results reveal that the imitation threshold of these models is in the range of 200-600 images, depending on the domain and the model. The imitation threshold can provide an empirical basis for copyright violation claims and acts as a guiding principle for text-to-image model developers that aim to comply with copyright and privacy laws. We release the code and data at \url{https://github.com/vsahil/MIMETIC-2.git} and the project&#39;s website is hosted at \url{https://how-many-van-goghs-does-it-take.github.io}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15002v1-abstract-full').style.display = 'none'; document.getElementById('2410.15002v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at ATTRIB, RegML, and SafeGenAI workshops at NeurIPS 2024 and NLLP Workshop 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12937">arXiv:2410.12937</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.12937">pdf</a>, <a href="https://arxiv.org/format/2410.12937">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Merge to Learn: Efficiently Adding Skills to Language Models with Model Merging </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Morrison%2C+J">Jacob Morrison</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+N+A">Noah A. Smith</a>, <a href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a>, <a href="/search/cs?searchtype=author&amp;query=Koh%2C+P+W">Pang Wei Koh</a>, <a href="/search/cs?searchtype=author&amp;query=Dodge%2C+J">Jesse Dodge</a>, <a href="/search/cs?searchtype=author&amp;query=Dasigi%2C+P">Pradeep Dasigi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.12937v1-abstract-short" style="display: inline;"> Adapting general-purpose language models to new skills is currently an expensive process that must be repeated as new instruction datasets targeting new skills are created, or can cause the models to forget older skills. 
In this work, we investigate the effectiveness of adding new skills to preexisting models by training on the new skills in isolation and later merging with the general model (e.g.&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12937v1-abstract-full').style.display = 'inline'; document.getElementById('2410.12937v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.12937v1-abstract-full" style="display: none;"> Adapting general-purpose language models to new skills is currently an expensive process that must be repeated as new instruction datasets targeting new skills are created, or can cause the models to forget older skills. In this work, we investigate the effectiveness of adding new skills to preexisting models by training on the new skills in isolation and later merging with the general model (e.g. using task vectors). In experiments focusing on scientific literature understanding, safety, and coding, we find that the parallel-train-then-merge procedure, which is significantly cheaper than retraining the models on updated data mixtures, is often comparably effective. Our experiments also show that parallel training is especially well-suited for enabling safety features in LMs relative to continued finetuning and retraining, as it dramatically improves model compliance with safe prompts while preserving its ability to refuse dangerous or harmful prompts. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12937v1-abstract-full').style.display = 'none'; document.getElementById('2410.12937v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Findings of EMNLP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.05774">arXiv:2410.05774</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.05774">pdf</a>, <a href="https://arxiv.org/format/2410.05774">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ActionAtlas: A VideoQA Benchmark for Domain-specialized Action Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Salehi%2C+M">Mohammadreza Salehi</a>, <a href="/search/cs?searchtype=author&amp;query=Park%2C+J+S">Jae Sung Park</a>, <a href="/search/cs?searchtype=author&amp;query=Yadav%2C+T">Tanush Yadav</a>, <a href="/search/cs?searchtype=author&amp;query=Kusupati%2C+A">Aditya Kusupati</a>, <a href="/search/cs?searchtype=author&amp;query=Krishna%2C+R">Ranjay Krishna</a>, <a href="/search/cs?searchtype=author&amp;query=Choi%2C+Y">Yejin Choi</a>, <a href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a>, <a href="/search/cs?searchtype=author&amp;query=Farhadi%2C+A">Ali Farhadi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.05774v4-abstract-short" style="display: inline;"> Our world is full of varied actions and moves across specialized domains that we, as humans, strive to identify and understand. Within any single domain, actions can often appear quite similar, making it challenging for deep models to distinguish them accurately. 
To evaluate the effectiveness of multimodal foundation models in helping us recognize such actions, we present ActionAtlas v1.0, a multi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05774v4-abstract-full').style.display = 'inline'; document.getElementById('2410.05774v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.05774v4-abstract-full" style="display: none;"> Our world is full of varied actions and moves across specialized domains that we, as humans, strive to identify and understand. Within any single domain, actions can often appear quite similar, making it challenging for deep models to distinguish them accurately. To evaluate the effectiveness of multimodal foundation models in helping us recognize such actions, we present ActionAtlas v1.0, a multiple-choice video question answering benchmark featuring short videos across various sports. Each video in the dataset is paired with a question and four or five choices. The question pinpoints specific individuals, asking which choice &#34;best&#34; describes their action within a certain temporal context. Overall, the dataset includes 934 videos showcasing 580 unique actions across 56 sports, with a total of 1896 actions within choices. Unlike most existing video question answering benchmarks that only cover simplistic actions, often identifiable from a single frame, ActionAtlas focuses on intricate movements and rigorously tests the model&#39;s capability to discern subtle differences between moves that look similar within each domain. We evaluate open and proprietary foundation models on this benchmark, finding that the best model, GPT-4o, achieves a maximum accuracy of 45.52%. Meanwhile, Non-expert crowd workers, provided with action description for each choice, achieve 61.64% accuracy, where random chance is approximately 21%. 
Our findings with state-of-the-art models indicate that having a high frame sampling rate is important for accurately recognizing actions in ActionAtlas, a feature that some leading proprietary video models, such as Gemini, do not include in their default configuration. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05774v4-abstract-full').style.display = 'none'; document.getElementById('2410.05774v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> NeurIPS 2024 Track Datasets and Benchmarks </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.17146">arXiv:2409.17146</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.17146">pdf</a>, <a href="https://arxiv.org/format/2409.17146">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Molmo and PixMo: Open Weights and Open Data for State-of-the-Art Multimodal Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Deitke%2C+M">Matt Deitke</a>, <a 
href="/search/cs?searchtype=author&amp;query=Clark%2C+C">Christopher Clark</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+S">Sangho Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Tripathi%2C+R">Rohun Tripathi</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yue Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Park%2C+J+S">Jae Sung Park</a>, <a href="/search/cs?searchtype=author&amp;query=Salehi%2C+M">Mohammadreza Salehi</a>, <a href="/search/cs?searchtype=author&amp;query=Muennighoff%2C+N">Niklas Muennighoff</a>, <a href="/search/cs?searchtype=author&amp;query=Lo%2C+K">Kyle Lo</a>, <a href="/search/cs?searchtype=author&amp;query=Soldaini%2C+L">Luca Soldaini</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+J">Jiasen Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Anderson%2C+T">Taira Anderson</a>, <a href="/search/cs?searchtype=author&amp;query=Bransom%2C+E">Erin Bransom</a>, <a href="/search/cs?searchtype=author&amp;query=Ehsani%2C+K">Kiana Ehsani</a>, <a href="/search/cs?searchtype=author&amp;query=Ngo%2C+H">Huong Ngo</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">YenSung Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Patel%2C+A">Ajay Patel</a>, <a href="/search/cs?searchtype=author&amp;query=Yatskar%2C+M">Mark Yatskar</a>, <a href="/search/cs?searchtype=author&amp;query=Callison-Burch%2C+C">Chris Callison-Burch</a>, <a href="/search/cs?searchtype=author&amp;query=Head%2C+A">Andrew Head</a>, <a href="/search/cs?searchtype=author&amp;query=Hendrix%2C+R">Rose Hendrix</a>, <a href="/search/cs?searchtype=author&amp;query=Bastani%2C+F">Favyen Bastani</a>, <a href="/search/cs?searchtype=author&amp;query=VanderBilt%2C+E">Eli VanderBilt</a>, <a href="/search/cs?searchtype=author&amp;query=Lambert%2C+N">Nathan Lambert</a>, <a href="/search/cs?searchtype=author&amp;query=Chou%2C+Y">Yvonne Chou</a> , et al. 
(26 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.17146v1-abstract-short" style="display: inline;"> Today&#39;s most advanced multimodal models remain proprietary. The strongest open-weight models rely heavily on synthetic data from proprietary VLMs to achieve good performance, effectively distilling these closed models into open ones. As a result, the community is still missing foundational knowledge about how to build performant VLMs from scratch. We present Molmo, a new family of VLMs that are st&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.17146v1-abstract-full').style.display = 'inline'; document.getElementById('2409.17146v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.17146v1-abstract-full" style="display: none;"> Today&#39;s most advanced multimodal models remain proprietary. The strongest open-weight models rely heavily on synthetic data from proprietary VLMs to achieve good performance, effectively distilling these closed models into open ones. As a result, the community is still missing foundational knowledge about how to build performant VLMs from scratch. We present Molmo, a new family of VLMs that are state-of-the-art in their class of openness. Our key innovation is a novel, highly detailed image caption dataset collected entirely from human annotators using speech-based descriptions. To enable a wide array of user interactions, we also introduce a diverse dataset mixture for fine-tuning that includes in-the-wild Q&amp;A and innovative 2D pointing data. 
The success of our approach relies on careful choices for the model architecture details, a well-tuned training pipeline, and, most critically, the quality of our newly collected datasets, all of which will be released. The best-in-class 72B model within the Molmo family not only outperforms others in the class of open weight and data models but also compares favorably against proprietary systems like GPT-4o, Claude 3.5, and Gemini 1.5 on both academic benchmarks and human evaluation. We will be releasing all of our model weights, captioning and fine-tuning data, and source code in the near future. Select model weights, inference code, and demo are available at https://molmo.allenai.org. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.17146v1-abstract-full').style.display = 'none'; document.getElementById('2409.17146v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.02060">arXiv:2409.02060</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.02060">pdf</a>, <a href="https://arxiv.org/format/2409.02060">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> OLMoE: Open Mixture-of-Experts Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Muennighoff%2C+N">Niklas Muennighoff</a>, <a href="/search/cs?searchtype=author&amp;query=Soldaini%2C+L">Luca Soldaini</a>, <a href="/search/cs?searchtype=author&amp;query=Groeneveld%2C+D">Dirk Groeneveld</a>, <a href="/search/cs?searchtype=author&amp;query=Lo%2C+K">Kyle Lo</a>, <a href="/search/cs?searchtype=author&amp;query=Morrison%2C+J">Jacob Morrison</a>, <a href="/search/cs?searchtype=author&amp;query=Min%2C+S">Sewon Min</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+W">Weijia Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Walsh%2C+P">Pete Walsh</a>, <a href="/search/cs?searchtype=author&amp;query=Tafjord%2C+O">Oyvind Tafjord</a>, <a href="/search/cs?searchtype=author&amp;query=Lambert%2C+N">Nathan Lambert</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+Y">Yuling Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Arora%2C+S">Shane Arora</a>, <a href="/search/cs?searchtype=author&amp;query=Bhagia%2C+A">Akshita Bhagia</a>, <a href="/search/cs?searchtype=author&amp;query=Schwenk%2C+D">Dustin Schwenk</a>, <a 
href="/search/cs?searchtype=author&amp;query=Wadden%2C+D">David Wadden</a>, <a href="/search/cs?searchtype=author&amp;query=Wettig%2C+A">Alexander Wettig</a>, <a href="/search/cs?searchtype=author&amp;query=Hui%2C+B">Binyuan Hui</a>, <a href="/search/cs?searchtype=author&amp;query=Dettmers%2C+T">Tim Dettmers</a>, <a href="/search/cs?searchtype=author&amp;query=Kiela%2C+D">Douwe Kiela</a>, <a href="/search/cs?searchtype=author&amp;query=Farhadi%2C+A">Ali Farhadi</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+N+A">Noah A. Smith</a>, <a href="/search/cs?searchtype=author&amp;query=Koh%2C+P+W">Pang Wei Koh</a>, <a href="/search/cs?searchtype=author&amp;query=Singh%2C+A">Amanpreet Singh</a>, <a href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.02060v1-abstract-short" style="display: inline;"> We introduce OLMoE, a fully open, state-of-the-art language model leveraging sparse Mixture-of-Experts (MoE). OLMoE-1B-7B has 7 billion (B) parameters but uses only 1B per input token. We pretrain it on 5 trillion tokens and further adapt it to create OLMoE-1B-7B-Instruct. Our models outperform all available models with similar active parameters, even surpassing larger ones like Llama2-13B-Chat an&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.02060v1-abstract-full').style.display = 'inline'; document.getElementById('2409.02060v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.02060v1-abstract-full" style="display: none;"> We introduce OLMoE, a fully open, state-of-the-art language model leveraging sparse Mixture-of-Experts (MoE). OLMoE-1B-7B has 7 billion (B) parameters but uses only 1B per input token. 
We pretrain it on 5 trillion tokens and further adapt it to create OLMoE-1B-7B-Instruct. Our models outperform all available models with similar active parameters, even surpassing larger ones like Llama2-13B-Chat and DeepSeekMoE-16B. We present various experiments on MoE training, analyze routing in our model showing high specialization, and open-source all aspects of our work: model weights, training data, code, and logs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.02060v1-abstract-full').style.display = 'none'; document.getElementById('2409.02060v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">61 pages (24 main), 36 figures, 14 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.15018">arXiv:2407.15018</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.15018">pdf</a>, <a href="https://arxiv.org/format/2407.15018">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Answer, Assemble, Ace: Understanding How Transformers Answer Multiple Choice Questions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wiegreffe%2C+S">Sarah Wiegreffe</a>, <a 
href="/search/cs?searchtype=author&amp;query=Tafjord%2C+O">Oyvind Tafjord</a>, <a href="/search/cs?searchtype=author&amp;query=Belinkov%2C+Y">Yonatan Belinkov</a>, <a href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a>, <a href="/search/cs?searchtype=author&amp;query=Sabharwal%2C+A">Ashish Sabharwal</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.15018v1-abstract-short" style="display: inline;"> Multiple-choice question answering (MCQA) is a key competence of performant transformer language models that is tested by mainstream benchmarks. However, recent evidence shows that models can have quite a range of performance, particularly when the task format is diversified slightly (such as by shuffling answer choice order). In this work we ask: how do successful models perform formatted MCQA? W&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.15018v1-abstract-full').style.display = 'inline'; document.getElementById('2407.15018v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.15018v1-abstract-full" style="display: none;"> Multiple-choice question answering (MCQA) is a key competence of performant transformer language models that is tested by mainstream benchmarks. However, recent evidence shows that models can have quite a range of performance, particularly when the task format is diversified slightly (such as by shuffling answer choice order). In this work we ask: how do successful models perform formatted MCQA? We employ vocabulary projection and activation patching methods to localize key hidden states that encode relevant information for predicting the correct answer. 
We find that prediction of a specific answer symbol is causally attributed to a single middle layer, and specifically its multi-head self-attention mechanism. We show that subsequent layers increase the probability of the predicted answer symbol in vocabulary space, and that this probability increase is associated with a sparse set of attention heads with unique roles. We additionally uncover differences in how different models adjust to alternative symbols. Finally, we demonstrate that a synthetic task can disentangle sources of model error to pinpoint when a model has learned formatted MCQA, and show that an inability to separate answer symbol tokens in vocabulary space is a property of models unable to perform formatted MCQA tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.15018v1-abstract-full').style.display = 'none'; document.getElementById('2407.15018v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preprint. 
Code will be available at https://github.com/allenai/understanding_mcqa</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.12043">arXiv:2407.12043</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.12043">pdf</a>, <a href="https://arxiv.org/format/2407.12043">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> The Art of Saying No: Contextual Noncompliance in Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Brahman%2C+F">Faeze Brahman</a>, <a href="/search/cs?searchtype=author&amp;query=Kumar%2C+S">Sachin Kumar</a>, <a href="/search/cs?searchtype=author&amp;query=Balachandran%2C+V">Vidhisha Balachandran</a>, <a href="/search/cs?searchtype=author&amp;query=Dasigi%2C+P">Pradeep Dasigi</a>, <a href="/search/cs?searchtype=author&amp;query=Pyatkin%2C+V">Valentina Pyatkin</a>, <a href="/search/cs?searchtype=author&amp;query=Ravichander%2C+A">Abhilasha Ravichander</a>, <a href="/search/cs?searchtype=author&amp;query=Wiegreffe%2C+S">Sarah Wiegreffe</a>, <a href="/search/cs?searchtype=author&amp;query=Dziri%2C+N">Nouha Dziri</a>, <a href="/search/cs?searchtype=author&amp;query=Chandu%2C+K">Khyathi Chandu</a>, <a href="/search/cs?searchtype=author&amp;query=Hessel%2C+J">Jack Hessel</a>, <a href="/search/cs?searchtype=author&amp;query=Tsvetkov%2C+Y">Yulia Tsvetkov</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+N+A">Noah A. 
Smith</a>, <a href="/search/cs?searchtype=author&amp;query=Choi%2C+Y">Yejin Choi</a>, <a href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.12043v2-abstract-short" style="display: inline;"> Chat-based language models are designed to be helpful, yet they should not comply with every user request. While most existing work primarily focuses on refusal of &#34;unsafe&#34; queries, we posit that the scope of noncompliance should be broadened. We introduce a comprehensive taxonomy of contextual noncompliance describing when and how models should not comply with user requests. Our taxonomy spans a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.12043v2-abstract-full').style.display = 'inline'; document.getElementById('2407.12043v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.12043v2-abstract-full" style="display: none;"> Chat-based language models are designed to be helpful, yet they should not comply with every user request. While most existing work primarily focuses on refusal of &#34;unsafe&#34; queries, we posit that the scope of noncompliance should be broadened. We introduce a comprehensive taxonomy of contextual noncompliance describing when and how models should not comply with user requests. Our taxonomy spans a wide range of categories including incomplete, unsupported, indeterminate, and humanizing requests (in addition to unsafe requests). To test noncompliance capabilities of language models, we use this taxonomy to develop a new evaluation suite of 1000 noncompliance prompts. 
We find that most existing models show significantly high compliance rates in certain previously understudied categories with models like GPT-4 incorrectly complying with as many as 30% of requests. To address these gaps, we explore different training strategies using a synthetically-generated training set of requests and expected noncompliant responses. Our experiments demonstrate that while direct finetuning of instruction-tuned models can lead to both over-refusal and a decline in general capabilities, using parameter efficient methods like low rank adapters helps to strike a good balance between appropriate noncompliance and other capabilities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.12043v2-abstract-full').style.display = 'none'; document.getElementById('2407.12043v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The first two authors are co-first authors; Accepted at NeurIPS 2024 Track on Datasets and Benchmarks</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.07087">arXiv:2407.07087</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.07087">pdf</a>, <a href="https://arxiv.org/format/2407.07087">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> CopyBench: Measuring Literal and Non-Literal Reproduction of Copyright-Protected Text in Language Model Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+T">Tong Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Asai%2C+A">Akari Asai</a>, <a href="/search/cs?searchtype=author&amp;query=Mireshghallah%2C+N">Niloofar Mireshghallah</a>, <a href="/search/cs?searchtype=author&amp;query=Min%2C+S">Sewon Min</a>, <a href="/search/cs?searchtype=author&amp;query=Grimmelmann%2C+J">James Grimmelmann</a>, <a href="/search/cs?searchtype=author&amp;query=Choi%2C+Y">Yejin Choi</a>, <a href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a>, <a href="/search/cs?searchtype=author&amp;query=Zettlemoyer%2C+L">Luke Zettlemoyer</a>, <a href="/search/cs?searchtype=author&amp;query=Koh%2C+P+W">Pang Wei Koh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.07087v2-abstract-short" style="display: 
inline;"> Evaluating the degree of reproduction of copyright-protected content by language models (LMs) is of significant interest to the AI and legal communities. Although both literal and non-literal similarities are considered by courts when assessing the degree of reproduction, prior research has focused only on literal similarities. To bridge this gap, we introduce CopyBench, a benchmark designed to me&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.07087v2-abstract-full').style.display = 'inline'; document.getElementById('2407.07087v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.07087v2-abstract-full" style="display: none;"> Evaluating the degree of reproduction of copyright-protected content by language models (LMs) is of significant interest to the AI and legal communities. Although both literal and non-literal similarities are considered by courts when assessing the degree of reproduction, prior research has focused only on literal similarities. To bridge this gap, we introduce CopyBench, a benchmark designed to measure both literal and non-literal copying in LM generations. Using copyrighted fiction books as text sources, we provide automatic evaluation protocols to assess literal and non-literal copying, balanced against the model utility in terms of the ability to recall facts from the copyrighted works and generate fluent completions. We find that, although literal copying is relatively rare, two types of non-literal copying -- event copying and character copying -- occur even in models as small as 7B parameters. Larger models demonstrate significantly more copying, with literal copying rates increasing from 0.2% to 10.5% and non-literal copying from 2.3% to 5.9% when comparing Llama3-8B and 70B models, respectively. 
We further evaluate the effectiveness of current strategies for mitigating copying and show that (1) training-time alignment can reduce literal copying but may increase non-literal copying, and (2) current inference-time mitigation methods primarily reduce literal but not non-literal copying. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.07087v2-abstract-full').style.display = 'none'; document.getElementById('2407.07087v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.18853">arXiv:2406.18853</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.18853">pdf</a>, <a href="https://arxiv.org/format/2406.18853">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Decoding-Time Language Model Alignment with Multiple Objectives </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Shi%2C+R">Ruizhe Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yifang Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+Y">Yushi Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+A">Alisa Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+N+A">Noah A. Smith</a>, <a href="/search/cs?searchtype=author&amp;query=Du%2C+S+S">Simon S. 
Du</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.18853v3-abstract-short" style="display: inline;"> Aligning language models (LMs) to human preferences has emerged as a critical pursuit, enabling these models to better serve diverse user needs. Existing methods primarily focus on optimizing LMs for a single reward function, limiting their adaptability to varied objectives. Here, we propose $\textbf{multi-objective decoding (MOD)}$, a decoding-time algorithm that outputs the next token from a lin&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.18853v3-abstract-full').style.display = 'inline'; document.getElementById('2406.18853v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.18853v3-abstract-full" style="display: none;"> Aligning language models (LMs) to human preferences has emerged as a critical pursuit, enabling these models to better serve diverse user needs. Existing methods primarily focus on optimizing LMs for a single reward function, limiting their adaptability to varied objectives. Here, we propose $\textbf{multi-objective decoding (MOD)}$, a decoding-time algorithm that outputs the next token from a linear combination of predictions of all base models, for any given weightings over different objectives. We exploit a common form among a family of $f$-divergence regularized alignment approaches (such as PPO, DPO, and their variants) to identify a closed-form solution by Legendre transform, and derive an efficient decoding strategy. Theoretically, we show why existing approaches can be sub-optimal even in natural settings and obtain optimality guarantees for our method. Empirical results demonstrate the effectiveness of the algorithm. 
For example, compared to a parameter-merging baseline, MOD achieves 12.8% overall reward improvement when equally optimizing towards $3$ objectives. Moreover, we experiment with MOD on combining three fully-finetuned LLMs of different model sizes, each aimed at different objectives such as safety, coding, and general user preference. Unlike traditional methods that require careful curation of a mixture of datasets to achieve comprehensive improvement, we can quickly experiment with preference weightings using MOD to find the best combination of models. Our best combination reduces toxicity on Toxigen to nearly 0% and achieves 7.9--33.3% improvement across other three metrics ($\textit{i.e.}$, Codex@1, GSM-COT, BBH-COT). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.18853v3-abstract-full').style.display = 'none'; document.getElementById('2406.18853v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS accepted version</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.09279">arXiv:2406.09279</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.09279">pdf</a>, <a href="https://arxiv.org/format/2406.09279">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Unpacking DPO and PPO: Disentangling Best Practices for Learning from Preference Feedback </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ivison%2C+H">Hamish Ivison</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yizhong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jiacheng Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Z">Zeqiu Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Pyatkin%2C+V">Valentina Pyatkin</a>, <a href="/search/cs?searchtype=author&amp;query=Lambert%2C+N">Nathan Lambert</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+N+A">Noah A. Smith</a>, <a href="/search/cs?searchtype=author&amp;query=Choi%2C+Y">Yejin Choi</a>, <a href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.09279v2-abstract-short" style="display: inline;"> Learning from preference feedback has emerged as an essential step for improving the generation quality and performance of modern language models (LMs). 
Despite its widespread use, the way preference-based learning is applied varies wildly, with differing data, learning algorithms, and evaluations used, making disentangling the impact of each aspect difficult. In this work, we identify four core a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.09279v2-abstract-full').style.display = 'inline'; document.getElementById('2406.09279v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.09279v2-abstract-full" style="display: none;"> Learning from preference feedback has emerged as an essential step for improving the generation quality and performance of modern language models (LMs). Despite its widespread use, the way preference-based learning is applied varies wildly, with differing data, learning algorithms, and evaluations used, making disentangling the impact of each aspect difficult. In this work, we identify four core aspects of preference-based learning: preference data, learning algorithm, reward model, and policy training prompts, systematically investigate the impact of these components on downstream model performance, and suggest a recipe for strong learning for preference feedback. Our findings indicate that all aspects are important for performance, with better preference data leading to the largest improvements, followed by the choice of learning algorithm, the use of improved reward models, and finally the use of additional unlabeled prompts for policy training. Notably, PPO outperforms DPO by up to 2.5% in math and 1.2% in general domains. High-quality preference data leads to improvements of up to 8% in instruction following and truthfulness. Despite significant gains of up to 5% in mathematical evaluation when scaling up reward models, we surprisingly observe marginal improvements in other categories. 
We publicly release the code used for training (https://github.com/hamishivi/EasyLM) and evaluating (https://github.com/allenai/open-instruct) our models, along with the models and datasets themselves (https://huggingface.co/collections/allenai/tulu-v25-suite-66676520fd578080e126f618). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.09279v2-abstract-full').style.display = 'none'; document.getElementById('2406.09279v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024 camera-ready</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.08446">arXiv:2406.08446</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.08446">pdf</a>, <a href="https://arxiv.org/format/2406.08446">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> OLMES: A Standard for Language Model Evaluations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gu%2C+Y">Yuling Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Tafjord%2C+O">Oyvind Tafjord</a>, <a href="/search/cs?searchtype=author&amp;query=Kuehl%2C+B">Bailey Kuehl</a>, <a 
href="/search/cs?searchtype=author&amp;query=Haddad%2C+D">Dany Haddad</a>, <a href="/search/cs?searchtype=author&amp;query=Dodge%2C+J">Jesse Dodge</a>, <a href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.08446v1-abstract-short" style="display: inline;"> Progress in AI is often demonstrated by new models claiming improved performance on tasks measuring model capabilities. Evaluating language models in particular is challenging, as small changes to how a model is evaluated on a task can lead to large changes in measured performance. There is no common standard setup, so different models are evaluated on the same tasks in different ways, leading to&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.08446v1-abstract-full').style.display = 'inline'; document.getElementById('2406.08446v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.08446v1-abstract-full" style="display: none;"> Progress in AI is often demonstrated by new models claiming improved performance on tasks measuring model capabilities. Evaluating language models in particular is challenging, as small changes to how a model is evaluated on a task can lead to large changes in measured performance. There is no common standard setup, so different models are evaluated on the same tasks in different ways, leading to claims about which models perform best not being reproducible. We propose OLMES, a completely documented, practical, open standard for reproducible LLM evaluations. 
In developing this standard, we identify and review the varying factors in evaluation practices adopted by the community - such as details of prompt formatting, choice of in-context examples, probability normalizations, and task formulation. In particular, OLMES supports meaningful comparisons between smaller base models that require the unnatural &#34;cloze&#34; formulation of multiple-choice questions against larger models that can utilize the original formulation. OLMES includes well-considered recommendations guided by results from existing literature as well as new experiments investigating open questions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.08446v1-abstract-full').style.display = 'none'; document.getElementById('2406.08446v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.07835">arXiv:2406.07835</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.07835">pdf</a>, <a href="https://arxiv.org/format/2406.07835">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> SciRIFF: A Resource to Enhance Language Model Instruction-Following over Scientific Literature </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wadden%2C+D">David Wadden</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+K">Kejian Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Morrison%2C+J">Jacob Morrison</a>, <a href="/search/cs?searchtype=author&amp;query=Naik%2C+A">Aakanksha Naik</a>, <a href="/search/cs?searchtype=author&amp;query=Singh%2C+S">Shruti Singh</a>, <a href="/search/cs?searchtype=author&amp;query=Barzilay%2C+N">Nitzan Barzilay</a>, <a href="/search/cs?searchtype=author&amp;query=Lo%2C+K">Kyle Lo</a>, <a href="/search/cs?searchtype=author&amp;query=Hope%2C+T">Tom Hope</a>, <a href="/search/cs?searchtype=author&amp;query=Soldaini%2C+L">Luca Soldaini</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+S+Z">Shannon Zejiang Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Downey%2C+D">Doug Downey</a>, <a href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a>, <a href="/search/cs?searchtype=author&amp;query=Cohan%2C+A">Arman Cohan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.07835v3-abstract-short" 
style="display: inline;"> We present SciRIFF (Scientific Resource for Instruction-Following and Finetuning), a dataset of 137K instruction-following demonstrations for 54 tasks covering five essential scientific literature understanding capabilities: information extraction, summarization, question answering, claim verification, and classification. SciRIFF demonstrations are notable for their long input contexts, detailed t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.07835v3-abstract-full').style.display = 'inline'; document.getElementById('2406.07835v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.07835v3-abstract-full" style="display: none;"> We present SciRIFF (Scientific Resource for Instruction-Following and Finetuning), a dataset of 137K instruction-following demonstrations for 54 tasks covering five essential scientific literature understanding capabilities: information extraction, summarization, question answering, claim verification, and classification. SciRIFF demonstrations are notable for their long input contexts, detailed task specifications, and complex structured outputs. While instruction-following resources are available in specific domains such as clinical medicine and chemistry, SciRIFF is the first dataset focused on extracting and synthesizing information from research literature across a wide range of scientific fields. To demonstrate the utility of SciRIFF, we develop a sample-efficient strategy to adapt a general instruction-following model for science by performing additional finetuning on a mix of general-domain and SciRIFF demonstrations. In evaluations on nine held-out scientific tasks, our model -- called SciTulu -- improves over a strong LLM baseline by 28.1% and 6.5% at the 7B and 70B scales respectively, while maintaining general instruction-following performance within 2% of the baseline. 
We are optimistic that SciRIFF will facilitate the development and evaluation of LLMs to help researchers navigate the ever-growing body of scientific literature. We release our dataset, model checkpoints, and data processing and evaluation code to enable further research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.07835v3-abstract-full').style.display = 'none'; document.getElementById('2406.07835v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to NeurIPS Datasets and Benchmarks 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.06469">arXiv:2406.06469</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.06469">pdf</a>, <a href="https://arxiv.org/format/2406.06469">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Husky: A Unified, Open-Source Language Agent for Multi-Step Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kim%2C+J">Joongwon Kim</a>, <a 
href="/search/cs?searchtype=author&amp;query=Paranjape%2C+B">Bhargavi Paranjape</a>, <a href="/search/cs?searchtype=author&amp;query=Khot%2C+T">Tushar Khot</a>, <a href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.06469v1-abstract-short" style="display: inline;"> Language agents perform complex tasks by using tools to execute each step precisely. However, most existing agents are based on proprietary models or designed to target specific tasks, such as mathematics or multi-hop question answering. We introduce Husky, a holistic, open-source language agent that learns to reason over a unified action space to address a diverse set of complex tasks involving n&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.06469v1-abstract-full').style.display = 'inline'; document.getElementById('2406.06469v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.06469v1-abstract-full" style="display: none;"> Language agents perform complex tasks by using tools to execute each step precisely. However, most existing agents are based on proprietary models or designed to target specific tasks, such as mathematics or multi-hop question answering. We introduce Husky, a holistic, open-source language agent that learns to reason over a unified action space to address a diverse set of complex tasks involving numerical, tabular, and knowledge-based reasoning. Husky iterates between two stages: 1) generating the next action to take towards solving a given task and 2) executing the action using expert models and updating the current solution state. 
We identify a thorough ontology of actions for addressing complex tasks and curate high-quality data to train expert models for executing these actions. Our experiments show that Husky outperforms prior language agents across 14 evaluation datasets. Moreover, we introduce HuskyQA, a new evaluation set which stress tests language agents for mixed-tool reasoning, with a focus on retrieving missing knowledge and performing numerical reasoning. Despite using 7B models, Husky matches or even exceeds frontier LMs such as GPT-4 on these tasks, showcasing the efficacy of our holistic approach in addressing complex reasoning problems. Our code and models are available at https://github.com/agent-husky/Husky-v1. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.06469v1-abstract-full').style.display = 'none'; document.getElementById('2406.06469v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">50 pages, 42 figures. 
Project webpage available at <a href="https://agent-husky.github.io/">https://agent-husky.github.io/</a></span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.01197">arXiv:2404.01197</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.01197">pdf</a>, <a href="https://arxiv.org/format/2404.01197">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Getting it Right: Improving Spatial Consistency in Text-to-Image Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chatterjee%2C+A">Agneet Chatterjee</a>, <a href="/search/cs?searchtype=author&amp;query=Stan%2C+G+B+M">Gabriela Ben Melech Stan</a>, <a href="/search/cs?searchtype=author&amp;query=Aflalo%2C+E">Estelle Aflalo</a>, <a href="/search/cs?searchtype=author&amp;query=Paul%2C+S">Sayak Paul</a>, <a href="/search/cs?searchtype=author&amp;query=Ghosh%2C+D">Dhruba Ghosh</a>, <a href="/search/cs?searchtype=author&amp;query=Gokhale%2C+T">Tejas Gokhale</a>, <a href="/search/cs?searchtype=author&amp;query=Schmidt%2C+L">Ludwig Schmidt</a>, <a href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a>, <a href="/search/cs?searchtype=author&amp;query=Lal%2C+V">Vasudev Lal</a>, <a href="/search/cs?searchtype=author&amp;query=Baral%2C+C">Chitta Baral</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yezhou Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.01197v2-abstract-short" style="display: inline;"> One of the key shortcomings in current text-to-image (T2I) models is their inability to consistently generate images which faithfully follow the spatial 
relationships specified in the text prompt. In this paper, we offer a comprehensive investigation of this limitation, while also developing datasets and methods that support algorithmic solutions to improve spatial reasoning in T2I models. We find&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.01197v2-abstract-full').style.display = 'inline'; document.getElementById('2404.01197v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.01197v2-abstract-full" style="display: none;"> One of the key shortcomings in current text-to-image (T2I) models is their inability to consistently generate images which faithfully follow the spatial relationships specified in the text prompt. In this paper, we offer a comprehensive investigation of this limitation, while also developing datasets and methods that support algorithmic solutions to improve spatial reasoning in T2I models. We find that spatial relationships are under-represented in the image descriptions found in current vision-language datasets. To alleviate this data bottleneck, we create SPRIGHT, the first spatially focused, large-scale dataset, by re-captioning 6 million images from 4 widely used vision datasets and through a 3-fold evaluation and analysis pipeline, show that SPRIGHT improves the proportion of spatial relationships in existing datasets. We show the efficacy of SPRIGHT data by showing that using only $\sim$0.25% of SPRIGHT results in a 22% improvement in generating spatially accurate images while also improving FID and CMMD scores. We also find that training on images containing a larger number of objects leads to substantial improvements in spatial consistency, including state-of-the-art results on T2I-CompBench with a spatial score of 0.2133, by fine-tuning on &lt;500 images. 
Through a set of controlled experiments and ablations, we document additional findings that could support future work that seeks to understand factors that affect spatial consistency in text-to-image models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.01197v2-abstract-full').style.display = 'none'; document.getElementById('2404.01197v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ECCV 2024. Project page: https://spright-t2i.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.13787">arXiv:2403.13787</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.13787">pdf</a>, <a href="https://arxiv.org/format/2403.13787">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> RewardBench: Evaluating Reward Models for Language Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lambert%2C+N">Nathan Lambert</a>, <a href="/search/cs?searchtype=author&amp;query=Pyatkin%2C+V">Valentina Pyatkin</a>, <a href="/search/cs?searchtype=author&amp;query=Morrison%2C+J">Jacob Morrison</a>, <a href="/search/cs?searchtype=author&amp;query=Miranda%2C+L">LJ Miranda</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+B+Y">Bill 
Yuchen Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Chandu%2C+K">Khyathi Chandu</a>, <a href="/search/cs?searchtype=author&amp;query=Dziri%2C+N">Nouha Dziri</a>, <a href="/search/cs?searchtype=author&amp;query=Kumar%2C+S">Sachin Kumar</a>, <a href="/search/cs?searchtype=author&amp;query=Zick%2C+T">Tom Zick</a>, <a href="/search/cs?searchtype=author&amp;query=Choi%2C+Y">Yejin Choi</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+N+A">Noah A. Smith</a>, <a href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.13787v2-abstract-short" style="display: inline;"> Reward models (RMs) are at the crux of successfully using RLHF to align pretrained models to human preferences, yet there has been relatively little study that focuses on evaluation of those models. Evaluating reward models presents an opportunity to understand the opaque technologies used for alignment of language models and which values are embedded in them. Resources for reward model training a&hellip; <!-- More/Less toggles: href-less anchors, so role="button", tabindex="0" and an Enter/Space keydown handler are added to make them focusable and keyboard-operable. --><a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.13787v2-abstract-full').style.display = 'inline'; document.getElementById('2403.13787v2-abstract-short').style.display = 'none';" onkeydown="if (event.key === 'Enter' || event.key === ' ') { event.preventDefault(); this.click(); }" tabindex="0" role="button" aria-expanded="false">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.13787v2-abstract-full" style="display: none;"> Reward models (RMs) are at the crux of successfully using RLHF to align pretrained models to human preferences, yet there has been relatively little study that focuses on evaluation of those models. Evaluating reward models presents an opportunity to understand the opaque technologies used for alignment of language models and which values are embedded in them. 
Resources for reward model training and understanding are sparse in the nascent open-source community around them. To enhance scientific understanding of reward models, we present RewardBench, a benchmark dataset and code-base for evaluation. The RewardBench dataset is a collection of prompt-chosen-rejected trios spanning chat, reasoning, and safety, to benchmark how reward models perform on challenging, structured and out-of-distribution queries. We create specific comparison datasets for RMs that have subtle, but verifiable reasons (e.g. bugs, incorrect facts) why one answer should be preferred to another. On the RewardBench leaderboard, we evaluate reward models trained with a variety of methods, such as the direct MLE training of classifiers and the implicit reward modeling of Direct Preference Optimization (DPO). We present many findings on propensity for refusals, reasoning limitations, and instruction following shortcomings of various reward models towards a better understanding of the RLHF process. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.13787v2-abstract-full').style.display = 'none'; document.getElementById('2403.13787v2-abstract-short').style.display = 'inline';" onkeydown="if (event.key === 'Enter' || event.key === ' ') { event.preventDefault(); this.click(); }" tabindex="0" role="button" aria-expanded="true">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">44 pages, 19 figures, 12 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.03187">arXiv:2403.03187</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.03187">pdf</a>, <a href="https://arxiv.org/format/2403.03187">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Reliable, Adaptable, and Attributable Language Models with Retrieval </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Asai%2C+A">Akari Asai</a>, <a href="/search/cs?searchtype=author&amp;query=Zhong%2C+Z">Zexuan Zhong</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+D">Danqi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Koh%2C+P+W">Pang Wei Koh</a>, <a href="/search/cs?searchtype=author&amp;query=Zettlemoyer%2C+L">Luke Zettlemoyer</a>, <a href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a>, <a href="/search/cs?searchtype=author&amp;query=Yih%2C+W">Wen-tau Yih</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.03187v1-abstract-short" style="display: inline;"> Parametric language models (LMs), which are trained on vast amounts of web data, exhibit remarkable flexibility and capability. 
However, they still face practical challenges such as hallucinations, difficulty in adapting to new data distributions, and a lack of verifiability. In this position paper, we advocate for retrieval-augmented LMs to replace parametric LMs as the next generation of LMs. By&hellip; <!-- More/Less toggles: href-less anchors, so role="button", tabindex="0" and an Enter/Space keydown handler are added to make them focusable and keyboard-operable. --><a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.03187v1-abstract-full').style.display = 'inline'; document.getElementById('2403.03187v1-abstract-short').style.display = 'none';" onkeydown="if (event.key === 'Enter' || event.key === ' ') { event.preventDefault(); this.click(); }" tabindex="0" role="button" aria-expanded="false">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.03187v1-abstract-full" style="display: none;"> Parametric language models (LMs), which are trained on vast amounts of web data, exhibit remarkable flexibility and capability. However, they still face practical challenges such as hallucinations, difficulty in adapting to new data distributions, and a lack of verifiability. In this position paper, we advocate for retrieval-augmented LMs to replace parametric LMs as the next generation of LMs. By incorporating large-scale datastores during inference, retrieval-augmented LMs can be more reliable, adaptable, and attributable. Despite their potential, retrieval-augmented LMs have yet to be widely adopted due to several obstacles: specifically, current retrieval-augmented LMs struggle to leverage helpful text beyond knowledge-intensive tasks such as question answering, have limited interaction between retrieval and LM components, and lack the infrastructure for scaling. To address these, we propose a roadmap for developing general-purpose retrieval-augmented LMs. This involves a reconsideration of datastores and retrievers, the exploration of pipelines with improved retriever-LM interaction, and significant investment in infrastructure for efficient training and inference. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.03187v1-abstract-full').style.display = 'none'; document.getElementById('2403.03187v1-abstract-short').style.display = 'inline';" onkeydown="if (event.key === 'Enter' || event.key === ' ') { event.preventDefault(); this.click(); }" tabindex="0" role="button" aria-expanded="true">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.16797">arXiv:2402.16797</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.16797">pdf</a>, <a href="https://arxiv.org/format/2402.16797">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Set the Clock: Temporal Alignment of Pretrained Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+B">Bowen Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Brumbaugh%2C+Z">Zander Brumbaugh</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yizhong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+N+A">Noah A. Smith</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.16797v2-abstract-short" style="display: inline;"> Language models (LMs) are trained on web text originating from many points in time and, in general, without any explicit temporal grounding. 
This work investigates the temporal chaos of pretrained LMs and explores various methods to align their internal knowledge to a target time, which we call &#34;temporal alignment.&#34; To do this, we first automatically construct a dataset containing 20K time-sensiti&hellip; <!-- More/Less toggles: href-less anchors, so role="button", tabindex="0" and an Enter/Space keydown handler are added to make them focusable and keyboard-operable. --><a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.16797v2-abstract-full').style.display = 'inline'; document.getElementById('2402.16797v2-abstract-short').style.display = 'none';" onkeydown="if (event.key === 'Enter' || event.key === ' ') { event.preventDefault(); this.click(); }" tabindex="0" role="button" aria-expanded="false">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.16797v2-abstract-full" style="display: none;"> Language models (LMs) are trained on web text originating from many points in time and, in general, without any explicit temporal grounding. This work investigates the temporal chaos of pretrained LMs and explores various methods to align their internal knowledge to a target time, which we call &#34;temporal alignment.&#34; To do this, we first automatically construct a dataset containing 20K time-sensitive questions and their answers for each year from 2000 to 2023. Based on this dataset, we empirically show that pretrained LMs (e.g., LLaMa2), despite having a recent pretraining cutoff (e.g., 2022), mostly answer questions using earlier knowledge (e.g., in 2019). We then develop several methods, from prompting to finetuning, to align LMs to use their most recent knowledge when answering questions, and investigate various factors in this alignment. Our experiments demonstrate that aligning LLaMa2 to the year 2022 can enhance its performance by up to 62% according to that year&#39;s answers. This improvement occurs even without explicitly mentioning time information, indicating the possibility of aligning models&#39; internal sense of time after pretraining. Finally, we find that alignment to a historical time is also possible, with up to 2.8$\times$ the performance of the unaligned LM in 2010 if finetuning models to that year. 
These findings hint at the sophistication of LMs&#39; internal knowledge organization and the necessity of tuning them properly. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.16797v2-abstract-full').style.display = 'none'; document.getElementById('2402.16797v2-abstract-short').style.display = 'inline';" onkeydown="if (event.key === 'Enter' || event.key === ' ') { event.preventDefault(); this.click(); }" tabindex="0" role="button" aria-expanded="true">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted as Findings of ACL 2024. Our code and data is available at https://github.com/yizhongw/llm-temporal-alignment</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.10171">arXiv:2402.10171</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.10171">pdf</a>, <a href="https://arxiv.org/format/2402.10171">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Data Engineering for Scaling Language Models to 128K Context </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Fu%2C+Y">Yao Fu</a>, <a href="/search/cs?searchtype=author&amp;query=Panda%2C+R">Rameswar Panda</a>, <a href="/search/cs?searchtype=author&amp;query=Niu%2C+X">Xinyao Niu</a>, <a href="/search/cs?searchtype=author&amp;query=Yue%2C+X">Xiang Yue</a>, <a 
href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+Y">Yoon Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+H">Hao Peng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.10171v1-abstract-short" style="display: inline;"> We study the continual pretraining recipe for scaling language models&#39; context lengths to 128K, with a focus on data engineering. We hypothesize that long context modeling, in particular \textit{the ability to utilize information at arbitrary input locations}, is a capability that is mostly already acquired through large-scale pretraining, and that this capability can be readily extended to contex&hellip; <!-- More/Less toggles: href-less anchors, so role="button", tabindex="0" and an Enter/Space keydown handler are added to make them focusable and keyboard-operable. --><a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.10171v1-abstract-full').style.display = 'inline'; document.getElementById('2402.10171v1-abstract-short').style.display = 'none';" onkeydown="if (event.key === 'Enter' || event.key === ' ') { event.preventDefault(); this.click(); }" tabindex="0" role="button" aria-expanded="false">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.10171v1-abstract-full" style="display: none;"> We study the continual pretraining recipe for scaling language models&#39; context lengths to 128K, with a focus on data engineering. We hypothesize that long context modeling, in particular \textit{the ability to utilize information at arbitrary input locations}, is a capability that is mostly already acquired through large-scale pretraining, and that this capability can be readily extended to contexts substantially longer than seen during training~(e.g., 4K to 128K) through lightweight continual pretraining on appropriate data mixture. 
We investigate the \textit{quantity} and \textit{quality} of the data for continual pretraining: (1) for quantity, we show that 500 million to 5 billion tokens are enough to enable the model to retrieve information anywhere within the 128K context; (2) for quality, our results equally emphasize \textit{domain balance} and \textit{length upsampling}. Concretely, we find that naively upsampling longer data on certain domains like books, a common practice of existing work, gives suboptimal performance, and that a balanced domain mixture is important. We demonstrate that continual pretraining of the full model on 1B-5B tokens of such data is an effective and affordable strategy for scaling the context length of language models to 128K. Our recipe outperforms strong open-source long-context models and closes the gap to frontier models like GPT-4 128K. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.10171v1-abstract-full').style.display = 'none'; document.getElementById('2402.10171v1-abstract-short').style.display = 'inline';" onkeydown="if (event.key === 'Enter' || event.key === ' ') { event.preventDefault(); this.click(); }" tabindex="0" role="button" aria-expanded="true">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Code at https://github.com/FranxYao/Long-Context-Data-Engineering</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.07841">arXiv:2402.07841</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.07841">pdf</a>, <a href="https://arxiv.org/format/2402.07841">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Do Membership Inference Attacks Work on Large Language Models? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Duan%2C+M">Michael Duan</a>, <a href="/search/cs?searchtype=author&amp;query=Suri%2C+A">Anshuman Suri</a>, <a href="/search/cs?searchtype=author&amp;query=Mireshghallah%2C+N">Niloofar Mireshghallah</a>, <a href="/search/cs?searchtype=author&amp;query=Min%2C+S">Sewon Min</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+W">Weijia Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Zettlemoyer%2C+L">Luke Zettlemoyer</a>, <a href="/search/cs?searchtype=author&amp;query=Tsvetkov%2C+Y">Yulia Tsvetkov</a>, <a href="/search/cs?searchtype=author&amp;query=Choi%2C+Y">Yejin Choi</a>, <a href="/search/cs?searchtype=author&amp;query=Evans%2C+D">David Evans</a>, <a href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.07841v2-abstract-short" style="display: inline;"> Membership inference attacks (MIAs) attempt to predict whether a particular datapoint is a member of a target 
model&#39;s training data. Despite extensive research on traditional machine learning models, there has been limited work studying MIA on the pre-training data of large language models (LLMs). We perform a large-scale evaluation of MIAs over a suite of language models (LMs) trained on the Pile&hellip; <!-- More/Less toggles: href-less anchors, so role="button", tabindex="0" and an Enter/Space keydown handler are added to make them focusable and keyboard-operable. --><a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.07841v2-abstract-full').style.display = 'inline'; document.getElementById('2402.07841v2-abstract-short').style.display = 'none';" onkeydown="if (event.key === 'Enter' || event.key === ' ') { event.preventDefault(); this.click(); }" tabindex="0" role="button" aria-expanded="false">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.07841v2-abstract-full" style="display: none;"> Membership inference attacks (MIAs) attempt to predict whether a particular datapoint is a member of a target model&#39;s training data. Despite extensive research on traditional machine learning models, there has been limited work studying MIA on the pre-training data of large language models (LLMs). We perform a large-scale evaluation of MIAs over a suite of language models (LMs) trained on the Pile, ranging from 160M to 12B parameters. We find that MIAs barely outperform random guessing for most settings across varying LLM sizes and domains. Our further analyses reveal that this poor performance can be attributed to (1) the combination of a large dataset and few training iterations, and (2) an inherently fuzzy boundary between members and non-members. We identify specific settings where LLMs have been shown to be vulnerable to membership inference and show that the apparent success in such settings can be attributed to a distribution shift, such as when members and non-members are drawn from the seemingly identical domain but with different temporal ranges. We release our code and data as a unified benchmark package that includes all existing MIAs, supporting future work. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.07841v2-abstract-full').style.display = 'none'; document.getElementById('2402.07841v2-abstract-short').style.display = 'inline';" onkeydown="if (event.key === 'Enter' || event.key === ' ') { event.preventDefault(); this.click(); }" tabindex="0" role="button" aria-expanded="true">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at Conference on Language Modeling (COLM), 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.00838">arXiv:2402.00838</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.00838">pdf</a>, <a href="https://arxiv.org/format/2402.00838">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> OLMo: Accelerating the Science of Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Groeneveld%2C+D">Dirk Groeneveld</a>, <a href="/search/cs?searchtype=author&amp;query=Beltagy%2C+I">Iz Beltagy</a>, <a href="/search/cs?searchtype=author&amp;query=Walsh%2C+P">Pete Walsh</a>, <a href="/search/cs?searchtype=author&amp;query=Bhagia%2C+A">Akshita Bhagia</a>, <a href="/search/cs?searchtype=author&amp;query=Kinney%2C+R">Rodney Kinney</a>, <a href="/search/cs?searchtype=author&amp;query=Tafjord%2C+O">Oyvind Tafjord</a>, <a href="/search/cs?searchtype=author&amp;query=Jha%2C+A+H">Ananya Harsh Jha</a>, <a 
href="/search/cs?searchtype=author&amp;query=Ivison%2C+H">Hamish Ivison</a>, <a href="/search/cs?searchtype=author&amp;query=Magnusson%2C+I">Ian Magnusson</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yizhong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Arora%2C+S">Shane Arora</a>, <a href="/search/cs?searchtype=author&amp;query=Atkinson%2C+D">David Atkinson</a>, <a href="/search/cs?searchtype=author&amp;query=Authur%2C+R">Russell Authur</a>, <a href="/search/cs?searchtype=author&amp;query=Chandu%2C+K+R">Khyathi Raghavi Chandu</a>, <a href="/search/cs?searchtype=author&amp;query=Cohan%2C+A">Arman Cohan</a>, <a href="/search/cs?searchtype=author&amp;query=Dumas%2C+J">Jennifer Dumas</a>, <a href="/search/cs?searchtype=author&amp;query=Elazar%2C+Y">Yanai Elazar</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+Y">Yuling Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Hessel%2C+J">Jack Hessel</a>, <a href="/search/cs?searchtype=author&amp;query=Khot%2C+T">Tushar Khot</a>, <a href="/search/cs?searchtype=author&amp;query=Merrill%2C+W">William Merrill</a>, <a href="/search/cs?searchtype=author&amp;query=Morrison%2C+J">Jacob Morrison</a>, <a href="/search/cs?searchtype=author&amp;query=Muennighoff%2C+N">Niklas Muennighoff</a>, <a href="/search/cs?searchtype=author&amp;query=Naik%2C+A">Aakanksha Naik</a>, <a href="/search/cs?searchtype=author&amp;query=Nam%2C+C">Crystal Nam</a> , et al. (18 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.00838v4-abstract-short" style="display: inline;"> Language models (LMs) have become ubiquitous in both NLP research and in commercial product offerings. 
As their commercial importance has surged, the most powerful models have become closed off, gated behind proprietary interfaces, with important details of their training data, architectures, and development undisclosed. Given the importance of these details in scientifically studying these models&hellip; <!-- More/Less toggles: href-less anchors, so role="button", tabindex="0" and an Enter/Space keydown handler are added to make them focusable and keyboard-operable. --><a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.00838v4-abstract-full').style.display = 'inline'; document.getElementById('2402.00838v4-abstract-short').style.display = 'none';" onkeydown="if (event.key === 'Enter' || event.key === ' ') { event.preventDefault(); this.click(); }" tabindex="0" role="button" aria-expanded="false">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.00838v4-abstract-full" style="display: none;"> Language models (LMs) have become ubiquitous in both NLP research and in commercial product offerings. As their commercial importance has surged, the most powerful models have become closed off, gated behind proprietary interfaces, with important details of their training data, architectures, and development undisclosed. Given the importance of these details in scientifically studying these models, including their biases and potential risks, we believe it is essential for the research community to have access to powerful, truly open LMs. To this end, we have built OLMo, a competitive, truly Open Language Model, to enable the scientific study of language models. Unlike most prior efforts that have only released model weights and inference code, we release OLMo alongside open training data and training and evaluation code. We hope this release will empower the open research community and inspire a new wave of innovation. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.00838v4-abstract-full').style.display = 'none'; document.getElementById('2402.00838v4-abstract-short').style.display = 'inline';" onkeydown="if (event.key === 'Enter' || event.key === ' ') { event.preventDefault(); this.click(); }" tabindex="0" role="button" aria-expanded="true">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.00159">arXiv:2402.00159</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.00159">pdf</a>, <a href="https://arxiv.org/format/2402.00159">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Dolma: an Open Corpus of Three Trillion Tokens for Language Model Pretraining Research </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Soldaini%2C+L">Luca Soldaini</a>, <a href="/search/cs?searchtype=author&amp;query=Kinney%2C+R">Rodney Kinney</a>, <a href="/search/cs?searchtype=author&amp;query=Bhagia%2C+A">Akshita Bhagia</a>, <a href="/search/cs?searchtype=author&amp;query=Schwenk%2C+D">Dustin Schwenk</a>, <a href="/search/cs?searchtype=author&amp;query=Atkinson%2C+D">David Atkinson</a>, <a href="/search/cs?searchtype=author&amp;query=Authur%2C+R">Russell Authur</a>, <a href="/search/cs?searchtype=author&amp;query=Bogin%2C+B">Ben Bogin</a>, <a href="/search/cs?searchtype=author&amp;query=Chandu%2C+K">Khyathi Chandu</a>, <a href="/search/cs?searchtype=author&amp;query=Dumas%2C+J">Jennifer Dumas</a>, <a href="/search/cs?searchtype=author&amp;query=Elazar%2C+Y">Yanai 
Elazar</a>, <a href="/search/cs?searchtype=author&amp;query=Hofmann%2C+V">Valentin Hofmann</a>, <a href="/search/cs?searchtype=author&amp;query=Jha%2C+A+H">Ananya Harsh Jha</a>, <a href="/search/cs?searchtype=author&amp;query=Kumar%2C+S">Sachin Kumar</a>, <a href="/search/cs?searchtype=author&amp;query=Lucy%2C+L">Li Lucy</a>, <a href="/search/cs?searchtype=author&amp;query=Lyu%2C+X">Xinxi Lyu</a>, <a href="/search/cs?searchtype=author&amp;query=Lambert%2C+N">Nathan Lambert</a>, <a href="/search/cs?searchtype=author&amp;query=Magnusson%2C+I">Ian Magnusson</a>, <a href="/search/cs?searchtype=author&amp;query=Morrison%2C+J">Jacob Morrison</a>, <a href="/search/cs?searchtype=author&amp;query=Muennighoff%2C+N">Niklas Muennighoff</a>, <a href="/search/cs?searchtype=author&amp;query=Naik%2C+A">Aakanksha Naik</a>, <a href="/search/cs?searchtype=author&amp;query=Nam%2C+C">Crystal Nam</a>, <a href="/search/cs?searchtype=author&amp;query=Peters%2C+M+E">Matthew E. Peters</a>, <a href="/search/cs?searchtype=author&amp;query=Ravichander%2C+A">Abhilasha Ravichander</a>, <a href="/search/cs?searchtype=author&amp;query=Richardson%2C+K">Kyle Richardson</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+Z">Zejiang Shen</a> , et al. (11 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.00159v2-abstract-short" style="display: inline;"> Information about pretraining corpora used to train the current best-performing language models is seldom discussed: commercial models rarely detail their data, and even open models are often released without accompanying training data or recipes to reproduce them. 
As a result, it is challenging to conduct and advance scientific research on language modeling, such as understanding how training dat&hellip; <!-- More/Less toggles: href-less anchors, so role="button", tabindex="0" and an Enter/Space keydown handler are added to make them focusable and keyboard-operable. --><a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.00159v2-abstract-full').style.display = 'inline'; document.getElementById('2402.00159v2-abstract-short').style.display = 'none';" onkeydown="if (event.key === 'Enter' || event.key === ' ') { event.preventDefault(); this.click(); }" tabindex="0" role="button" aria-expanded="false">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.00159v2-abstract-full" style="display: none;"> Information about pretraining corpora used to train the current best-performing language models is seldom discussed: commercial models rarely detail their data, and even open models are often released without accompanying training data or recipes to reproduce them. As a result, it is challenging to conduct and advance scientific research on language modeling, such as understanding how training data impacts model capabilities and limitations. To facilitate scientific research on language model pretraining, we curate and release Dolma, a three-trillion-token English corpus, built from a diverse mixture of web content, scientific papers, code, public-domain books, social media, and encyclopedic materials. We extensively document Dolma, including its design principles, details about its construction, and a summary of its contents. We present analyses and experimental results on intermediate states of Dolma to share what we have learned about important data curation practices. Finally, we open-source our data curation toolkit to enable reproduction of our work as well as support further research in large-scale data curation. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.00159v2-abstract-full').style.display = 'none'; document.getElementById('2402.00159v2-abstract-short').style.display = 'inline';" onkeydown="if (event.key === 'Enter' || event.key === ' ') { event.preventDefault(); this.click(); }" tabindex="0" role="button" aria-expanded="true">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at ACL 2024; Dataset: https://hf.co/datasets/allenai/dolma; Code: https://github.com/allenai/dolma</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.17377">arXiv:2401.17377</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2401.17377">pdf</a>, <a href="https://arxiv.org/format/2401.17377">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Infini-gram: Scaling Unbounded n-gram Language Models to a Trillion Tokens </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jiacheng Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Min%2C+S">Sewon Min</a>, <a href="/search/cs?searchtype=author&amp;query=Zettlemoyer%2C+L">Luke Zettlemoyer</a>, <a href="/search/cs?searchtype=author&amp;query=Choi%2C+Y">Yejin Choi</a>, <a 
href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.17377v3-abstract-short" style="display: inline;"> Are $n$-gram language models still relevant in this era of neural large language models (LLMs)? Our answer is yes, and we showcase their values in both text analysis and improving neural LLMs. This was done by modernizing $n$-gram LMs in two aspects. First, we train them at the same data scale as neural LLMs -- 5 trillion tokens. This is the largest $n$-gram LM ever built. Second, existing $n$-gra&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.17377v3-abstract-full').style.display = 'inline'; document.getElementById('2401.17377v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.17377v3-abstract-full" style="display: none;"> Are $n$-gram language models still relevant in this era of neural large language models (LLMs)? Our answer is yes, and we showcase their values in both text analysis and improving neural LLMs. This was done by modernizing $n$-gram LMs in two aspects. First, we train them at the same data scale as neural LLMs -- 5 trillion tokens. This is the largest $n$-gram LM ever built. Second, existing $n$-gram LMs use small $n$ which hinders their performance; we instead allow $n$ to be arbitrarily large, by introducing a new $\infty$-gram LM with backoff. Instead of pre-computing $n$-gram count tables (which would be very expensive), we develop an engine named infini-gram -- powered by suffix arrays -- that can compute $\infty$-gram (as well as $n$-gram with arbitrary $n$) probabilities with millisecond-level latency. 
The $\infty$-gram framework and infini-gram engine enable us to conduct many novel and interesting analyses of human-written and machine-generated text: we find that the $\infty$-gram LM has fairly high accuracy for next-token prediction (47%), and can complement neural LLMs to greatly reduce their perplexity. When analyzing machine-generated text, we also observe irregularities in the machine--$\infty$-gram agreement level with respect to the suffix length, which indicates deficiencies in neural LLM pretraining and the positional embeddings of Transformers. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.17377v3-abstract-full').style.display = 'none'; document.getElementById('2401.17377v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.12200">arXiv:2401.12200</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2401.12200">pdf</a>, <a href="https://arxiv.org/format/2401.12200">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> APT: Adaptive Pruning and Tuning Pretrained Language Models for Efficient Training and Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+B">Bowen Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+Q">Qingqing Cao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.12200v2-abstract-short" style="display: inline;"> Fine-tuning and inference with large Language Models (LM) are generally known to be expensive. Parameter-efficient fine-tuning over pretrained LMs reduces training memory by updating a small number of LM parameters but does not improve inference efficiency. Structured pruning improves LM inference efficiency by removing consistent parameter blocks, yet often increases training memory and time. 
To&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.12200v2-abstract-full').style.display = 'inline'; document.getElementById('2401.12200v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.12200v2-abstract-full" style="display: none;"> Fine-tuning and inference with large Language Models (LM) are generally known to be expensive. Parameter-efficient fine-tuning over pretrained LMs reduces training memory by updating a small number of LM parameters but does not improve inference efficiency. Structured pruning improves LM inference efficiency by removing consistent parameter blocks, yet often increases training memory and time. To improve both training and inference efficiency, we introduce APT that adaptively prunes and tunes parameters for the LMs. At the early stage of fine-tuning, APT dynamically adds salient tuning parameters for fast and accurate convergence while discarding unimportant parameters for efficiency. Compared to baselines, our experiments show that APT maintains up to 98% task performance when pruning RoBERTa and T5 models with 40% parameters left while keeping 86.4% LLaMA models&#39; performance with 70% parameters remained. Furthermore, APT speeds up LMs fine-tuning by up to 8x and reduces large LMs memory training footprint by up to 70%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.12200v2-abstract-full').style.display = 'none'; document.getElementById('2401.12200v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ICML 2024 Oral; code available at https://github.com/ROIM1998/APT</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.06855">arXiv:2401.06855</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2401.06855">pdf</a>, <a href="https://arxiv.org/format/2401.06855">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Fine-grained Hallucination Detection and Editing for Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Mishra%2C+A">Abhika Mishra</a>, <a href="/search/cs?searchtype=author&amp;query=Asai%2C+A">Akari Asai</a>, <a href="/search/cs?searchtype=author&amp;query=Balachandran%2C+V">Vidhisha Balachandran</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yizhong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Neubig%2C+G">Graham Neubig</a>, <a href="/search/cs?searchtype=author&amp;query=Tsvetkov%2C+Y">Yulia Tsvetkov</a>, <a href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.06855v4-abstract-short" style="display: inline;"> Large language models (LMs) are prone to generate factual errors, which are often called hallucinations. In this paper, we introduce a comprehensive taxonomy of hallucinations and argue that hallucinations manifest in diverse forms, each requiring varying degrees of careful assessments to verify factuality. 
We propose a novel task of automatic fine-grained hallucination detection and construct a n&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.06855v4-abstract-full').style.display = 'inline'; document.getElementById('2401.06855v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.06855v4-abstract-full" style="display: none;"> Large language models (LMs) are prone to generate factual errors, which are often called hallucinations. In this paper, we introduce a comprehensive taxonomy of hallucinations and argue that hallucinations manifest in diverse forms, each requiring varying degrees of careful assessments to verify factuality. We propose a novel task of automatic fine-grained hallucination detection and construct a new evaluation benchmark, FavaBench, that includes about one thousand fine-grained human judgments on three LM outputs across various domains. Our analysis reveals that ChatGPT and Llama2-Chat (70B, 7B) exhibit diverse types of hallucinations in the majority of their outputs in information-seeking scenarios. We train FAVA, a retrieval-augmented LM by carefully creating synthetic data to detect and correct fine-grained hallucinations. On our benchmark, our automatic and human evaluations show that FAVA significantly outperforms ChatGPT and GPT-4 on fine-grained hallucination detection, and edits suggested by FAVA improve the factuality of LM-generated text. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.06855v4-abstract-full').style.display = 'none'; document.getElementById('2401.06855v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Our code, data, and demo are available at https://fine-grained-hallucination.github.io. Published as a conference paper at COLM 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.10523">arXiv:2312.10523</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2312.10523">pdf</a>, <a href="https://arxiv.org/format/2312.10523">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Paloma: A Benchmark for Evaluating Language Model Fit </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Magnusson%2C+I">Ian Magnusson</a>, <a href="/search/cs?searchtype=author&amp;query=Bhagia%2C+A">Akshita Bhagia</a>, <a href="/search/cs?searchtype=author&amp;query=Hofmann%2C+V">Valentin Hofmann</a>, <a href="/search/cs?searchtype=author&amp;query=Soldaini%2C+L">Luca 
Soldaini</a>, <a href="/search/cs?searchtype=author&amp;query=Jha%2C+A+H">Ananya Harsh Jha</a>, <a href="/search/cs?searchtype=author&amp;query=Tafjord%2C+O">Oyvind Tafjord</a>, <a href="/search/cs?searchtype=author&amp;query=Schwenk%2C+D">Dustin Schwenk</a>, <a href="/search/cs?searchtype=author&amp;query=Walsh%2C+E+P">Evan Pete Walsh</a>, <a href="/search/cs?searchtype=author&amp;query=Elazar%2C+Y">Yanai Elazar</a>, <a href="/search/cs?searchtype=author&amp;query=Lo%2C+K">Kyle Lo</a>, <a href="/search/cs?searchtype=author&amp;query=Groeneveld%2C+D">Dirk Groeneveld</a>, <a href="/search/cs?searchtype=author&amp;query=Beltagy%2C+I">Iz Beltagy</a>, <a href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+N+A">Noah A. Smith</a>, <a href="/search/cs?searchtype=author&amp;query=Richardson%2C+K">Kyle Richardson</a>, <a href="/search/cs?searchtype=author&amp;query=Dodge%2C+J">Jesse Dodge</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.10523v1-abstract-short" style="display: inline;"> Language models (LMs) commonly report perplexity on monolithic data held out from training. Implicitly or explicitly, this data is composed of domains$\unicode{x2013}$varying distributions of language. 
Rather than assuming perplexity on one distribution extrapolates to others, Perplexity Analysis for Language Model Assessment (Paloma), measures LM fit to 585 text domains, ranging from nytimes.com&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.10523v1-abstract-full').style.display = 'inline'; document.getElementById('2312.10523v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.10523v1-abstract-full" style="display: none;"> Language models (LMs) commonly report perplexity on monolithic data held out from training. Implicitly or explicitly, this data is composed of domains$\unicode{x2013}$varying distributions of language. Rather than assuming perplexity on one distribution extrapolates to others, Perplexity Analysis for Language Model Assessment (Paloma), measures LM fit to 585 text domains, ranging from nytimes.com to r/depression on Reddit. We invite submissions to our benchmark and organize results by comparability based on compliance with guidelines such as removal of benchmark contamination from pretraining. Submissions can also record parameter and training token count to make comparisons of Pareto efficiency for performance as a function of these measures of cost. We populate our benchmark with results from 6 baselines pretrained on popular corpora. In case studies, we demonstrate analyses that are possible with Paloma, such as finding that pretraining without data beyond Common Crawl leads to inconsistent fit to many domains. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.10523v1-abstract-full').style.display = 'none'; document.getElementById('2312.10523v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page: https://paloma.allen.ai/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.10702">arXiv:2311.10702</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2311.10702">pdf</a>, <a href="https://arxiv.org/format/2311.10702">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Camels in a Changing Climate: Enhancing LM Adaptation with Tulu 2 </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ivison%2C+H">Hamish Ivison</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yizhong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Pyatkin%2C+V">Valentina Pyatkin</a>, <a href="/search/cs?searchtype=author&amp;query=Lambert%2C+N">Nathan Lambert</a>, <a href="/search/cs?searchtype=author&amp;query=Peters%2C+M">Matthew Peters</a>, <a href="/search/cs?searchtype=author&amp;query=Dasigi%2C+P">Pradeep Dasigi</a>, <a href="/search/cs?searchtype=author&amp;query=Jang%2C+J">Joel Jang</a>, <a href="/search/cs?searchtype=author&amp;query=Wadden%2C+D">David Wadden</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+N+A">Noah 
A. Smith</a>, <a href="/search/cs?searchtype=author&amp;query=Beltagy%2C+I">Iz Beltagy</a>, <a href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.10702v2-abstract-short" style="display: inline;"> Since the release of TÜLU [Wang et al., 2023b], open resources for instruction tuning have developed quickly, from better base models to new finetuning techniques. We test and incorporate a number of these advances into TÜLU, resulting in TÜLU 2, a suite of improved TÜLU models for advancing the understanding and best practices of adapting pretrained language models to downstream tasks and user pr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.10702v2-abstract-full').style.display = 'inline'; document.getElementById('2311.10702v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.10702v2-abstract-full" style="display: none;"> Since the release of TÜLU [Wang et al., 2023b], open resources for instruction tuning have developed quickly, from better base models to new finetuning techniques. We test and incorporate a number of these advances into TÜLU, resulting in TÜLU 2, a suite of improved TÜLU models for advancing the understanding and best practices of adapting pretrained language models to downstream tasks and user preferences. 
Concretely, we release: (1) TÜLU-V2-mix, an improved collection of high-quality instruction datasets; (2) TÜLU 2, LLAMA-2 models finetuned on the V2 mixture; (3) TÜLU 2+DPO, TÜLU 2 models trained with direct preference optimization (DPO), including the largest DPO-trained model to date (TÜLU 2+DPO 70B); (4) CODE TÜLU 2, CODE LLAMA models finetuned on our V2 mix that outperform CODE LLAMA and its instruction-tuned variant, CODE LLAMA-Instruct. Our evaluation from multiple perspectives shows that the TÜLU 2 suite achieves state-of-the-art performance among open models and matches or exceeds the performance of GPT-3.5-turbo-0301 on several benchmarks. We release all the checkpoints, data, training and evaluation code to facilitate future open efforts on adapting large language models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.10702v2-abstract-full').style.display = 'none'; document.getElementById('2311.10702v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">technical report; fixed zephyr numbers</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.20707">arXiv:2310.20707</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2310.20707">pdf</a>, <a href="https://arxiv.org/format/2310.20707">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> What&#39;s In My Big Data? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Elazar%2C+Y">Yanai Elazar</a>, <a href="/search/cs?searchtype=author&amp;query=Bhagia%2C+A">Akshita Bhagia</a>, <a href="/search/cs?searchtype=author&amp;query=Magnusson%2C+I">Ian Magnusson</a>, <a href="/search/cs?searchtype=author&amp;query=Ravichander%2C+A">Abhilasha Ravichander</a>, <a href="/search/cs?searchtype=author&amp;query=Schwenk%2C+D">Dustin Schwenk</a>, <a href="/search/cs?searchtype=author&amp;query=Suhr%2C+A">Alane Suhr</a>, <a href="/search/cs?searchtype=author&amp;query=Walsh%2C+P">Pete Walsh</a>, <a href="/search/cs?searchtype=author&amp;query=Groeneveld%2C+D">Dirk Groeneveld</a>, <a href="/search/cs?searchtype=author&amp;query=Soldaini%2C+L">Luca Soldaini</a>, <a href="/search/cs?searchtype=author&amp;query=Singh%2C+S">Sameer Singh</a>, <a href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hanna Hajishirzi</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+N+A">Noah A. 
Smith</a>, <a href="/search/cs?searchtype=author&amp;query=Dodge%2C+J">Jesse Dodge</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.20707v2-abstract-short" style="display: inline;"> Large text corpora are the backbone of language models. However, we have a limited understanding of the content of these corpora, including general statistics, quality, social factors, and inclusion of evaluation data (contamination). In this work, we propose What&#39;s In My Big Data? (WIMBD), a platform and a set of sixteen analyses that allow us to reveal and compare the contents of large text corp&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.20707v2-abstract-full').style.display = 'inline'; document.getElementById('2310.20707v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.20707v2-abstract-full" style="display: none;"> Large text corpora are the backbone of language models. However, we have a limited understanding of the content of these corpora, including general statistics, quality, social factors, and inclusion of evaluation data (contamination). In this work, we propose What&#39;s In My Big Data? (WIMBD), a platform and a set of sixteen analyses that allow us to reveal and compare the contents of large text corpora. WIMBD builds on two basic capabilities -- count and search -- at scale, which allows us to analyze more than 35 terabytes on a standard compute node. We apply WIMBD to ten different corpora used to train popular language models, including C4, The Pile, and RedPajama. 
Our analysis uncovers several surprising and previously undocumented findings about these corpora, including the high prevalence of duplicate, synthetic, and low-quality content, personally identifiable information, toxic language, and benchmark contamination. For instance, we find that about 50% of the documents in RedPajama and LAION-2B-en are duplicates. In addition, several datasets used for benchmarking models trained on such corpora are contaminated with respect to important benchmarks, including the Winograd Schema Challenge and parts of GLUE and SuperGLUE. We open-source WIMBD&#39;s code and artifacts to provide a standard set of evaluations for new text-based corpora and to encourage more analyses and transparency around them. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.20707v2-abstract-full').style.display = 'none'; document.getElementById('2310.20707v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published at ICLR 2024 spotlight</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.12126">arXiv:2310.12126</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2310.12126">pdf</a>, <a href="https://arxiv.org/format/2310.12126">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> SHARCS: Efficient Transformers through Routing with Dynamic Width Sub-networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Salehi%2C+M">Mohammadreza Salehi</a>, <a href="/search/cs?searchtype=author&amp;query=Mehta%2C+S">Sachin Mehta</a>, <a href="/search/cs?searchtype=author&amp;query=Kusupati%2C+A">Aditya Kusupati</a>, <a href="/search/cs?searchtype=author&amp;query=Farhadi%2C+A">Ali Farhadi</a>, <a href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.12126v1-abstract-short" style="display: inline;"> We introduce SHARCS for adaptive inference that takes into account the hardness of input samples. SHARCS can train a router on any transformer network, enabling the model to direct different samples to sub-networks with varying widths. 
Our experiments demonstrate that: (1) SHARCS outperforms or complements existing per-sample adaptive inference methods across various classification tasks in terms&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.12126v1-abstract-full').style.display = 'inline'; document.getElementById('2310.12126v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.12126v1-abstract-full" style="display: none;"> We introduce SHARCS for adaptive inference that takes into account the hardness of input samples. SHARCS can train a router on any transformer network, enabling the model to direct different samples to sub-networks with varying widths. Our experiments demonstrate that: (1) SHARCS outperforms or complements existing per-sample adaptive inference methods across various classification tasks in terms of accuracy vs. FLOPs; (2) SHARCS generalizes across different architectures and can be even applied to compressed and efficient transformer encoders to further improve their efficiency; (3) SHARCS can provide a 2 times inference speed up at an insignificant drop in accuracy. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.12126v1-abstract-full').style.display = 'none'; document.getElementById('2310.12126v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.11564">arXiv:2310.11564</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2310.11564">pdf</a>, <a href="https://arxiv.org/format/2310.11564">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Personalized Soups: Personalized Large Language Model Alignment via Post-hoc Parameter Merging </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jang%2C+J">Joel Jang</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+S">Seungone Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+B+Y">Bill Yuchen Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yizhong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Hessel%2C+J">Jack Hessel</a>, <a href="/search/cs?searchtype=author&amp;query=Zettlemoyer%2C+L">Luke Zettlemoyer</a>, <a href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a>, <a href="/search/cs?searchtype=author&amp;query=Choi%2C+Y">Yejin Choi</a>, <a href="/search/cs?searchtype=author&amp;query=Ammanabrolu%2C+P">Prithviraj Ammanabrolu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.11564v1-abstract-short" style="display: inline;"> While Reinforcement Learning from Human Feedback (RLHF) aligns Large Language Models (LLMs) with general, aggregate human preferences, it is suboptimal for learning diverse, individual perspectives. 
In this work, we study Reinforcement Learning from Personalized Human Feedback (RLPHF) problem, wherein LLMs are aligned to multiple (sometimes conflicting) preferences by modeling alignment as a Multi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.11564v1-abstract-full').style.display = 'inline'; document.getElementById('2310.11564v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.11564v1-abstract-full" style="display: none;"> While Reinforcement Learning from Human Feedback (RLHF) aligns Large Language Models (LLMs) with general, aggregate human preferences, it is suboptimal for learning diverse, individual perspectives. In this work, we study Reinforcement Learning from Personalized Human Feedback (RLPHF) problem, wherein LLMs are aligned to multiple (sometimes conflicting) preferences by modeling alignment as a Multi-Objective Reinforcement Learning (MORL) problem. Compared to strong single-objective baselines, we show that we can achieve personalized alignment by decomposing preferences into multiple dimensions. These dimensions are defined based on personalizations that are declared as desirable by the user. In this work, we show that they can be efficiently trained independently in a distributed manner and combined effectively post-hoc through parameter merging. The code is available at https://github.com/joeljang/RLPHF. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.11564v1-abstract-full').style.display = 'none'; document.getElementById('2310.11564v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preprint</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.11513">arXiv:2310.11513</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2310.11513">pdf</a>, <a href="https://arxiv.org/format/2310.11513">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> GenEval: An Object-Focused Framework for Evaluating Text-to-Image Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ghosh%2C+D">Dhruba Ghosh</a>, <a href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hanna Hajishirzi</a>, <a href="/search/cs?searchtype=author&amp;query=Schmidt%2C+L">Ludwig Schmidt</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.11513v1-abstract-short" style="display: inline;"> Recent breakthroughs in diffusion models, multimodal pretraining, and efficient finetuning have led to an explosion of text-to-image generative models. Given human evaluation is expensive and difficult to scale, automated methods are critical for evaluating the increasingly large number of new models. 
However, most current automated evaluation metrics like FID or CLIPScore only offer a holistic me&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.11513v1-abstract-full').style.display = 'inline'; document.getElementById('2310.11513v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.11513v1-abstract-full" style="display: none;"> Recent breakthroughs in diffusion models, multimodal pretraining, and efficient finetuning have led to an explosion of text-to-image generative models. Given human evaluation is expensive and difficult to scale, automated methods are critical for evaluating the increasingly large number of new models. However, most current automated evaluation metrics like FID or CLIPScore only offer a holistic measure of image quality or image-text alignment, and are unsuited for fine-grained or instance-level analysis. In this paper, we introduce GenEval, an object-focused framework to evaluate compositional image properties such as object co-occurrence, position, count, and color. We show that current object detection models can be leveraged to evaluate text-to-image models on a variety of generation tasks with strong human agreement, and that other discriminative vision models can be linked to this pipeline to further verify properties like object color. We then evaluate several open-source text-to-image models and analyze their relative generative capabilities on our benchmark. We find that recent models demonstrate significant improvement on these tasks, though they are still lacking in complex capabilities such as spatial relations and attribute binding. Finally, we demonstrate how GenEval might be used to help discover existing failure modes, in order to inform development of the next generation of text-to-image models. Our code to run the GenEval framework is publicly available at https://github.com/djghosh13/geneval. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.11513v1-abstract-full').style.display = 'none'; document.getElementById('2310.11513v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.11511">arXiv:2310.11511</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2310.11511">pdf</a>, <a href="https://arxiv.org/format/2310.11511">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Self-RAG: Learning to Retrieve, Generate, and Critique through Self-Reflection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Asai%2C+A">Akari Asai</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Z">Zeqiu Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yizhong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Sil%2C+A">Avirup Sil</a>, <a href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.11511v1-abstract-short" style="display: inline;"> Despite their remarkable capabilities, large language models (LLMs) 
often produce responses containing factual inaccuracies due to their sole reliance on the parametric knowledge they encapsulate. Retrieval-Augmented Generation (RAG), an ad hoc approach that augments LMs with retrieval of relevant knowledge, decreases such issues. However, indiscriminately retrieving and incorporating a fixed numb&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.11511v1-abstract-full').style.display = 'inline'; document.getElementById('2310.11511v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.11511v1-abstract-full" style="display: none;"> Despite their remarkable capabilities, large language models (LLMs) often produce responses containing factual inaccuracies due to their sole reliance on the parametric knowledge they encapsulate. Retrieval-Augmented Generation (RAG), an ad hoc approach that augments LMs with retrieval of relevant knowledge, decreases such issues. However, indiscriminately retrieving and incorporating a fixed number of retrieved passages, regardless of whether retrieval is necessary, or passages are relevant, diminishes LM versatility or can lead to unhelpful response generation. We introduce a new framework called Self-Reflective Retrieval-Augmented Generation (Self-RAG) that enhances an LM&#39;s quality and factuality through retrieval and self-reflection. Our framework trains a single arbitrary LM that adaptively retrieves passages on-demand, and generates and reflects on retrieved passages and its own generations using special tokens, called reflection tokens. Generating reflection tokens makes the LM controllable during the inference phase, enabling it to tailor its behavior to diverse task requirements. Experiments show that Self-RAG (7B and 13B parameters) significantly outperforms state-of-the-art LLMs and retrieval-augmented models on a diverse set of tasks. 
Specifically, Self-RAG outperforms ChatGPT and retrieval-augmented Llama2-chat on Open-domain QA, reasoning and fact verification tasks, and it shows significant gains in improving factuality and citation accuracy for long-form generations relative to these models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.11511v1-abstract-full').style.display = 'none'; document.getElementById('2310.11511v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">30 pages, 2 figures, 12 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.07707">arXiv:2310.07707</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2310.07707">pdf</a>, <a href="https://arxiv.org/format/2310.07707">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MatFormer: Nested Transformer for Elastic Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Devvrit"> Devvrit</a>, <a href="/search/cs?searchtype=author&amp;query=Kudugunta%2C+S">Sneha Kudugunta</a>, <a href="/search/cs?searchtype=author&amp;query=Kusupati%2C+A">Aditya 
Kusupati</a>, <a href="/search/cs?searchtype=author&amp;query=Dettmers%2C+T">Tim Dettmers</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+K">Kaifeng Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Dhillon%2C+I">Inderjit Dhillon</a>, <a href="/search/cs?searchtype=author&amp;query=Tsvetkov%2C+Y">Yulia Tsvetkov</a>, <a href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a>, <a href="/search/cs?searchtype=author&amp;query=Kakade%2C+S">Sham Kakade</a>, <a href="/search/cs?searchtype=author&amp;query=Farhadi%2C+A">Ali Farhadi</a>, <a href="/search/cs?searchtype=author&amp;query=Jain%2C+P">Prateek Jain</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.07707v1-abstract-short" style="display: inline;"> Transformer models are deployed in a wide range of settings, from multi-accelerator clusters to standalone mobile phones. The diverse inference constraints in these scenarios necessitate practitioners to train foundation models such as PaLM 2, Llama, &amp; ViTs as a series of models of varying sizes. Due to significant training costs, only a select few model sizes are trained and supported, limiting m&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.07707v1-abstract-full').style.display = 'inline'; document.getElementById('2310.07707v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.07707v1-abstract-full" style="display: none;"> Transformer models are deployed in a wide range of settings, from multi-accelerator clusters to standalone mobile phones. The diverse inference constraints in these scenarios necessitate practitioners to train foundation models such as PaLM 2, Llama, &amp; ViTs as a series of models of varying sizes. 
Due to significant training costs, only a select few model sizes are trained and supported, limiting more fine-grained control over relevant tradeoffs, including latency, cost, and accuracy. This work introduces MatFormer, a nested Transformer architecture designed to offer elasticity in a variety of deployment constraints. Each Feed Forward Network (FFN) block of a MatFormer model is jointly optimized with a few nested smaller FFN blocks. This training procedure allows for the Mix&#39;n&#39;Match of model granularities across layers -- i.e., a trained universal MatFormer model enables extraction of hundreds of accurate smaller models, which were never explicitly optimized. We empirically demonstrate MatFormer&#39;s effectiveness across different model classes (decoders &amp; encoders), modalities (language &amp; vision), and scales (up to 2.6B parameters). We find that a 2.6B decoder-only MatFormer language model (MatLM) allows us to extract smaller models spanning from 1.5B to 2.6B, each exhibiting comparable validation loss and one-shot downstream evaluations to their independently trained counterparts. Furthermore, we observe that smaller encoders extracted from a universal MatFormer-based ViT (MatViT) encoder preserve the metric-space structure for adaptive large-scale retrieval. Finally, we showcase that speculative decoding with the accurate and consistent submodels extracted from MatFormer can further reduce inference latency. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.07707v1-abstract-full').style.display = 'none'; document.getElementById('2310.07707v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">31 pages, 12 figures, first three authors contributed equally</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.04921">arXiv:2310.04921</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2310.04921">pdf</a>, <a href="https://arxiv.org/format/2310.04921">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Crystal: Introspective Reasoners Reinforced with Self-Feedback </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jiacheng Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Pasunuru%2C+R">Ramakanth Pasunuru</a>, <a href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a>, <a href="/search/cs?searchtype=author&amp;query=Choi%2C+Y">Yejin Choi</a>, <a href="/search/cs?searchtype=author&amp;query=Celikyilmaz%2C+A">Asli Celikyilmaz</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.04921v2-abstract-short" style="display: inline;"> Extensive work has shown that the performance and interpretability of commonsense reasoning can be improved via knowledge-augmented reasoning methods, where the knowledge that underpins the reasoning process is explicitly verbalized and utilized. 
However, existing implementations, including &#34;chain-of-thought&#34; and its variants, fall short in capturing the introspective nature of knowledge required&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.04921v2-abstract-full').style.display = 'inline'; document.getElementById('2310.04921v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.04921v2-abstract-full" style="display: none;"> Extensive work has shown that the performance and interpretability of commonsense reasoning can be improved via knowledge-augmented reasoning methods, where the knowledge that underpins the reasoning process is explicitly verbalized and utilized. However, existing implementations, including &#34;chain-of-thought&#34; and its variants, fall short in capturing the introspective nature of knowledge required in commonsense reasoning, and in accounting for the mutual adaptation between the generation and utilization of knowledge. We propose a novel method to develop an introspective commonsense reasoner, Crystal. To tackle commonsense problems, it first introspects for knowledge statements related to the given question, and subsequently makes an informed prediction that is grounded in the previously introspected knowledge. The knowledge introspection and knowledge-grounded reasoning modes of the model are tuned via reinforcement learning to mutually adapt, where the reward derives from the feedback given by the model itself. Experiments show that Crystal significantly outperforms both the standard supervised finetuning and chain-of-thought distilled methods, and enhances the transparency of the commonsense reasoning process. Our work ultimately validates the feasibility and potential of reinforcing a neural model with self-feedback. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.04921v2-abstract-full').style.display = 'none'; document.getElementById('2310.04921v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">EMNLP 2023 main conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.02255">arXiv:2310.02255</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2310.02255">pdf</a>, <a href="https://arxiv.org/format/2310.02255">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> MathVista: Evaluating Mathematical Reasoning of Foundation Models in Visual Contexts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lu%2C+P">Pan Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Bansal%2C+H">Hritik Bansal</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+T">Tony Xia</a>, <a 
href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jiacheng Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chunyuan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+H">Hao Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+K">Kai-Wei Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Galley%2C+M">Michel Galley</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+J">Jianfeng Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.02255v3-abstract-short" style="display: inline;"> Large Language Models (LLMs) and Large Multimodal Models (LMMs) exhibit impressive problem-solving skills in many tasks and domains, but their ability in mathematical reasoning in visual contexts has not been systematically studied. To bridge this gap, we present MathVista, a benchmark designed to combine challenges from diverse mathematical and visual tasks. It consists of 6,141 examples, derived&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.02255v3-abstract-full').style.display = 'inline'; document.getElementById('2310.02255v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.02255v3-abstract-full" style="display: none;"> Large Language Models (LLMs) and Large Multimodal Models (LMMs) exhibit impressive problem-solving skills in many tasks and domains, but their ability in mathematical reasoning in visual contexts has not been systematically studied. To bridge this gap, we present MathVista, a benchmark designed to combine challenges from diverse mathematical and visual tasks. 
It consists of 6,141 examples, derived from 28 existing multimodal datasets involving mathematics and 3 newly created datasets (i.e., IQTest, FunctionQA, and PaperQA). Completing these tasks requires fine-grained, deep visual understanding and compositional reasoning, which all state-of-the-art foundation models find challenging. With MathVista, we have conducted a comprehensive, quantitative evaluation of 12 prominent foundation models. The best-performing GPT-4V model achieves an overall accuracy of 49.9%, substantially outperforming Bard, the second-best performer, by 15.1%. Our in-depth analysis reveals that the superiority of GPT-4V is mainly attributed to its enhanced visual perception and mathematical reasoning. However, GPT-4V still falls short of human performance by 10.4%, as it often struggles to understand complex figures and perform rigorous reasoning. This significant gap underscores the critical role that MathVista will play in the development of general-purpose AI agents capable of tackling mathematically intensive and visually rich real-world tasks. We further explore the new ability of self-verification, the application of self-consistency, and the interactive chatbot capabilities of GPT-4V, highlighting its promising potential for future research. The project is available at https://mathvista.github.io/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.02255v3-abstract-full').style.display = 'none'; document.getElementById('2310.02255v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">116 pages, 120 figures. Accepted to ICLR 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.01329">arXiv:2310.01329</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2310.01329">pdf</a>, <a href="https://arxiv.org/format/2310.01329">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> BTR: Binary Token Representations for Efficient Retrieval Augmented Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Cao%2C+Q">Qingqing Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Min%2C+S">Sewon Min</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yizhong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.01329v2-abstract-short" style="display: inline;"> Retrieval augmentation addresses many critical problems in large language models such as hallucination, staleness, and privacy leaks. However, running retrieval-augmented language models (LMs) is slow and difficult to scale due to processing large amounts of retrieved text. 
We introduce binary token representations (BTR), which use 1-bit vectors to precompute every token in passages, significantly&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.01329v2-abstract-full').style.display = 'inline'; document.getElementById('2310.01329v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.01329v2-abstract-full" style="display: none;"> Retrieval augmentation addresses many critical problems in large language models such as hallucination, staleness, and privacy leaks. However, running retrieval-augmented language models (LMs) is slow and difficult to scale due to processing large amounts of retrieved text. We introduce binary token representations (BTR), which use 1-bit vectors to precompute every token in passages, significantly reducing computation during inference. Despite the potential loss of accuracy, our new calibration techniques and training objectives restore performance. Combined with offline and runtime compression, this only requires 127GB of disk space for encoding 3 billion tokens in Wikipedia. Our experiments show that on five knowledge-intensive NLP tasks, BTR accelerates state-of-the-art inference by up to 4x and reduces storage by over 100x while maintaining over 95% task performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.01329v2-abstract-full').style.display = 'none'; document.getElementById('2310.01329v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICLR 2024 camera-ready version</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.15028">arXiv:2309.15028</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2309.15028">pdf</a>, <a href="https://arxiv.org/format/2309.15028">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Don&#39;t throw away your value model! Generating more preferable text with Value-Guided Monte-Carlo Tree Search decoding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jiacheng Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Cohen%2C+A">Andrew Cohen</a>, <a href="/search/cs?searchtype=author&amp;query=Pasunuru%2C+R">Ramakanth Pasunuru</a>, <a href="/search/cs?searchtype=author&amp;query=Choi%2C+Y">Yejin Choi</a>, <a href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a>, <a href="/search/cs?searchtype=author&amp;query=Celikyilmaz%2C+A">Asli Celikyilmaz</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.15028v3-abstract-short" style="display: inline;"> Inference-time search algorithms such as Monte-Carlo Tree Search (MCTS) may seem unnecessary when generating natural language text based on state-of-the-art reinforcement learning such as 
Proximal Policy Optimization (PPO). In this paper, we demonstrate that it is possible to get extra mileage out of PPO by integrating MCTS on top. The key idea is not to throw out the value network, a byproduct of&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.15028v3-abstract-full').style.display = 'inline'; document.getElementById('2309.15028v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.15028v3-abstract-full" style="display: none;"> Inference-time search algorithms such as Monte-Carlo Tree Search (MCTS) may seem unnecessary when generating natural language text based on state-of-the-art reinforcement learning such as Proximal Policy Optimization (PPO). In this paper, we demonstrate that it is possible to get extra mileage out of PPO by integrating MCTS on top. The key idea is not to throw out the value network, a byproduct of PPO training for evaluating partial output sequences, when decoding text out of the policy network. More concretely, we present a novel value-guided decoding algorithm called PPO-MCTS, which can integrate the value network from PPO to work closely with the policy network during inference-time generation. Compared to prior approaches based on MCTS for controlled text generation, the key strength of our approach is to reduce the fundamental mismatch of the scoring mechanisms of the partial outputs between training and test. Evaluation on four text generation tasks demonstrate that PPO-MCTS greatly improves the preferability of generated text compared to the standard practice of using only the PPO policy. Our results demonstrate the promise of search algorithms even on top of the aligned language models from PPO, and the under-explored benefit of the value network. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.15028v3-abstract-full').style.display = 'none'; document.getElementById('2309.15028v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.04430">arXiv:2308.04430</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2308.04430">pdf</a>, <a href="https://arxiv.org/format/2308.04430">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> SILO Language Models: Isolating Legal Risk In a Nonparametric Datastore </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Min%2C+S">Sewon Min</a>, <a href="/search/cs?searchtype=author&amp;query=Gururangan%2C+S">Suchin Gururangan</a>, <a href="/search/cs?searchtype=author&amp;query=Wallace%2C+E">Eric Wallace</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+W">Weijia Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+N+A">Noah A. 
Smith</a>, <a href="/search/cs?searchtype=author&amp;query=Zettlemoyer%2C+L">Luke Zettlemoyer</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.04430v2-abstract-short" style="display: inline;"> The legality of training language models (LMs) on copyrighted or otherwise restricted data is under intense debate. However, as we show, model performance significantly degrades if trained only on low-risk text (e.g., out-of-copyright books or government documents), due to its limited size and domain coverage. We present SILO, a new language model that manages this risk-performance tradeoff during&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.04430v2-abstract-full').style.display = 'inline'; document.getElementById('2308.04430v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.04430v2-abstract-full" style="display: none;"> The legality of training language models (LMs) on copyrighted or otherwise restricted data is under intense debate. However, as we show, model performance significantly degrades if trained only on low-risk text (e.g., out-of-copyright books or government documents), due to its limited size and domain coverage. We present SILO, a new language model that manages this risk-performance tradeoff during inference. SILO is built by (1) training a parametric LM on Open License Corpus (OLC), a new corpus we curate with 228B tokens of public domain and permissively licensed text and (2) augmenting it with a more general and easily modifiable nonparametric datastore (e.g., containing copyrighted books or news) that is only queried during inference. 
The datastore allows use of high-risk data without training on it, supports sentence-level data attribution, and enables data producers to opt out from the model by removing content from the store. These capabilities can foster compliance with data-use regulations such as the fair use doctrine in the United States and the GDPR in the European Union. Our experiments show that the parametric LM struggles on domains not covered by OLC. However, access to the datastore greatly improves out of domain performance, closing 90% of the performance gap with an LM trained on the Pile, a more diverse corpus with mostly high-risk text. We also analyze which nonparametric approach works best, where the remaining errors lie, and how performance scales with datastore size. Our results suggest that it is possible to build high quality language models while mitigating their legal risk. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.04430v2-abstract-full').style.display = 'none'; document.getElementById('2308.04430v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">29 pages; 7 figures. Published as a conference paper at ICLR 2024 (spotlight). 
Code, models, and data available at https://github.com/kernelmachine/silo-lm</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.09701">arXiv:2307.09701</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2307.09701">pdf</a>, <a href="https://arxiv.org/format/2307.09701">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Efficiency Pentathlon: A Standardized Arena for Efficiency Evaluation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Peng%2C+H">Hao Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+Q">Qingqing Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Dodge%2C+J">Jesse Dodge</a>, <a href="/search/cs?searchtype=author&amp;query=Peters%2C+M+E">Matthew E. Peters</a>, <a href="/search/cs?searchtype=author&amp;query=Fernandez%2C+J">Jared Fernandez</a>, <a href="/search/cs?searchtype=author&amp;query=Sherborne%2C+T">Tom Sherborne</a>, <a href="/search/cs?searchtype=author&amp;query=Lo%2C+K">Kyle Lo</a>, <a href="/search/cs?searchtype=author&amp;query=Skjonsberg%2C+S">Sam Skjonsberg</a>, <a href="/search/cs?searchtype=author&amp;query=Strubell%2C+E">Emma Strubell</a>, <a href="/search/cs?searchtype=author&amp;query=Plessas%2C+D">Darrell Plessas</a>, <a href="/search/cs?searchtype=author&amp;query=Beltagy%2C+I">Iz Beltagy</a>, <a href="/search/cs?searchtype=author&amp;query=Walsh%2C+E+P">Evan Pete Walsh</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+N+A">Noah A. 
Smith</a>, <a href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.09701v1-abstract-short" style="display: inline;"> Rising computational demands of modern natural language processing (NLP) systems have increased the barrier to entry for cutting-edge research while posing serious environmental concerns. Yet, progress on model efficiency has been impeded by practical challenges in model evaluation and comparison. For example, hardware is challenging to control due to disparate levels of accessibility across diffe&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.09701v1-abstract-full').style.display = 'inline'; document.getElementById('2307.09701v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.09701v1-abstract-full" style="display: none;"> Rising computational demands of modern natural language processing (NLP) systems have increased the barrier to entry for cutting-edge research while posing serious environmental concerns. Yet, progress on model efficiency has been impeded by practical challenges in model evaluation and comparison. For example, hardware is challenging to control due to disparate levels of accessibility across different institutions. Moreover, improvements in metrics such as FLOPs often fail to translate to progress in real-world applications. In response, we introduce Pentathlon, a benchmark for holistic and realistic evaluation of model efficiency. Pentathlon focuses on inference, which accounts for a majority of the compute in a model&#39;s lifecycle. It offers a strictly-controlled hardware platform, and is designed to mirror real-world applications scenarios. 
It incorporates a suite of metrics that target different aspects of efficiency, including latency, throughput, memory overhead, and energy consumption. Pentathlon also comes with a software library that can be seamlessly integrated into any codebase and enable evaluation. As a standardized and centralized evaluation platform, Pentathlon can drastically reduce the workload to make fair and reproducible efficiency comparisons. While initially focused on natural language processing (NLP) models, Pentathlon is designed to allow flexible extension to other fields. We envision Pentathlon will stimulate algorithmic innovations in building efficient models, and foster an increased awareness of the social and environmental implications in the development of future-generation NLP models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.09701v1-abstract-full').style.display = 'none'; document.getElementById('2307.09701v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.04751">arXiv:2306.04751</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2306.04751">pdf</a>, <a href="https://arxiv.org/format/2306.04751">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> How Far Can Camels Go? 
Exploring the State of Instruction Tuning on Open Resources </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yizhong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Ivison%2C+H">Hamish Ivison</a>, <a href="/search/cs?searchtype=author&amp;query=Dasigi%2C+P">Pradeep Dasigi</a>, <a href="/search/cs?searchtype=author&amp;query=Hessel%2C+J">Jack Hessel</a>, <a href="/search/cs?searchtype=author&amp;query=Khot%2C+T">Tushar Khot</a>, <a href="/search/cs?searchtype=author&amp;query=Chandu%2C+K+R">Khyathi Raghavi Chandu</a>, <a href="/search/cs?searchtype=author&amp;query=Wadden%2C+D">David Wadden</a>, <a href="/search/cs?searchtype=author&amp;query=MacMillan%2C+K">Kelsey MacMillan</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+N+A">Noah A. Smith</a>, <a href="/search/cs?searchtype=author&amp;query=Beltagy%2C+I">Iz Beltagy</a>, <a href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.04751v2-abstract-short" style="display: inline;"> In this work we explore recent advances in instruction-tuning language models on a range of open instruction-following datasets. Despite recent claims that open models can be on par with state-of-the-art proprietary models, these claims are often accompanied by limited evaluation, making it difficult to compare models across the board and determine the utility of various resources. 
We provide a la&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.04751v2-abstract-full').style.display = 'inline'; document.getElementById('2306.04751v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.04751v2-abstract-full" style="display: none;"> In this work we explore recent advances in instruction-tuning language models on a range of open instruction-following datasets. Despite recent claims that open models can be on par with state-of-the-art proprietary models, these claims are often accompanied by limited evaluation, making it difficult to compare models across the board and determine the utility of various resources. We provide a large set of instruction-tuned models from 6.7B to 65B parameters in size, trained on 12 instruction datasets ranging from manually curated (e.g., OpenAssistant) to synthetic and distilled (e.g., Alpaca) and systematically evaluate them on their factual knowledge, reasoning, multilinguality, coding, and open-ended instruction following abilities through a collection of automatic, model-based, and human-based metrics. We further introduce Tülu, our best performing instruction-tuned model suite finetuned on a combination of high-quality open resources. Our experiments show that different instruction-tuning datasets can uncover or enhance specific skills, while no single dataset (or combination) provides the best performance across all evaluations. Interestingly, we find that model and human preference-based evaluations fail to reflect differences in model capabilities exposed by benchmark-based evaluations, suggesting the need for the type of systemic evaluation performed in this work. 
Our evaluations show that the best model in any given evaluation reaches on average 87% of ChatGPT performance, and 73% of GPT-4 performance, suggesting that further investment in building better base models and instruction-tuning data is required to close the gap. We release our instruction-tuned models, including a fully finetuned 65B Tülu, along with our code, data, and evaluation framework at https://github.com/allenai/open-instruct to facilitate future research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.04751v2-abstract-full').style.display = 'none'; document.getElementById('2306.04751v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 6 figure, 10 tables. 
NeurIPS 2023 Datasets and Benchmarks Track Camera Ready</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.01693">arXiv:2306.01693</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2306.01693">pdf</a>, <a href="https://arxiv.org/format/2306.01693">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Fine-Grained Human Feedback Gives Better Rewards for Language Model Training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Z">Zeqiu Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+Y">Yushi Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+W">Weijia Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Dziri%2C+N">Nouha Dziri</a>, <a href="/search/cs?searchtype=author&amp;query=Suhr%2C+A">Alane Suhr</a>, <a href="/search/cs?searchtype=author&amp;query=Ammanabrolu%2C+P">Prithviraj Ammanabrolu</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+N+A">Noah A. Smith</a>, <a href="/search/cs?searchtype=author&amp;query=Ostendorf%2C+M">Mari Ostendorf</a>, <a href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.01693v2-abstract-short" style="display: inline;"> Language models (LMs) often exhibit undesirable text generation behaviors, including generating false, toxic, or irrelevant outputs. Reinforcement learning from human feedback (RLHF) - where human preference judgments on LM outputs are transformed into a learning signal - has recently shown promise in addressing these issues. 
However, such holistic feedback conveys limited information on long text&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.01693v2-abstract-full').style.display = 'inline'; document.getElementById('2306.01693v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.01693v2-abstract-full" style="display: none;"> Language models (LMs) often exhibit undesirable text generation behaviors, including generating false, toxic, or irrelevant outputs. Reinforcement learning from human feedback (RLHF) - where human preference judgments on LM outputs are transformed into a learning signal - has recently shown promise in addressing these issues. However, such holistic feedback conveys limited information on long text outputs; it does not indicate which aspects of the outputs influenced user preference; e.g., which parts contain what type(s) of errors. In this paper, we use fine-grained human feedback (e.g., which sentence is false, which sub-sentence is irrelevant) as an explicit training signal. We introduce Fine-Grained RLHF, a framework that enables training and learning from reward functions that are fine-grained in two respects: (1) density, providing a reward after every segment (e.g., a sentence) is generated; and (2) incorporating multiple reward models associated with different feedback types (e.g., factual incorrectness, irrelevance, and information incompleteness). We conduct experiments on detoxification and long-form question answering to illustrate how learning with such reward functions leads to improved performance, supported by both automatic and human evaluation. Additionally, we show that LM behaviors can be customized using different combinations of fine-grained reward models. We release all data, collected human feedback, and codes at https://FineGrainedRLHF.github.io. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.01693v2-abstract-full').style.display = 'none'; document.getElementById('2306.01693v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2023 camera-ready</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.17530">arXiv:2305.17530</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.17530">pdf</a>, <a href="https://arxiv.org/format/2305.17530">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> PuMer: Pruning and Merging Tokens for Efficient Vision Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Cao%2C+Q">Qingqing Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Paranjape%2C+B">Bhargavi Paranjape</a>, <a href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short 
has-text-grey-dark mathjax" id="2305.17530v1-abstract-short" style="display: inline;"> Large-scale vision language (VL) models use Transformers to perform cross-modal interactions between the input text and image. These cross-modal interactions are computationally expensive and memory-intensive due to the quadratic complexity of processing the input image and text. We present PuMer: a token reduction framework that uses text-informed Pruning and modality-aware Merging strategies to&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.17530v1-abstract-full').style.display = 'inline'; document.getElementById('2305.17530v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.17530v1-abstract-full" style="display: none;"> Large-scale vision language (VL) models use Transformers to perform cross-modal interactions between the input text and image. These cross-modal interactions are computationally expensive and memory-intensive due to the quadratic complexity of processing the input image and text. We present PuMer: a token reduction framework that uses text-informed Pruning and modality-aware Merging strategies to progressively reduce the tokens of input image and text, improving model inference speed and reducing memory footprint. PuMer learns to keep salient image tokens related to the input text and merges similar textual and visual tokens by adding lightweight token reducer modules at several cross-modal layers in the VL model. Training PuMer is mostly the same as finetuning the original VL model but faster. Our evaluation for two vision language models on four downstream VL tasks shows PuMer increases inference throughput by up to 2x and reduces memory footprint by over 50% while incurring less than a 1% accuracy drop. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.17530v1-abstract-full').style.display = 'none'; document.getElementById('2305.17530v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ACL 2023 Main Conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.14857">arXiv:2305.14857</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.14857">pdf</a>, <a href="https://arxiv.org/format/2305.14857">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> BUFFET: Benchmarking Large Language Models for Few-shot Cross-lingual Transfer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Asai%2C+A">Akari Asai</a>, <a href="/search/cs?searchtype=author&amp;query=Kudugunta%2C+S">Sneha Kudugunta</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+X+V">Xinyan Velocity Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Blevins%2C+T">Terra Blevins</a>, <a href="/search/cs?searchtype=author&amp;query=Gonen%2C+H">Hila Gonen</a>, <a href="/search/cs?searchtype=author&amp;query=Reid%2C+M">Machel Reid</a>, <a href="/search/cs?searchtype=author&amp;query=Tsvetkov%2C+Y">Yulia Tsvetkov</a>, <a href="/search/cs?searchtype=author&amp;query=Ruder%2C+S">Sebastian Ruder</a>, <a 
href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.14857v1-abstract-short" style="display: inline;"> Despite remarkable advancements in few-shot generalization in natural language processing, most models are developed and evaluated primarily in English. To facilitate research on few-shot cross-lingual transfer, we introduce a new benchmark, called BUFFET, which unifies 15 diverse tasks across 54 languages in a sequence-to-sequence format and provides a fixed set of few-shot examples and instructi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.14857v1-abstract-full').style.display = 'inline'; document.getElementById('2305.14857v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.14857v1-abstract-full" style="display: none;"> Despite remarkable advancements in few-shot generalization in natural language processing, most models are developed and evaluated primarily in English. To facilitate research on few-shot cross-lingual transfer, we introduce a new benchmark, called BUFFET, which unifies 15 diverse tasks across 54 languages in a sequence-to-sequence format and provides a fixed set of few-shot examples and instructions. BUFFET is designed to establish a rigorous and equitable evaluation framework for few-shot cross-lingual transfer across a broad range of tasks and languages. Using BUFFET, we perform thorough evaluations of state-of-the-art multilingual large language models with different transfer methods, namely in-context learning and fine-tuning. Our findings reveal significant room for improvement in few-shot in-context cross-lingual transfer. 
In particular, ChatGPT with in-context learning often performs worse than much smaller mT5-base models fine-tuned on English task data and few-shot in-language examples. Our analysis suggests various avenues for future research in few-shot cross-lingual transfer, such as improved pretraining, understanding, and future evaluations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.14857v1-abstract-full').style.display = 'none'; document.getElementById('2305.14857v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The data and code is available at https://buffetfs.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.14815">arXiv:2305.14815</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.14815">pdf</a>, <a href="https://arxiv.org/format/2305.14815">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Machine Reading Comprehension using Case-based Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Thai%2C+D">Dung Thai</a>, <a href="/search/cs?searchtype=author&amp;query=Agarwal%2C+D">Dhruv Agarwal</a>, <a href="/search/cs?searchtype=author&amp;query=Chaudhary%2C+M">Mudit Chaudhary</a>, <a 
href="/search/cs?searchtype=author&amp;query=Zhao%2C+W">Wenlong Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Das%2C+R">Rajarshi Das</a>, <a href="/search/cs?searchtype=author&amp;query=Zaheer%2C+M">Manzil Zaheer</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+J">Jay-Yoon Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a>, <a href="/search/cs?searchtype=author&amp;query=McCallum%2C+A">Andrew McCallum</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.14815v4-abstract-short" style="display: inline;"> We present an accurate and interpretable method for answer extraction in machine reading comprehension that is reminiscent of case-based reasoning (CBR) from classical AI. Our method (CBR-MRC) builds upon the hypothesis that contextualized answers to similar questions share semantic similarities with each other. Given a test question, CBR-MRC first retrieves a set of similar cases from a nonparame&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.14815v4-abstract-full').style.display = 'inline'; document.getElementById('2305.14815v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.14815v4-abstract-full" style="display: none;"> We present an accurate and interpretable method for answer extraction in machine reading comprehension that is reminiscent of case-based reasoning (CBR) from classical AI. Our method (CBR-MRC) builds upon the hypothesis that contextualized answers to similar questions share semantic similarities with each other. 
Given a test question, CBR-MRC first retrieves a set of similar cases from a nonparametric memory and then predicts an answer by selecting the span in the test context that is most similar to the contextualized representations of answers in the retrieved cases. The semi-parametric nature of our approach allows it to attribute a prediction to the specific set of evidence cases, making it a desirable choice for building reliable and debuggable QA systems. We show that CBR-MRC provides high accuracy comparable with large reader models and outperforms baselines by 11.5 and 8.4 EM on NaturalQuestions and NewsQA, respectively. Further, we demonstrate the ability of CBR-MRC in identifying not just the correct answer tokens but also the span with the most relevant supporting evidence. Lastly, we observe that contexts for certain question types show higher lexical diversity than others and find that CBR-MRC is robust to these variations while performance using fully-parametric methods drops. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.14815v4-abstract-full').style.display = 'none'; document.getElementById('2305.14815v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 2 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.14251">arXiv:2305.14251</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.14251">pdf</a>, <a href="https://arxiv.org/format/2305.14251">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> FActScore: Fine-grained Atomic Evaluation of Factual Precision in Long Form Text Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Min%2C+S">Sewon Min</a>, <a href="/search/cs?searchtype=author&amp;query=Krishna%2C+K">Kalpesh Krishna</a>, <a href="/search/cs?searchtype=author&amp;query=Lyu%2C+X">Xinxi Lyu</a>, <a href="/search/cs?searchtype=author&amp;query=Lewis%2C+M">Mike Lewis</a>, <a href="/search/cs?searchtype=author&amp;query=Yih%2C+W">Wen-tau Yih</a>, <a href="/search/cs?searchtype=author&amp;query=Koh%2C+P+W">Pang Wei Koh</a>, <a href="/search/cs?searchtype=author&amp;query=Iyyer%2C+M">Mohit Iyyer</a>, <a href="/search/cs?searchtype=author&amp;query=Zettlemoyer%2C+L">Luke Zettlemoyer</a>, <a href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.14251v2-abstract-short" style="display: inline;"> Evaluating the 
factuality of long-form text generated by large language models (LMs) is non-trivial because (1) generations often contain a mixture of supported and unsupported pieces of information, making binary judgments of quality inadequate, and (2) human evaluation is time-consuming and costly. In this paper, we introduce FACTSCORE, a new evaluation that breaks a generation into a series of&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.14251v2-abstract-full').style.display = 'inline'; document.getElementById('2305.14251v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.14251v2-abstract-full" style="display: none;"> Evaluating the factuality of long-form text generated by large language models (LMs) is non-trivial because (1) generations often contain a mixture of supported and unsupported pieces of information, making binary judgments of quality inadequate, and (2) human evaluation is time-consuming and costly. In this paper, we introduce FACTSCORE, a new evaluation that breaks a generation into a series of atomic facts and computes the percentage of atomic facts supported by a reliable knowledge source. We conduct an extensive human evaluation to obtain FACTSCOREs of people biographies generated by several state-of-the-art commercial LMs -- InstructGPT, ChatGPT, and the retrieval-augmented PerplexityAI -- and report new analysis demonstrating the need for such a fine-grained score (e.g., ChatGPT only achieves 58%). Since human evaluation is costly, we also introduce an automated model that estimates FACTSCORE using retrieval and a strong language model, with less than a 2% error rate. 
Finally, we use this automated metric to evaluate 6,500 generations from a new set of 13 recent LMs that would have cost $26K if evaluated by humans, with various findings: GPT-4 and ChatGPT are more factual than public models, and Vicuna and Alpaca are some of the best public models. FACTSCORE is available for public use via `pip install factscore`. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.14251v2-abstract-full').style.display = 'none'; document.getElementById('2305.14251v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">25 pages; 7 figures. Published as a main conference paper at EMNLP 2023. 
Code available at https://github.com/shmsw25/FActScore</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.13256">arXiv:2305.13256</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.13256">pdf</a>, <a href="https://arxiv.org/format/2305.13256">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> TaskWeb: Selecting Better Source Tasks for Multi-task NLP </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kim%2C+J">Joongwon Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Asai%2C+A">Akari Asai</a>, <a href="/search/cs?searchtype=author&amp;query=Ilharco%2C+G">Gabriel Ilharco</a>, <a href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.13256v2-abstract-short" style="display: inline;"> Recent work in NLP has shown promising results in training models on large amounts of tasks to achieve better generalization. However, it is not well-understood how tasks are related, and how helpful training tasks can be chosen for a new task. 
In this work, we investigate whether knowing task relationships via pairwise task transfer improves choosing one or more source tasks that help to learn a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.13256v2-abstract-full').style.display = 'inline'; document.getElementById('2305.13256v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.13256v2-abstract-full" style="display: none;"> Recent work in NLP has shown promising results in training models on large amounts of tasks to achieve better generalization. However, it is not well-understood how tasks are related, and how helpful training tasks can be chosen for a new task. In this work, we investigate whether knowing task relationships via pairwise task transfer improves choosing one or more source tasks that help to learn a new target task. We provide TaskWeb, a large-scale benchmark of pairwise task transfers for 22 NLP tasks using three different model types, sizes, and adaptation methods, spanning about 25,000 experiments. Then, we design a new method TaskShop based on our analysis of TaskWeb. TaskShop uses TaskWeb to estimate the benefit of using a source task for learning a new target task, and to choose a subset of helpful training tasks for multi-task training. Our method improves overall rankings and top-k precision of source tasks by 10% and 38%, respectively. We also use TaskShop to build much smaller multi-task training sets that improve zero-shot performances across 11 different target tasks by at least 4.3%. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.13256v2-abstract-full').style.display = 'none'; document.getElementById('2305.13256v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">21 pages, 14 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.11744">arXiv:2305.11744</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.11744">pdf</a>, <a href="https://arxiv.org/format/2305.11744">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> ReFIT: Relevance Feedback from a Reranker during Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Reddy%2C+R+G">Revanth Gangi Reddy</a>, <a href="/search/cs?searchtype=author&amp;query=Dasigi%2C+P">Pradeep Dasigi</a>, <a href="/search/cs?searchtype=author&amp;query=Sultan%2C+M+A">Md Arafat Sultan</a>, <a href="/search/cs?searchtype=author&amp;query=Cohan%2C+A">Arman Cohan</a>, <a href="/search/cs?searchtype=author&amp;query=Sil%2C+A">Avirup Sil</a>, <a href="/search/cs?searchtype=author&amp;query=Ji%2C+H">Heng Ji</a>, <a 
href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.11744v2-abstract-short" style="display: inline;"> Retrieve-and-rerank is a prevalent framework in neural information retrieval, wherein a bi-encoder network initially retrieves a pre-defined number of candidates (e.g., K=100), which are then reranked by a more powerful cross-encoder model. While the reranker often yields improved candidate scores compared to the retriever, its scope is confined to only the top K retrieved candidates. As a result,&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.11744v2-abstract-full').style.display = 'inline'; document.getElementById('2305.11744v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.11744v2-abstract-full" style="display: none;"> Retrieve-and-rerank is a prevalent framework in neural information retrieval, wherein a bi-encoder network initially retrieves a pre-defined number of candidates (e.g., K=100), which are then reranked by a more powerful cross-encoder model. While the reranker often yields improved candidate scores compared to the retriever, its scope is confined to only the top K retrieved candidates. As a result, the reranker cannot improve retrieval performance in terms of Recall@K. In this work, we propose to leverage the reranker to improve recall by making it provide relevance feedback to the retriever at inference time. Specifically, given a test instance during inference, we distill the reranker&#39;s predictions for that instance into the retriever&#39;s query representation using a lightweight update mechanism. 
The aim of the distillation loss is to align the retriever&#39;s candidate scores more closely with those produced by the reranker. The algorithm then proceeds by executing a second retrieval step using the updated query vector. We empirically demonstrate that this method, applicable to various retrieve-and-rerank frameworks, substantially enhances retrieval recall across multiple domains, languages, and modalities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.11744v2-abstract-full').style.display = 'none'; document.getElementById('2305.11744v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preprint</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Hajishirzi%2C+H&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Hajishirzi%2C+H&amp;start=0" class="pagination-link is-current" aria-label="Page 1" aria-current="page">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Hajishirzi%2C+H&amp;start=50" class="pagination-link " aria-label="Goto page 2">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Hajishirzi%2C+H&amp;start=100" class="pagination-link " aria-label="Goto page 3">3 </a> </li> <li> <a 
href="/search/?searchtype=author&amp;query=Hajishirzi%2C+H&amp;start=150" class="pagination-link " aria-label="Goto page 4">4 </a> </li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> 
Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 
47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

Pages: 1 2 3 4 5 6 7 8 9 10