
Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 207 results for author: <span class="mathjax">Zettlemoyer, L</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Zettlemoyer%2C+L">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Zettlemoyer, L"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Zettlemoyer%2C+L&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Zettlemoyer, L"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Zettlemoyer%2C+L&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Zettlemoyer%2C+L&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zettlemoyer%2C+L&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zettlemoyer%2C+L&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zettlemoyer%2C+L&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zettlemoyer%2C+L&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14199">arXiv:2411.14199</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.14199">pdf</a>, <a href="https://arxiv.org/format/2411.14199">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Digital Libraries">cs.DL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> OpenScholar: Synthesizing Scientific Literature with Retrieval-augmented LMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Asai%2C+A">Akari Asai</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+J">Jacqueline He</a>, <a href="/search/cs?searchtype=author&amp;query=Shao%2C+R">Rulin Shao</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+W">Weijia Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Singh%2C+A">Amanpreet Singh</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+J+C">Joseph Chee Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Lo%2C+K">Kyle Lo</a>, <a href="/search/cs?searchtype=author&amp;query=Soldaini%2C+L">Luca Soldaini</a>, <a href="/search/cs?searchtype=author&amp;query=Feldman%2C+S">Sergey Feldman</a>, <a 
href="/search/cs?searchtype=author&amp;query=D%27arcy%2C+M">Mike D&#39;arcy</a>, <a href="/search/cs?searchtype=author&amp;query=Wadden%2C+D">David Wadden</a>, <a href="/search/cs?searchtype=author&amp;query=Latzke%2C+M">Matt Latzke</a>, <a href="/search/cs?searchtype=author&amp;query=Tian%2C+M">Minyang Tian</a>, <a href="/search/cs?searchtype=author&amp;query=Ji%2C+P">Pan Ji</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+S">Shengyan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Tong%2C+H">Hao Tong</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+B">Bohao Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Xiong%2C+Y">Yanyu Xiong</a>, <a href="/search/cs?searchtype=author&amp;query=Zettlemoyer%2C+L">Luke Zettlemoyer</a>, <a href="/search/cs?searchtype=author&amp;query=Neubig%2C+G">Graham Neubig</a>, <a href="/search/cs?searchtype=author&amp;query=Weld%2C+D">Dan Weld</a>, <a href="/search/cs?searchtype=author&amp;query=Downey%2C+D">Doug Downey</a>, <a href="/search/cs?searchtype=author&amp;query=Yih%2C+W">Wen-tau Yih</a>, <a href="/search/cs?searchtype=author&amp;query=Koh%2C+P+W">Pang Wei Koh</a>, <a href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14199v1-abstract-short" style="display: inline;"> Scientific progress depends on researchers&#39; ability to synthesize the growing body of literature. Can large language models (LMs) assist scientists in this task? We introduce OpenScholar, a specialized retrieval-augmented LM that answers scientific queries by identifying relevant passages from 45 million open-access papers and synthesizing citation-backed responses. To evaluate OpenScholar, we dev&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14199v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14199v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14199v1-abstract-full" style="display: none;"> Scientific progress depends on researchers&#39; ability to synthesize the growing body of literature. Can large language models (LMs) assist scientists in this task? We introduce OpenScholar, a specialized retrieval-augmented LM that answers scientific queries by identifying relevant passages from 45 million open-access papers and synthesizing citation-backed responses. To evaluate OpenScholar, we develop ScholarQABench, the first large-scale multi-domain benchmark for literature search, comprising 2,967 expert-written queries and 208 long-form answers across computer science, physics, neuroscience, and biomedicine. On ScholarQABench, OpenScholar-8B outperforms GPT-4o by 5% and PaperQA2 by 7% in correctness, despite being a smaller, open model. While GPT4o hallucinates citations 78 to 90% of the time, OpenScholar achieves citation accuracy on par with human experts. OpenScholar&#39;s datastore, retriever, and self-feedback inference loop also improves off-the-shelf LMs: for instance, OpenScholar-GPT4o improves GPT-4o&#39;s correctness by 12%. In human evaluations, experts preferred OpenScholar-8B and OpenScholar-GPT4o responses over expert-written ones 51% and 70% of the time, respectively, compared to GPT4o&#39;s 32%. We open-source all of our code, models, datastore, data and a public demo. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14199v1-abstract-full').style.display = 'none'; document.getElementById('2411.14199v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05877">arXiv:2411.05877</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.05877">pdf</a>, <a href="https://arxiv.org/format/2411.05877">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Generative Adapter: Contextualizing Language Models in Parameters with A Single Forward Pass </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+T">Tong Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Fang%2C+H">Hao Fang</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+P">Patrick Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xiaodong Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Van+Durme%2C+B">Benjamin Van Durme</a>, <a href="/search/cs?searchtype=author&amp;query=Zettlemoyer%2C+L">Luke Zettlemoyer</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+J">Jianfeng Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+H">Hao Cheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05877v1-abstract-short" style="display: inline;"> Large language models (LMs) are typically adapted to improve performance on new contexts (\eg text prompts that define new tasks or domains) through fine-tuning or prompting. However, there is an accuracy compute tradeoff -- fine-tuning incurs significant training cost and prompting increases inference overhead. We introduce $GenerativeAdapter$, an effective and efficient adaptation method that di&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05877v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05877v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05877v1-abstract-full" style="display: none;"> Large language models (LMs) are typically adapted to improve performance on new contexts (\eg text prompts that define new tasks or domains) through fine-tuning or prompting. However, there is an accuracy compute tradeoff -- fine-tuning incurs significant training cost and prompting increases inference overhead. 
   We introduce GenerativeAdapter, an effective and efficient adaptation method that directly maps new contexts to low-rank LM adapters, thereby significantly reducing inference overhead with no need for fine-tuning. The adapter generator is trained via self-supervised learning, and can be used to adapt a single frozen LM to any new task simply by mapping the associated task or domain context to a new adapter. We apply GenerativeAdapter to two pretrained LMs (Mistral-7B-Instruct and Llama2-7B-Chat) and evaluate the adapted models in three adaptation scenarios: knowledge acquisition from documents, learning from demonstrations, and personalization for users. In StreamingQA, our approach is effective in injecting knowledge into the LM's parameters, achieving a 63.5% improvement in F1 score over the model with supervised fine-tuning (from 19.5 to 31.5) for contexts as long as 32K tokens. In the MetaICL in-context learning evaluation, our method achieves an average accuracy of 44.9 across 26 tasks, outperforming the base model. On MSC, our method proves highly competitive in memorizing user information from conversations, with a 4x reduction in computation and memory costs compared to prompting with the full conversation history. Together, these results suggest that GenerativeAdapter should allow for general adaptation to a wide range of different contexts.
   Submitted 7 November, 2024; originally announced November 2024.
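A rough sketch of the core idea: map a context encoding to a low-rank weight update for a frozen layer in a single forward pass. The mean-pooling, generator layout, and dimensions below are assumptions for illustration, not the paper's actual generator architecture.

```python
# Sketch: generate a rank-r adapter (LoRA-style A and B factors) from a
# context in one forward pass, then apply it on top of a frozen layer.
import torch
import torch.nn as nn

class AdapterGenerator(nn.Module):
    def __init__(self, d_model: int, rank: int = 8):
        super().__init__()
        self.to_a = nn.Linear(d_model, d_model * rank)  # emits factor A
        self.to_b = nn.Linear(d_model, d_model * rank)  # emits factor B
        self.rank, self.d = rank, d_model

    def forward(self, ctx_hidden: torch.Tensor):
        pooled = ctx_hidden.mean(dim=1)          # (batch, d): single pass over context
        a = self.to_a(pooled).view(-1, self.rank, self.d)
        b = self.to_b(pooled).view(-1, self.d, self.rank)
        return a, b                              # delta_W = b @ a, a rank-r update

frozen = nn.Linear(64, 64)                       # stands in for a frozen LM weight
gen = AdapterGenerator(d_model=64)
ctx = torch.randn(1, 128, 64)                    # encoded context of 128 tokens
a, b = gen(ctx)
x = torch.randn(1, 64)
adapted = frozen(x) + x @ (b[0] @ a[0]).T        # frozen output + low-rank delta
print(adapted.shape)                             # torch.Size([1, 64])
```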
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04996">arXiv:2411.04996</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.04996">pdf</a>, <a href="https://arxiv.org/format/2411.04996">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Mixture-of-Transformers: A Sparse and Scalable Architecture for Multi-Modal Foundation Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liang%2C+W">Weixin Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+L">Lili Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+L">Liang Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Iyer%2C+S">Srinivasan Iyer</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+N">Ning Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+C">Chunting Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Ghosh%2C+G">Gargi Ghosh</a>, <a href="/search/cs?searchtype=author&amp;query=Lewis%2C+M">Mike Lewis</a>, <a href="/search/cs?searchtype=author&amp;query=Yih%2C+W">Wen-tau Yih</a>, <a href="/search/cs?searchtype=author&amp;query=Zettlemoyer%2C+L">Luke Zettlemoyer</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+X+V">Xi Victoria Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04996v1-abstract-short" style="display: inline;"> The development of large language models (LLMs) has expanded to multi-modal systems capable of processing text, images, and speech within a unified framework. Training these models demands significantly larger datasets and computational resources compared to text-only LLMs. To address the scaling challenges, we introduce Mixture-of-Transformers (MoT), a sparse multi-modal transformer architecture&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04996v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04996v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04996v1-abstract-full" style="display: none;"> The development of large language models (LLMs) has expanded to multi-modal systems capable of processing text, images, and speech within a unified framework. Training these models demands significantly larger datasets and computational resources compared to text-only LLMs. To address the scaling challenges, we introduce Mixture-of-Transformers (MoT), a sparse multi-modal transformer architecture that significantly reduces pretraining computational costs. MoT decouples non-embedding parameters of the model by modality -- including feed-forward networks, attention matrices, and layer normalization -- enabling modality-specific processing with global self-attention over the full input sequence. We evaluate MoT across multiple settings and model scales. In the Chameleon 7B setting (autoregressive text-and-image generation), MoT matches the dense baseline&#39;s performance using only 55.8\% of the FLOPs. When extended to include speech, MoT reaches speech performance comparable to the dense baseline with only 37.2\% of the FLOPs. 
In the Transfusion setting, where text and image are trained with different objectives, a 7B MoT model matches the image modality performance of the dense baseline with one third of the FLOPs, and a 760M MoT model outperforms a 1.4B dense baseline across key image generation metrics. System profiling further highlights MoT&#39;s practical benefits, achieving dense baseline image quality in 47.2\% of the wall-clock time and text quality in 75.6\% of the wall-clock time (measured on AWS p4de.24xlarge instances with NVIDIA A100 GPUs). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04996v1-abstract-full').style.display = 'none'; document.getElementById('2411.04996v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03700">arXiv:2411.03700</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.03700">pdf</a>, <a href="https://arxiv.org/format/2411.03700">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> The Root Shapes the Fruit: On the Persistence of Gender-Exclusive Harms in Aligned Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ovalle%2C+A">Anaelia Ovalle</a>, <a href="/search/cs?searchtype=author&amp;query=Pavasovic%2C+K+L">Krunoslav Lehman Pavasovic</a>, <a href="/search/cs?searchtype=author&amp;query=Martin%2C+L">Louis Martin</a>, <a href="/search/cs?searchtype=author&amp;query=Zettlemoyer%2C+L">Luke Zettlemoyer</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+E+M">Eric Michael Smith</a>, <a href="/search/cs?searchtype=author&amp;query=Williams%2C+A">Adina Williams</a>, <a href="/search/cs?searchtype=author&amp;query=Sagun%2C+L">Levent Sagun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03700v1-abstract-short" style="display: inline;"> Natural-language assistants are designed to provide users with helpful responses while avoiding harmful outputs, largely achieved through alignment to human preferences. Yet there is limited understanding of whether alignment techniques may inadvertently perpetuate or even amplify harmful biases inherited from their pre-aligned base models. This issue is compounded by the choice of bias evaluation&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03700v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03700v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03700v1-abstract-full" style="display: none;"> Natural-language assistants are designed to provide users with helpful responses while avoiding harmful outputs, largely achieved through alignment to human preferences. 
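The modality-decoupled feed-forward step can be sketched as follows: tokens stay interleaved in one sequence (so global self-attention is unchanged), but each modality's tokens pass through their own FFN parameters. The two-modality setup and sizes below are illustrative only, not MoT's implementation.

```python
# Sketch: modality-specific FFN parameters over an interleaved sequence.
import torch
import torch.nn as nn

class ModalityFFN(nn.Module):
    def __init__(self, d: int, n_modalities: int = 2):
        super().__init__()
        self.ffns = nn.ModuleList(
            nn.Sequential(nn.Linear(d, 4 * d), nn.GELU(), nn.Linear(4 * d, d))
            for _ in range(n_modalities)
        )

    def forward(self, x: torch.Tensor, modality: torch.Tensor) -> torch.Tensor:
        # x: (batch, seq, d); modality: (batch, seq) with values in {0, 1, ...}
        out = torch.zeros_like(x)
        for m, ffn in enumerate(self.ffns):
            mask = modality == m
            out[mask] = ffn(x[mask])       # only this modality's tokens use these weights
        return out

x = torch.randn(2, 10, 32)
modality = torch.randint(0, 2, (2, 10))    # 0 = text token, 1 = image token
print(ModalityFFN(32)(x, modality).shape)  # torch.Size([2, 10, 32])
```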
4. arXiv:2411.03700 [pdf, other] (cs.CL)
   The Root Shapes the Fruit: On the Persistence of Gender-Exclusive Harms in Aligned Language Models
   Authors: Anaelia Ovalle, Krunoslav Lehman Pavasovic, Louis Martin, Luke Zettlemoyer, Eric Michael Smith, Adina Williams, Levent Sagun
   Abstract: Natural-language assistants are designed to provide users with helpful responses while avoiding harmful outputs, largely achieved through alignment to human preferences. Yet there is limited understanding of whether alignment techniques may inadvertently perpetuate or even amplify harmful biases inherited from their pre-aligned base models. This issue is compounded by the choice of bias evaluation benchmarks in popular preference-finetuned models, which predominantly focus on dominant social categories, such as binary gender, thereby limiting insights into biases affecting underrepresented groups. Towards addressing this gap, we center transgender, nonbinary, and other gender-diverse identities to investigate how alignment procedures interact with pre-existing gender-diverse bias in LLMs. Our key contributions include: 1) a comprehensive survey of bias evaluation modalities across leading preference-finetuned LLMs, highlighting critical gaps in gender-diverse representation; 2) systematic evaluation of gender-diverse biases across 12 models spanning Direct Preference Optimization (DPO) stages, uncovering harms popular bias benchmarks fail to detect; and 3) a flexible framework for measuring harmful biases in implicit reward signals applicable to other social contexts. Our findings reveal that DPO-aligned models are particularly sensitive to supervised finetuning (SFT) and can amplify two forms of real-world gender-diverse harms from their base models: stigmatization and gender non-affirmative language. We conclude with recommendations tailored to DPO and broader alignment practices, advocating for the adoption of community-informed bias evaluation frameworks to more effectively identify and address underrepresented harms in LLMs.
   Submitted 6 November, 2024; originally announced November 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to 2024 Neurips Queer in AI Workshop</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.17251">arXiv:2410.17251</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.17251">pdf</a>, <a href="https://arxiv.org/format/2410.17251">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Altogether: Image Captioning via Re-aligning Alt-text </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xu%2C+H">Hu Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+P">Po-Yao Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Tan%2C+X+E">Xiaoqing Ellen Tan</a>, <a href="/search/cs?searchtype=author&amp;query=Yeh%2C+C">Ching-Feng Yeh</a>, <a href="/search/cs?searchtype=author&amp;query=Kahn%2C+J">Jacob Kahn</a>, <a href="/search/cs?searchtype=author&amp;query=Jou%2C+C">Christine Jou</a>, <a href="/search/cs?searchtype=author&amp;query=Ghosh%2C+G">Gargi Ghosh</a>, <a href="/search/cs?searchtype=author&amp;query=Levy%2C+O">Omer Levy</a>, <a href="/search/cs?searchtype=author&amp;query=Zettlemoyer%2C+L">Luke Zettlemoyer</a>, <a href="/search/cs?searchtype=author&amp;query=Yih%2C+W">Wen-tau Yih</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+S">Shang-Wen Li</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+S">Saining Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Feichtenhofer%2C+C">Christoph Feichtenhofer</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.17251v1-abstract-short" style="display: inline;"> This paper focuses on creating synthetic data to improve the quality of image captions. Existing works typically have two shortcomings. First, they caption images from scratch, ignoring existing alt-text metadata, and second, lack transparency if the captioners&#39; training data (e.g. GPT) is unknown. In this paper, we study a principled approach Altogether based on the key idea to edit and re-align&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17251v1-abstract-full').style.display = 'inline'; document.getElementById('2410.17251v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.17251v1-abstract-full" style="display: none;"> This paper focuses on creating synthetic data to improve the quality of image captions. Existing works typically have two shortcomings. First, they caption images from scratch, ignoring existing alt-text metadata, and second, lack transparency if the captioners&#39; training data (e.g. GPT) is unknown. In this paper, we study a principled approach Altogether based on the key idea to edit and re-align existing alt-texts associated with the images. 
5. arXiv:2410.17251 [pdf, other] (cs.CV, cs.CL)
   Altogether: Image Captioning via Re-aligning Alt-text
   Authors: Hu Xu, Po-Yao Huang, Xiaoqing Ellen Tan, Ching-Feng Yeh, Jacob Kahn, Christine Jou, Gargi Ghosh, Omer Levy, Luke Zettlemoyer, Wen-tau Yih, Shang-Wen Li, Saining Xie, Christoph Feichtenhofer
   Abstract: This paper focuses on creating synthetic data to improve the quality of image captions. Existing works typically have two shortcomings: first, they caption images from scratch, ignoring existing alt-text metadata; and second, they lack transparency when the captioners' training data (e.g., GPT) is unknown. In this paper, we study a principled approach, Altogether, based on the key idea of editing and re-aligning the existing alt-texts associated with the images. To generate training data, we perform human annotation in which annotators start with the existing alt-text and re-align it to the image content over multiple rounds, consequently constructing captions with rich visual concepts. This differs from prior work that carries out human annotation as a one-time description task based solely on images and annotator knowledge. We train a captioner on this data that generalizes the process of re-aligning alt-texts at scale. Our results show that our Altogether approach leads to richer image captions that also improve text-to-image generation and zero-shot image classification tasks.
   Submitted 22 October, 2024; originally announced October 2024.
   Comments: accepted by EMNLP 2024; MetaCLIPv2
class="abstract-short has-text-grey-dark mathjax" id="2410.11758v1-abstract-short" style="display: inline;"> We introduce Latent Action Pretraining for general Action models (LAPA), an unsupervised method for pretraining Vision-Language-Action (VLA) models without ground-truth robot action labels. Existing Vision-Language-Action models require action labels typically collected by human teleoperators during pretraining, which significantly limits possible data sources and scale. In this work, we propose a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11758v1-abstract-full').style.display = 'inline'; document.getElementById('2410.11758v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.11758v1-abstract-full" style="display: none;"> We introduce Latent Action Pretraining for general Action models (LAPA), an unsupervised method for pretraining Vision-Language-Action (VLA) models without ground-truth robot action labels. Existing Vision-Language-Action models require action labels typically collected by human teleoperators during pretraining, which significantly limits possible data sources and scale. In this work, we propose a method to learn from internet-scale videos that do not have robot action labels. We first train an action quantization model leveraging VQ-VAE-based objective to learn discrete latent actions between image frames, then pretrain a latent VLA model to predict these latent actions from observations and task descriptions, and finally finetune the VLA on small-scale robot manipulation data to map from latent to robot actions. Experimental results demonstrate that our method significantly outperforms existing techniques that train robot manipulation policies from large-scale videos. Furthermore, it outperforms the state-of-the-art VLA model trained with robotic action labels on real-world manipulation tasks that require language conditioning, generalization to unseen objects, and semantic generalization to unseen instructions. Training only on human manipulation videos also shows positive transfer, opening up the potential for leveraging web-scale data for robotics foundation model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11758v1-abstract-full').style.display = 'none'; document.getElementById('2410.11758v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Website: https://latentactionpretraining.github.io</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.11039">arXiv:2408.11039</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.11039">pdf</a>, <a href="https://arxiv.org/format/2408.11039">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Transfusion: Predict the Next Token and Diffuse Images with One Multi-Modal Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+C">Chunting Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+L">Lili Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Babu%2C+A">Arun Babu</a>, <a href="/search/cs?searchtype=author&amp;query=Tirumala%2C+K">Kushal Tirumala</a>, <a href="/search/cs?searchtype=author&amp;query=Yasunaga%2C+M">Michihiro Yasunaga</a>, <a href="/search/cs?searchtype=author&amp;query=Shamis%2C+L">Leonid Shamis</a>, <a href="/search/cs?searchtype=author&amp;query=Kahn%2C+J">Jacob Kahn</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+X">Xuezhe Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Zettlemoyer%2C+L">Luke Zettlemoyer</a>, <a href="/search/cs?searchtype=author&amp;query=Levy%2C+O">Omer Levy</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.11039v1-abstract-short" style="display: inline;"> We introduce Transfusion, a recipe for training a multi-modal model over discrete and continuous data. Transfusion combines the language modeling loss function (next token prediction) with diffusion to train a single transformer over mixed-modality sequences. We pretrain multiple Transfusion models up to 7B parameters from scratch on a mixture of text and image data, establishing scaling laws with&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.11039v1-abstract-full').style.display = 'inline'; document.getElementById('2408.11039v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.11039v1-abstract-full" style="display: none;"> We introduce Transfusion, a recipe for training a multi-modal model over discrete and continuous data. Transfusion combines the language modeling loss function (next token prediction) with diffusion to train a single transformer over mixed-modality sequences. We pretrain multiple Transfusion models up to 7B parameters from scratch on a mixture of text and image data, establishing scaling laws with respect to a variety of uni- and cross-modal benchmarks. Our experiments show that Transfusion scales significantly better than quantizing images and training a language model over discrete image tokens. By introducing modality-specific encoding and decoding layers, we can further improve the performance of Transfusion models, and even compress each image to just 16 patches. 
7. arXiv:2408.11039 [pdf, other] (cs.AI, cs.CV)
   Transfusion: Predict the Next Token and Diffuse Images with One Multi-Modal Model
   Authors: Chunting Zhou, Lili Yu, Arun Babu, Kushal Tirumala, Michihiro Yasunaga, Leonid Shamis, Jacob Kahn, Xuezhe Ma, Luke Zettlemoyer, Omer Levy
   Abstract: We introduce Transfusion, a recipe for training a multi-modal model over discrete and continuous data. Transfusion combines the language modeling loss function (next token prediction) with diffusion to train a single transformer over mixed-modality sequences. We pretrain multiple Transfusion models up to 7B parameters from scratch on a mixture of text and image data, establishing scaling laws with respect to a variety of uni- and cross-modal benchmarks. Our experiments show that Transfusion scales significantly better than quantizing images and training a language model over discrete image tokens. By introducing modality-specific encoding and decoding layers, we can further improve the performance of Transfusion models, and even compress each image to just 16 patches. We further demonstrate that scaling our Transfusion recipe to 7B parameters and 2T multi-modal tokens produces a model that can generate images and text on a par with similar-scale diffusion models and language models, reaping the benefits of both worlds.
   Submitted 20 August, 2024; originally announced August 2024.
   Comments: 23 pages
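The mixed objective this abstract describes, next-token cross-entropy on discrete text positions plus a denoising loss on continuous image positions of the same sequence, can be sketched as below. All tensors are random stand-ins for model outputs, and the 0.5 balancing weight is an assumption, not the paper's setting.

```python
# Sketch: one loss over a mixed-modality sequence, split by position type.
import torch
import torch.nn.functional as F

torch.manual_seed(0)
batch, seq, vocab, d = 2, 12, 100, 16
is_text = torch.rand(batch, seq) < 0.5    # which positions hold discrete text tokens

logits = torch.randn(batch, seq, vocab)   # LM head output (used at text positions)
targets = torch.randint(0, vocab, (batch, seq))
noise_pred = torch.randn(batch, seq, d)   # diffusion head output (image positions)
noise = torch.randn(batch, seq, d)        # the noise that was added to image latents

lm_loss = F.cross_entropy(logits[is_text], targets[is_text])
diff_loss = F.mse_loss(noise_pred[~is_text], noise[~is_text])
loss = lm_loss + 0.5 * diff_loss          # 0.5: an assumed balancing weight
print(float(loss))
```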
8. arXiv:2408.06518 [pdf, other] (cs.CL)
   Does Liking Yellow Imply Driving a School Bus? Semantic Leakage in Language Models
   Authors: Hila Gonen, Terra Blevins, Alisa Liu, Luke Zettlemoyer, Noah A. Smith
   Abstract: Despite their wide adoption, the biases and unintended behaviors of language models remain poorly understood. In this paper, we identify and characterize a previously undiscussed phenomenon, which we call semantic leakage, in which models leak irrelevant information from the prompt into the generation in unexpected ways. We propose an evaluation setting to detect semantic leakage both by humans and automatically, curate a diverse test suite for diagnosing this behavior, and measure significant semantic leakage in 13 flagship models. We also show that models exhibit semantic leakage in languages besides English and across different settings and generation scenarios. This discovery highlights yet another type of bias in language models that affects their generation patterns and behavior.
   Submitted 12 September, 2024; v1 submitted 12 August, 2024; originally announced August 2024.
9. arXiv:2408.04614 [pdf, other] (cs.CL, cs.AI, cs.LG)
   Better Alignment with Instruction Back-and-Forth Translation
   Authors: Thao Nguyen, Jeffrey Li, Sewoong Oh, Ludwig Schmidt, Jason Weston, Luke Zettlemoyer, Xian Li
   Abstract: We propose a new method, instruction back-and-forth translation, to construct high-quality synthetic data grounded in world knowledge for aligning large language models (LLMs). Given documents from a web corpus, we generate and curate synthetic instructions using the backtranslation approach proposed by Li et al. (2023a), and rewrite the responses to further improve their quality based on the initial documents. Fine-tuning with the resulting (backtranslated instruction, rewritten response) pairs yields higher win rates on AlpacaEval than using other common instruction datasets such as Humpback, ShareGPT, Open Orca, Alpaca-GPT4, and Self-instruct. We also demonstrate that rewriting the responses with an LLM outperforms direct distillation, and the two generated text distributions exhibit significant distinction in embedding space. Further analysis shows that our backtranslated instructions are of higher quality than other sources of synthetic instructions, while our responses are more diverse and complex than those obtained from distillation. Overall, we find that instruction back-and-forth translation combines the best of both worlds -- making use of the information diversity and quantity found on the web, while ensuring the quality of the responses, which is necessary for effective alignment.
   Submitted 13 August, 2024; v1 submitted 8 August, 2024; originally announced August 2024.
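The data-construction flow reads as two LM calls per document: backtranslate an instruction for which the document is a good answer, then rewrite the document into a response conditioned on that instruction. The `llm` stub below is a hypothetical placeholder for prompting an actual model, and the prompts are paraphrases, not the paper's.

```python
# Sketch: back-and-forth pair construction from a web document.
def llm(prompt: str) -> str:
    """Hypothetical stand-in for a call to an actual language model."""
    return f"<model output for: {prompt[:40]}...>"

def make_pair(document: str) -> tuple[str, str]:
    # Backtranslation step: infer an instruction the document answers well.
    instruction = llm(
        f"Write an instruction to which this document is a good answer:\n{document}"
    )
    # Rewriting step: turn the document into a direct response to it.
    response = llm(
        f"Rewrite this document as a direct, high-quality answer to "
        f"'{instruction}':\n{document}"
    )
    return instruction, response

inst, resp = make_pair("Web text explaining how transformers use attention.")
print(inst, resp, sep="\n")
```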
10. arXiv:2407.21770 [pdf, other] (cs.AI, cs.LG)
   MoMa: Efficient Early-Fusion Pre-training with Mixture of Modality-Aware Experts
   Authors: Xi Victoria Lin, Akshat Shrivastava, Liang Luo, Srinivasan Iyer, Mike Lewis, Gargi Ghosh, Luke Zettlemoyer, Armen Aghajanyan
   Abstract: We introduce MoMa, a novel modality-aware mixture-of-experts (MoE) architecture designed for pre-training mixed-modal, early-fusion language models. MoMa processes images and text in arbitrary sequences by dividing expert modules into modality-specific groups. These groups exclusively process designated tokens while employing learned routing within each group to maintain semantically informed adaptivity. Our empirical results reveal substantial pre-training efficiency gains through this modality-specific parameter allocation. Under a 1-trillion-token training budget, the MoMa 1.4B model, featuring 4 text experts and 4 image experts, achieves impressive FLOPs savings: 3.7x overall, with 2.6x for text and 5.2x for image processing compared to a compute-equivalent dense baseline, measured by pre-training loss. This outperforms the standard expert-choice MoE with 8 mixed-modal experts, which achieves 3x overall FLOPs savings (3x for text, 2.8x for image). Combining MoMa with mixture-of-depths (MoD) further improves pre-training FLOPs savings to 4.2x overall (text: 3.4x, image: 5.3x), although this combination hurts performance in causal inference due to increased sensitivity to router accuracy. These results demonstrate MoMa's potential to significantly advance the efficiency of mixed-modal, early-fusion language model pre-training, paving the way for more resource-efficient and capable multimodal AI systems.
   Submitted 12 August, 2024; v1 submitted 31 July, 2024; originally announced July 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">v2 -&gt; updated related work section; v3 -&gt; fixed spelling</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.12854">arXiv:2407.12854</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.12854">pdf</a>, <a href="https://arxiv.org/format/2407.12854">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Scaling Retrieval-Based Language Models with a Trillion-Token Datastore </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Shao%2C+R">Rulin Shao</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+J">Jacqueline He</a>, <a href="/search/cs?searchtype=author&amp;query=Asai%2C+A">Akari Asai</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+W">Weijia Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Dettmers%2C+T">Tim Dettmers</a>, <a href="/search/cs?searchtype=author&amp;query=Min%2C+S">Sewon Min</a>, <a href="/search/cs?searchtype=author&amp;query=Zettlemoyer%2C+L">Luke Zettlemoyer</a>, <a href="/search/cs?searchtype=author&amp;query=Koh%2C+P+W">Pang Wei Koh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.12854v1-abstract-short" style="display: inline;"> Scaling laws with respect to the amount of training data and the number of parameters allow us to predict the cost-benefit trade-offs of pretraining language models (LMs) in different configurations. In this paper, we consider another dimension of scaling: the amount of data available at inference time. Specifically, we find that increasing the size of the datastore used by a retrieval-based LM mo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.12854v1-abstract-full').style.display = 'inline'; document.getElementById('2407.12854v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.12854v1-abstract-full" style="display: none;"> Scaling laws with respect to the amount of training data and the number of parameters allow us to predict the cost-benefit trade-offs of pretraining language models (LMs) in different configurations. In this paper, we consider another dimension of scaling: the amount of data available at inference time. Specifically, we find that increasing the size of the datastore used by a retrieval-based LM monotonically improves language modeling and several downstream tasks without obvious saturation, such that a smaller model augmented with a large datastore outperforms a larger LM-only model on knowledge-intensive tasks.
By plotting compute-optimal scaling curves with varied datastore, model, and pretraining data sizes, we show that using larger datastores can significantly improve model performance for the same training compute budget. We carry out our study by constructing a 1.4 trillion-token datastore named MassiveDS, which is the largest and the most diverse open-sourced datastore for retrieval-based LMs to date, and designing an efficient pipeline for studying datastore scaling in a computationally accessible manner. Finally, we analyze the effect of improving the retriever, datastore quality filtering, and other design choices on our observed scaling trends. Overall, our results show that datastore size should be considered as an integral part of LM efficiency and performance trade-offs. To facilitate future research, we open-source our datastore and code at https://github.com/RulinShao/retrieval-scaling. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.12854v1-abstract-full').style.display = 'none'; document.getElementById('2407.12854v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.07087">arXiv:2407.07087</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.07087">pdf</a>, <a href="https://arxiv.org/format/2407.07087">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> CopyBench: Measuring Literal and Non-Literal Reproduction of Copyright-Protected Text in Language Model Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+T">Tong Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Asai%2C+A">Akari Asai</a>, <a href="/search/cs?searchtype=author&amp;query=Mireshghallah%2C+N">Niloofar Mireshghallah</a>, <a href="/search/cs?searchtype=author&amp;query=Min%2C+S">Sewon Min</a>, <a href="/search/cs?searchtype=author&amp;query=Grimmelmann%2C+J">James Grimmelmann</a>, <a href="/search/cs?searchtype=author&amp;query=Choi%2C+Y">Yejin Choi</a>, <a href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a>, <a href="/search/cs?searchtype=author&amp;query=Zettlemoyer%2C+L">Luke Zettlemoyer</a>, <a href="/search/cs?searchtype=author&amp;query=Koh%2C+P+W">Pang Wei Koh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.07087v2-abstract-short" style="display: inline;"> Evaluating the degree of reproduction of copyright-protected content by language models (LMs) is of significant interest to the AI and legal communities. Although both literal and non-literal similarities are considered by courts when assessing the degree of reproduction, prior research has focused only on literal similarities. 
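The datastore-scaling observation above can be illustrated with a tiny synthetic experiment: as more entries become retrievable, the best match for each query improves monotonically. Everything below (random embeddings, cosine similarity as the quality proxy) is a made-up stand-in for MassiveDS and the paper's pipeline.

```python
import numpy as np

# Toy illustration of datastore scaling: mean max cosine similarity of
# queries to the datastore improves as the datastore grows. Synthetic data.
rng = np.random.default_rng(0)
d = 64
queries = rng.normal(size=(100, d))
queries /= np.linalg.norm(queries, axis=1, keepdims=True)
datastore = rng.normal(size=(100_000, d))
datastore /= np.linalg.norm(datastore, axis=1, keepdims=True)

for n in (1_000, 10_000, 100_000):
    sims = queries @ datastore[:n].T          # cosine similarity to first n entries
    print(n, round(float(sims.max(axis=1).mean()), 4))
```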
To bridge this gap, we introduce CopyBench, a benchmark designed to me&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.07087v2-abstract-full').style.display = 'inline'; document.getElementById('2407.07087v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.07087v2-abstract-full" style="display: none;"> Evaluating the degree of reproduction of copyright-protected content by language models (LMs) is of significant interest to the AI and legal communities. Although both literal and non-literal similarities are considered by courts when assessing the degree of reproduction, prior research has focused only on literal similarities. To bridge this gap, we introduce CopyBench, a benchmark designed to measure both literal and non-literal copying in LM generations. Using copyrighted fiction books as text sources, we provide automatic evaluation protocols to assess literal and non-literal copying, balanced against the model utility in terms of the ability to recall facts from the copyrighted works and generate fluent completions. We find that, although literal copying is relatively rare, two types of non-literal copying -- event copying and character copying -- occur even in models as small as 7B parameters. Larger models demonstrate significantly more copying, with literal copying rates increasing from 0.2\% to 10.5\% and non-literal copying from 2.3\% to 5.9\% when comparing Llama3-8B and 70B models, respectively. We further evaluate the effectiveness of current strategies for mitigating copying and show that (1) training-time alignment can reduce literal copying but may increase non-literal copying, and (2) current inference-time mitigation methods primarily reduce literal but not non-literal copying. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.07087v2-abstract-full').style.display = 'none'; document.getElementById('2407.07087v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
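A rough sketch of the literal-copying half of a CopyBench-style check: flag a generation that shares a long verbatim span with the source text. The character thresholds below are arbitrary illustrations, not the benchmark's protocol.

```python
from difflib import SequenceMatcher

# Sketch of a literal-copying check: report the longest verbatim span
# shared by source and generation if it exceeds a threshold.
def literal_copy_span(source: str, generation: str, min_chars: int = 50):
    match = SequenceMatcher(None, source, generation, autojunk=False
                            ).find_longest_match(0, len(source), 0, len(generation))
    if match.size >= min_chars:
        return source[match.a:match.a + match.size]
    return None

src = "It was the best of times, it was the worst of times, it was the age of wisdom."
gen = "The novel opens: it was the best of times, it was the worst of times, it was the age of wisdom."
print(literal_copy_span(src, gen, min_chars=30))
```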
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.06460">arXiv:2407.06460</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.06460">pdf</a>, <a href="https://arxiv.org/format/2407.06460">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> MUSE: Machine Unlearning Six-Way Evaluation for Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Shi%2C+W">Weijia Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+J">Jaechan Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Y">Yangsibo Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Malladi%2C+S">Sadhika Malladi</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+J">Jieyu Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Holtzman%2C+A">Ari Holtzman</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+D">Daogao Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zettlemoyer%2C+L">Luke Zettlemoyer</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+N+A">Noah A. Smith</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chiyuan Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.06460v2-abstract-short" style="display: inline;"> Language models (LMs) are trained on vast amounts of text data, which may include private and copyrighted content. Data owners may request the removal of their data from a trained model due to privacy or copyright concerns. However, exactly unlearning only these datapoints (i.e., retraining with the data removed) is intractable in modern-day models. This has led to the development of many approxim&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.06460v2-abstract-full').style.display = 'inline'; document.getElementById('2407.06460v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.06460v2-abstract-full" style="display: none;"> Language models (LMs) are trained on vast amounts of text data, which may include private and copyrighted content. Data owners may request the removal of their data from a trained model due to privacy or copyright concerns. However, exactly unlearning only these datapoints (i.e., retraining with the data removed) is intractable in modern-day models. This has led to the development of many approximate unlearning algorithms. The evaluation of the efficacy of these algorithms has traditionally been narrow in scope, failing to precisely quantify the success and practicality of the algorithm from the perspectives of both the model deployers and the data owners. 
We address this issue by proposing MUSE, a comprehensive machine unlearning evaluation benchmark that enumerates six diverse desirable properties for unlearned models: (1) no verbatim memorization, (2) no knowledge memorization, (3) no privacy leakage, (4) utility preservation on data not intended for removal, (5) scalability with respect to the size of removal requests, and (6) sustainability over sequential unlearning requests. Using these criteria, we benchmark how effectively eight popular unlearning algorithms can unlearn Harry Potter books and news articles from 7B-parameter LMs. Our results demonstrate that most algorithms can prevent verbatim memorization and knowledge memorization to varying degrees, but only one algorithm does not lead to severe privacy leakage. Furthermore, existing algorithms fail to meet deployers&#39; expectations because they often degrade general model utility and also cannot sustainably accommodate successive unlearning requests or large-scale content removal. Our findings identify key issues with the practicality of existing unlearning algorithms on language models, and we release our benchmark to facilitate further evaluations: muse-bench.github.io <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.06460v2-abstract-full').style.display = 'none'; document.getElementById('2407.06460v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.18664">arXiv:2406.18664</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.18664">pdf</a>, <a href="https://arxiv.org/format/2406.18664">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Evaluating Copyright Takedown Methods for Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wei%2C+B">Boyi Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+W">Weijia Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Y">Yangsibo Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+N+A">Noah A. Smith</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chiyuan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zettlemoyer%2C+L">Luke Zettlemoyer</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+K">Kai Li</a>, <a href="/search/cs?searchtype=author&amp;query=Henderson%2C+P">Peter Henderson</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.18664v4-abstract-short" style="display: inline;"> Language models (LMs) derive their capabilities from extensive training on diverse data, including potentially copyrighted material.
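MUSE's first criterion, no verbatim memorization, can be approximated as follows: prompt the unlearned model with a forget-set prefix and measure n-gram overlap between its continuation and the true continuation. In the sketch below, `model_generate` is a hypothetical stand-in for an actual LM call, and the n-gram statistic is an illustrative proxy rather than the benchmark's exact metric.

```python
# Sketch of a "no verbatim memorization" check for an unlearned model.
def ngram_overlap(a: str, b: str, n: int = 4) -> float:
    grams = lambda s: {tuple(s.split()[i:i + n])
                       for i in range(len(s.split()) - n + 1)}
    ga, gb = grams(a), grams(b)
    return len(ga & gb) / max(len(ga), 1)

def verbatim_memorization(model_generate, passage: str, prefix_len: int = 32) -> float:
    words = passage.split()
    prefix, truth = " ".join(words[:prefix_len]), " ".join(words[prefix_len:])
    continuation = model_generate(prefix)       # hypothetical LM call
    return ngram_overlap(continuation, truth)   # ~0 for a well-unlearned model
```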
These models can memorize and generate content similar to their training data, posing potential concerns. Therefore, model creators are motivated to develop mitigation methods that prevent generating protected content. We term this procedure copyright takedowns fo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.18664v4-abstract-full').style.display = 'inline'; document.getElementById('2406.18664v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.18664v4-abstract-full" style="display: none;"> Language models (LMs) derive their capabilities from extensive training on diverse data, including potentially copyrighted material. These models can memorize and generate content similar to their training data, posing potential concerns. Therefore, model creators are motivated to develop mitigation methods that prevent generating protected content. We term this procedure copyright takedowns for LMs, noting the conceptual similarity to (but legal distinction from) the DMCA takedown. This paper introduces the first evaluation of the feasibility and side effects of copyright takedowns for LMs. We propose CoTaEval, an evaluation framework to assess the effectiveness of copyright takedown methods, the impact on the model&#39;s ability to retain uncopyrightable factual knowledge from the training data whose recitation is embargoed, and how well the model maintains its general utility and efficiency. We examine several strategies, including adding system prompts, decoding-time filtering interventions, and unlearning approaches. Our findings indicate that no tested method excels across all metrics, showing significant room for research in this unique problem setting and indicating potential unresolved challenges for live policy proposals. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.18664v4-abstract-full').style.display = 'none'; document.getElementById('2406.18664v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024.
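Among the strategy families CoTaEval examines, decoding-time filtering is the easiest to sketch: veto any candidate next token that would complete an n-gram from the embargoed text. The token-ID representation and the `step` candidate generator below are illustrative assumptions, not the framework's API.

```python
# Sketch of a decoding-time takedown filter over token IDs.
def blocked_ngrams(embargoed_ids, n=4):
    return {tuple(embargoed_ids[i:i + n])
            for i in range(len(embargoed_ids) - n + 1)}

def filtered_decode(step, prompt_ids, blocked, n=4, max_new=50):
    """`step(ids)` yields candidate next tokens, best first (hypothetical)."""
    out = list(prompt_ids)
    for _ in range(max_new):
        for tok in step(out):
            if tuple(out[-(n - 1):] + [tok]) not in blocked:
                out.append(tok)     # first candidate that avoids embargoed n-grams
                break
        else:
            break                   # every candidate blocked: stop generating
    return out
```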
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">31 pages, 9 figures, 14 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.14526">arXiv:2406.14526</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.14526">pdf</a>, <a href="https://arxiv.org/format/2406.14526">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Fantastic Copyrighted Beasts and How (Not) to Generate Them </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=He%2C+L">Luxi He</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Y">Yangsibo Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+W">Weijia Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+T">Tinghao Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+H">Haotian Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yue Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zettlemoyer%2C+L">Luke Zettlemoyer</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chiyuan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+D">Danqi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Henderson%2C+P">Peter Henderson</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.14526v1-abstract-short" style="display: inline;"> Recent studies show that image and video generation models can be prompted to reproduce copyrighted content from their training data, raising serious legal concerns around copyright infringement. Copyrighted characters, in particular, pose a difficult challenge for image generation services, with at least one lawsuit already awarding damages based on the generation of these characters. Yet, little&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.14526v1-abstract-full').style.display = 'inline'; document.getElementById('2406.14526v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.14526v1-abstract-full" style="display: none;"> Recent studies show that image and video generation models can be prompted to reproduce copyrighted content from their training data, raising serious legal concerns around copyright infringement. Copyrighted characters, in particular, pose a difficult challenge for image generation services, with at least one lawsuit already awarding damages based on the generation of these characters. Yet, little research has empirically examined this issue. We conduct a systematic evaluation to fill this gap. First, we build CopyCat, an evaluation suite consisting of diverse copyrighted characters and a novel evaluation pipeline. 
Our evaluation considers both the detection of similarity to copyrighted characters and the generated image&#39;s consistency with user input. It systematically shows that both image and video generation models can still generate these characters even when the characters&#39; names are not explicitly mentioned in the prompt, sometimes with only two generic keywords (e.g., prompting with &#34;videogame, plumber&#34; consistently generates Nintendo&#39;s Mario character). We then introduce techniques to semi-automatically identify such keywords or descriptions that trigger character generation. Using our evaluation suite, we study runtime mitigation strategies, including both existing methods and new strategies we propose. Our findings reveal that commonly employed strategies, such as prompt rewriting in the DALL-E system, are not sufficient as standalone guardrails. These strategies must be coupled with other approaches, like negative prompting, to effectively reduce the unintended generation of copyrighted characters. Our work provides empirical grounding to the discussion of copyright mitigation strategies and offers actionable insights for model deployers actively implementing them. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.14526v1-abstract-full').style.display = 'none'; document.getElementById('2406.14526v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.11794">arXiv:2406.11794</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.11794">pdf</a>, <a href="https://arxiv.org/format/2406.11794">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> DataComp-LM: In search of the next generation of training sets for language models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jeffrey Li</a>, <a href="/search/cs?searchtype=author&amp;query=Fang%2C+A">Alex Fang</a>, <a href="/search/cs?searchtype=author&amp;query=Smyrnis%2C+G">Georgios Smyrnis</a>, <a href="/search/cs?searchtype=author&amp;query=Ivgi%2C+M">Maor Ivgi</a>, <a href="/search/cs?searchtype=author&amp;query=Jordan%2C+M">Matt Jordan</a>, <a href="/search/cs?searchtype=author&amp;query=Gadre%2C+S">Samir Gadre</a>, <a href="/search/cs?searchtype=author&amp;query=Bansal%2C+H">Hritik Bansal</a>, <a href="/search/cs?searchtype=author&amp;query=Guha%2C+E">Etash Guha</a>, <a href="/search/cs?searchtype=author&amp;query=Keh%2C+S">Sedrick Keh</a>, <a href="/search/cs?searchtype=author&amp;query=Arora%2C+K">Kushal Arora</a>, <a href="/search/cs?searchtype=author&amp;query=Garg%2C+S">Saurabh Garg</a>, <a href="/search/cs?searchtype=author&amp;query=Xin%2C+R">Rui Xin</a>, <a href="/search/cs?searchtype=author&amp;query=Muennighoff%2C+N">Niklas Muennighoff</a>, <a href="/search/cs?searchtype=author&amp;query=Heckel%2C+R">Reinhard Heckel</a>, <a
href="/search/cs?searchtype=author&amp;query=Mercat%2C+J">Jean Mercat</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+M">Mayee Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Gururangan%2C+S">Suchin Gururangan</a>, <a href="/search/cs?searchtype=author&amp;query=Wortsman%2C+M">Mitchell Wortsman</a>, <a href="/search/cs?searchtype=author&amp;query=Albalak%2C+A">Alon Albalak</a>, <a href="/search/cs?searchtype=author&amp;query=Bitton%2C+Y">Yonatan Bitton</a>, <a href="/search/cs?searchtype=author&amp;query=Nezhurina%2C+M">Marianna Nezhurina</a>, <a href="/search/cs?searchtype=author&amp;query=Abbas%2C+A">Amro Abbas</a>, <a href="/search/cs?searchtype=author&amp;query=Hsieh%2C+C">Cheng-Yu Hsieh</a>, <a href="/search/cs?searchtype=author&amp;query=Ghosh%2C+D">Dhruba Ghosh</a>, <a href="/search/cs?searchtype=author&amp;query=Gardner%2C+J">Josh Gardner</a> , et al. (34 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.11794v3-abstract-short" style="display: inline;"> We introduce DataComp for Language Models (DCLM), a testbed for controlled dataset experiments with the goal of improving language models. As part of DCLM, we provide a standardized corpus of 240T tokens extracted from Common Crawl, effective pretraining recipes based on the OpenLM framework, and a broad suite of 53 downstream evaluations. Participants in the DCLM benchmark can experiment with dat&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.11794v3-abstract-full').style.display = 'inline'; document.getElementById('2406.11794v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.11794v3-abstract-full" style="display: none;"> We introduce DataComp for Language Models (DCLM), a testbed for controlled dataset experiments with the goal of improving language models. As part of DCLM, we provide a standardized corpus of 240T tokens extracted from Common Crawl, effective pretraining recipes based on the OpenLM framework, and a broad suite of 53 downstream evaluations. Participants in the DCLM benchmark can experiment with data curation strategies such as deduplication, filtering, and data mixing at model scales ranging from 412M to 7B parameters. As a baseline for DCLM, we conduct extensive experiments and find that model-based filtering is key to assembling a high-quality training set. The resulting dataset, DCLM-Baseline enables training a 7B parameter language model from scratch to 64% 5-shot accuracy on MMLU with 2.6T training tokens. Compared to MAP-Neo, the previous state-of-the-art in open-data language models, DCLM-Baseline represents a 6.6 percentage point improvement on MMLU while being trained with 40% less compute. Our baseline model is also comparable to Mistral-7B-v0.3 and Llama 3 8B on MMLU (63% &amp; 66%), and performs similarly on an average of 53 natural language understanding tasks while being trained with 6.6x less compute than Llama 3 8B. Our results highlight the importance of dataset design for training language models and offer a starting point for further research on data curation. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.11794v3-abstract-full').style.display = 'none'; document.getElementById('2406.11794v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://www.datacomp.ai/dclm/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.09403">arXiv:2406.09403</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.09403">pdf</a>, <a href="https://arxiv.org/format/2406.09403">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Visual Sketchpad: Sketching as a Visual Chain of Thought for Multimodal Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hu%2C+Y">Yushi Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+W">Weijia Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Fu%2C+X">Xingyu Fu</a>, <a href="/search/cs?searchtype=author&amp;query=Roth%2C+D">Dan Roth</a>, <a href="/search/cs?searchtype=author&amp;query=Ostendorf%2C+M">Mari Ostendorf</a>, <a href="/search/cs?searchtype=author&amp;query=Zettlemoyer%2C+L">Luke Zettlemoyer</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+N+A">Noah A Smith</a>, <a href="/search/cs?searchtype=author&amp;query=Krishna%2C+R">Ranjay Krishna</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.09403v3-abstract-short" style="display: inline;"> Humans draw to facilitate reasoning: we draw auxiliary lines when solving geometry problems; we mark and circle when reasoning on maps; we use sketches to amplify our ideas and relieve our limited-capacity working memory. However, such actions are missing in current multimodal language models (LMs). Current chain-of-thought and tool-use paradigms only use text as intermediate reasoning steps. In t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.09403v3-abstract-full').style.display = 'inline'; document.getElementById('2406.09403v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.09403v3-abstract-full" style="display: none;"> Humans draw to facilitate reasoning: we draw auxiliary lines when solving geometry problems; we mark and circle when reasoning on maps; we use sketches to amplify our ideas and relieve our limited-capacity working memory. However, such actions are missing in current multimodal language models (LMs). Current chain-of-thought and tool-use paradigms only use text as intermediate reasoning steps. 
In this work, we introduce Sketchpad, a framework that gives multimodal LMs a visual sketchpad and tools to draw on the sketchpad. The LM conducts planning and reasoning according to the visual artifacts it has drawn. Different from prior work, which uses text-to-image models to enable LMs to draw, Sketchpad enables LMs to draw with lines, boxes, marks, etc., which is closer to human sketching and better facilitates reasoning. Sketchpad can also use specialist vision models during the sketching process (e.g., draw bounding boxes with object detection models, draw masks with segmentation models), to further enhance visual perception and reasoning. We experiment with a wide range of math tasks (including geometry, functions, graphs, and chess) and complex visual reasoning tasks. Sketchpad substantially improves performance on all tasks over strong base models with no sketching, yielding an average gain of 12.7% on math tasks and 8.6% on vision tasks. GPT-4o with Sketchpad sets a new state of the art on all tasks, including V*Bench (80.3%), BLINK spatial reasoning (83.9%), and visual correspondence (80.8%). All code and data are at https://visualsketchpad.github.io/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.09403v3-abstract-full').style.display = 'none'; document.getElementById('2406.09403v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to NeurIPS 2024. Project and code URL: https://visualsketchpad.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.13218">arXiv:2405.13218</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.13218">pdf</a>, <a href="https://arxiv.org/format/2405.13218">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Computational Tradeoffs in Image Synthesis: Diffusion, Masked-Token, and Next-Token Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kilian%2C+M">Maciej Kilian</a>, <a href="/search/cs?searchtype=author&amp;query=Jampani%2C+V">Varun Jampani</a>, <a href="/search/cs?searchtype=author&amp;query=Zettlemoyer%2C+L">Luke Zettlemoyer</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.13218v2-abstract-short" style="display: inline;"> Nearly every recent image synthesis approach, including diffusion, masked-token prediction, and next-token prediction, uses a Transformer network architecture. Despite this common backbone, there has been no direct, compute-controlled comparison of how these approaches affect performance and efficiency.
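A toy version of the Sketchpad loop described above: a (stubbed) LM requests an auxiliary drawing, a tool renders it, and the artifact would be fed back as an image for the next reasoning step. The tool name and the matplotlib rendering are illustrative assumptions, not the framework's actual toolset.

```python
import matplotlib
matplotlib.use("Agg")  # headless backend so the sketch renders to a file
import matplotlib.pyplot as plt

# Illustrative sketching tool: render an auxiliary line and return the artifact.
def draw_auxiliary_line(points, out_path="sketch.png"):
    xs, ys = zip(*points)
    fig, ax = plt.subplots()
    ax.plot(xs, ys, "r--", label="auxiliary line")   # the "sketching" action
    ax.legend()
    fig.savefig(out_path)
    plt.close(fig)
    return out_path

artifact = draw_auxiliary_line([(0, 0), (3, 4)])
print("the LM would now reason over:", artifact)
```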
We analyze the scalability of each approach through the lens of compute budget measured in FLOP&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.13218v2-abstract-full').style.display = 'inline'; document.getElementById('2405.13218v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.13218v2-abstract-full" style="display: none;"> Nearly every recent image synthesis approach, including diffusion, masked-token prediction, and next-token prediction, uses a Transformer network architecture. Despite this common backbone, there has been no direct, compute-controlled comparison of how these approaches affect performance and efficiency. We analyze the scalability of each approach through the lens of compute budget measured in FLOPs. We find that token prediction methods, led by next-token prediction, significantly outperform diffusion on prompt following. On image quality, while next-token prediction initially performs better, scaling trends suggest it is eventually matched by diffusion. We compare the inference compute efficiency of each approach and find that next-token prediction is by far the most efficient. Based on our findings, we recommend diffusion for applications targeting image quality and low latency, and next-token prediction when prompt following or throughput is more important. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.13218v2-abstract-full').style.display = 'none'; document.getElementById('2405.13218v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024.
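The compute-budget methodology above amounts to fitting a scaling curve per approach and comparing the curves at a target budget. The sketch below fits a power law metric = a * FLOPs^(-b) on synthetic numbers; the losses and the extrapolation point are invented for demonstration.

```python
import numpy as np

# Fit a power law per method on (synthetic) loss-vs-FLOPs points,
# then extrapolate one decade beyond the measured budgets.
flops = np.array([1e18, 1e19, 1e20, 1e21])
loss_next_token = np.array([3.2, 2.8, 2.5, 2.3])   # made-up measurements
loss_diffusion = np.array([3.6, 3.0, 2.55, 2.25])  # made-up measurements

for name, y in [("next-token", loss_next_token), ("diffusion", loss_diffusion)]:
    b, log_a = np.polyfit(np.log(flops), np.log(y), 1)  # log-log linear fit
    pred = np.exp(log_a) * (1e22) ** b
    print(f"{name}: exponent={b:.3f}, predicted loss @1e22 FLOPs={pred:.2f}")
```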
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.01582">arXiv:2405.01582</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.01582">pdf</a>, <a href="https://arxiv.org/format/2405.01582">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Text Quality-Based Pruning for Efficient Training of Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sharma%2C+V">Vasu Sharma</a>, <a href="/search/cs?searchtype=author&amp;query=Padthe%2C+K">Karthik Padthe</a>, <a href="/search/cs?searchtype=author&amp;query=Ardalani%2C+N">Newsha Ardalani</a>, <a href="/search/cs?searchtype=author&amp;query=Tirumala%2C+K">Kushal Tirumala</a>, <a href="/search/cs?searchtype=author&amp;query=Howes%2C+R">Russell Howes</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+H">Hu Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+P">Po-Yao Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+S">Shang-Wen Li</a>, <a href="/search/cs?searchtype=author&amp;query=Aghajanyan%2C+A">Armen Aghajanyan</a>, <a href="/search/cs?searchtype=author&amp;query=Ghosh%2C+G">Gargi Ghosh</a>, <a href="/search/cs?searchtype=author&amp;query=Zettlemoyer%2C+L">Luke Zettlemoyer</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.01582v3-abstract-short" style="display: inline;"> In recent times, training Language Models (LMs) has relied on computationally heavy training over massive datasets, which makes this training process extremely laborious. In this paper, we propose a novel method for numerically evaluating text quality in large unlabelled NLP datasets in a model-agnostic manner to assign the text instances a &#34;quality score&#34;. By proposing the text quality metric, th&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.01582v3-abstract-full').style.display = 'inline'; document.getElementById('2405.01582v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.01582v3-abstract-full" style="display: none;"> In recent times, training Language Models (LMs) has relied on computationally heavy training over massive datasets, which makes this training process extremely laborious. In this paper, we propose a novel method for numerically evaluating text quality in large unlabelled NLP datasets in a model-agnostic manner to assign the text instances a &#34;quality score&#34;. By proposing the text quality metric, the paper establishes a framework to identify and eliminate low-quality text instances, leading to improved training efficiency for LMs. Experimental results over multiple models and datasets demonstrate the efficacy of this approach, showcasing substantial gains in training effectiveness and highlighting the potential for resource-efficient LM training.
For example, when training on the OpenWebText dataset we observe an absolute accuracy improvement of 0.9%, averaged over 14 downstream evaluation tasks and multiple LMs, while using 40% less data and training 42% faster; on the Wikipedia dataset we observe a 0.8% average absolute accuracy improvement while using 20% less data and training 21% faster. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.01582v3-abstract-full').style.display = 'none'; document.getElementById('2405.01582v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.16030">arXiv:2404.16030</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.16030">pdf</a>, <a href="https://arxiv.org/format/2404.16030">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> MoDE: CLIP Data Experts via Clustering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ma%2C+J">Jiawei Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+P">Po-Yao Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+S">Saining Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+S">Shang-Wen Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zettlemoyer%2C+L">Luke Zettlemoyer</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+S">Shih-Fu Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Yih%2C+W">Wen-Tau Yih</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+H">Hu Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.16030v1-abstract-short" style="display: inline;"> The success of contrastive language-image pretraining (CLIP) relies on the supervision from the pairing between images and captions, which tends to be noisy in web-crawled data. We present Mixture of Data Experts (MoDE) and learn a system of CLIP data experts via clustering. Each data expert is trained on one data cluster, being less sensitive to false-negative noise in other clusters.
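A model-agnostic quality score of the kind the pruning paper describes can be sketched with cheap text statistics; the specific features and threshold below are illustrative, not the paper's metric.

```python
import re

# Toy model-agnostic "quality score" built from cheap text statistics.
def quality_score(text: str) -> float:
    words = text.split()
    if not words:
        return 0.0
    alpha_ratio = sum(w.isalpha() for w in words) / len(words)
    uniq_ratio = len(set(words)) / len(words)          # penalizes repetition
    punct_noise = len(re.findall(r"[!?]{2,}", text))   # penalizes spammy punctuation
    return alpha_ratio + uniq_ratio - 0.5 * punct_noise

corpus = ["The committee reviewed the draft and proposed two amendments.",
          "FREE FREE FREE click now!!! click now!!!"]
kept = [t for t in corpus if quality_score(t) > 1.2]   # threshold is arbitrary
print(kept)  # only the first text survives pruning
```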
At inferen&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.16030v1-abstract-full').style.display = 'inline'; document.getElementById('2404.16030v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.16030v1-abstract-full" style="display: none;"> The success of contrastive language-image pretraining (CLIP) relies on the supervision from the pairing between images and captions, which tends to be noisy in web-crawled data. We present Mixture of Data Experts (MoDE) and learn a system of CLIP data experts via clustering. Each data expert is trained on one data cluster, being less sensitive to false-negative noise in other clusters. At inference time, we ensemble their outputs by applying weights determined through the correlation between task metadata and cluster conditions. To estimate the correlation precisely, the samples in one cluster should be semantically similar, but the number of data experts should still be reasonable for training and inference. As such, we consider the ontology in human language and propose to use fine-grained cluster centers to represent each data expert at a coarse-grained level. Experimental studies show that four CLIP data experts on ViT-B/16 outperform the ViT-L/14 by OpenAI CLIP and OpenCLIP on zero-shot image classification but with less ($&lt;$35\%) training cost. Meanwhile, MoDE can train all data experts asynchronously and can flexibly include new data experts. The code is available at https://github.com/facebookresearch/MetaCLIP/tree/main/mode. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.16030v1-abstract-full').style.display = 'none'; document.getElementById('2404.16030v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">IEEE CVPR 2024 Camera Ready.
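In outline, the MoDE recipe is: cluster the training captions, train one CLIP data expert per cluster, and ensemble the experts at inference weighted by the proximity of task metadata to each cluster center. The sketch below uses synthetic embeddings and a softmax-over-distance weighting as stand-ins for the paper's design.

```python
import numpy as np

# Toy MoDE-style pipeline on synthetic caption embeddings.
rng = np.random.default_rng(0)
emb = rng.normal(size=(500, 8))

def kmeans(x, k=4, iters=20):
    centers = x[rng.choice(len(x), k, replace=False)]
    for _ in range(iters):
        assign = ((x[:, None] - centers[None]) ** 2).sum(-1).argmin(1)
        centers = np.stack([x[assign == j].mean(0) if (assign == j).any()
                            else centers[j] for j in range(k)])
    return centers, assign

centers, assign = kmeans(emb)                  # one "data expert" per cluster

def ensemble_weights(task_meta_emb):
    d = ((centers - task_meta_emb) ** 2).sum(-1)
    w = np.exp(-d)
    return w / w.sum()                         # closer cluster -> larger weight

print(ensemble_weights(rng.normal(size=8)).round(3))
```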
Code Link: https://github.com/facebookresearch/MetaCLIP/tree/main/mode</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.08801">arXiv:2404.08801</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.08801">pdf</a>, <a href="https://arxiv.org/format/2404.08801">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Megalodon: Efficient LLM Pretraining and Inference with Unlimited Context Length </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ma%2C+X">Xuezhe Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+X">Xiaomeng Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Xiong%2C+W">Wenhan Xiong</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+B">Beidi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+L">Lili Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Hao Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=May%2C+J">Jonathan May</a>, <a href="/search/cs?searchtype=author&amp;query=Zettlemoyer%2C+L">Luke Zettlemoyer</a>, <a href="/search/cs?searchtype=author&amp;query=Levy%2C+O">Omer Levy</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+C">Chunting Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.08801v2-abstract-short" style="display: inline;"> The quadratic complexity and weak length extrapolation of Transformers limit their ability to scale to long sequences, and while sub-quadratic solutions like linear attention and state space models exist, they empirically underperform Transformers in pretraining efficiency and downstream task accuracy. We introduce Megalodon, a neural architecture for efficient sequence modeling with unlimited co&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.08801v2-abstract-full').style.display = 'inline'; document.getElementById('2404.08801v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.08801v2-abstract-full" style="display: none;"> The quadratic complexity and weak length extrapolation of Transformers limit their ability to scale to long sequences, and while sub-quadratic solutions like linear attention and state space models exist, they empirically underperform Transformers in pretraining efficiency and downstream task accuracy. We introduce Megalodon, a neural architecture for efficient sequence modeling with unlimited context length. Megalodon inherits the architecture of Mega (exponential moving average with gated attention), and further introduces multiple technical components to improve its capability and stability, including complex exponential moving average (CEMA), timestep normalization layer, normalized attention mechanism and pre-norm with two-hop residual configuration. In a controlled head-to-head comparison with Llama2, Megalodon achieves better efficiency than Transformer at the scale of 7 billion parameters and 2 trillion training tokens.
Megalodon reaches a training loss of 1.70, landing mid-way between Llama2-7B (1.75) and 13B (1.67). Code: https://github.com/XuezheMax/megalodon <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.08801v2-abstract-full').style.display = 'none'; document.getElementById('2404.08801v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 6 figures and 8 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.10691">arXiv:2403.10691</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.10691">pdf</a>, <a href="https://arxiv.org/format/2403.10691">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.18653/v1/2024.acl-long.804">10.18653/v1/2024.acl-long.804 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> MYTE: Morphology-Driven Byte Encoding for Better and Fairer Multilingual Language Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Limisiewicz%2C+T">Tomasz Limisiewicz</a>, <a href="/search/cs?searchtype=author&amp;query=Blevins%2C+T">Terra Blevins</a>, <a href="/search/cs?searchtype=author&amp;query=Gonen%2C+H">Hila Gonen</a>, <a href="/search/cs?searchtype=author&amp;query=Ahia%2C+O">Orevaoghene Ahia</a>, <a href="/search/cs?searchtype=author&amp;query=Zettlemoyer%2C+L">Luke Zettlemoyer</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.10691v2-abstract-short" style="display: inline;"> A major consideration in multilingual language modeling is how to best represent languages with diverse vocabularies and scripts. Although contemporary text encoding methods cover most of the world&#39;s writing systems, they exhibit bias towards the high-resource languages of the Global West. 
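The CEMA component Megalodon introduces generalizes an exponential moving average to a complex-valued decay. Below is a minimal sketch assuming a simplified scalar recurrence h_t = alpha * x_t + (1 - alpha) * theta * h_{t-1} with theta = r * e^{i*phi}; the paper's actual parameterization is richer, and the values here are illustrative.

```python
import numpy as np

# Simplified complex exponential moving average (CEMA) recurrence.
def cema(x, alpha=0.3, r=0.95, phi=0.5):
    theta = r * np.exp(1j * phi)        # complex decay: magnitude r, rotation phi
    h = 0.0 + 0.0j
    out = []
    for x_t in x:
        h = alpha * x_t + (1 - alpha) * theta * h
        out.append(h.real)              # project the state back to the reals
    return np.array(out)

signal = np.sin(np.linspace(0, 6, 50))
print(cema(signal)[:5].round(3))
```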
As a result, texts of underrepresented languages tend to be segmented into long sequences of linguistically m&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.10691v2-abstract-full').style.display = 'inline'; document.getElementById('2403.10691v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.10691v2-abstract-full" style="display: none;"> A major consideration in multilingual language modeling is how to best represent languages with diverse vocabularies and scripts. Although contemporary text encoding methods cover most of the world&#39;s writing systems, they exhibit bias towards the high-resource languages of the Global West. As a result, texts of underrepresented languages tend to be segmented into long sequences of linguistically meaningless units. To address the disparities, we introduce a new paradigm that encodes the same information with segments of consistent size across diverse languages. Our encoding convention (MYTE) is based on morphemes, as their inventories are more balanced across languages than characters, which are used in previous methods. We show that MYTE produces shorter encodings for all 99 analyzed languages, with the most notable improvements for non-European languages and non-Latin scripts. This, in turn, improves multilingual LM performance and diminishes the perplexity gap throughout diverse languages. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.10691v2-abstract-full').style.display = 'none'; document.getElementById('2403.10691v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. 
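The MYTE idea can be illustrated by giving frequent morphemes their own short codes, so morphologically complex words no longer expand into long byte sequences. The two-entry inventory and code values below are invented for the demo; real byte assignments would need to avoid collisions with UTF-8 continuation bytes.

```python
# Toy morpheme-aware byte encoder; the inventory is a hypothetical stand-in.
MORPHEME_CODES = {"walk": b"\x80", "ing": b"\x81"}

def myte_like_encode(text: str) -> bytes:
    out, i = b"", 0
    while i < len(text):
        for morph, code in MORPHEME_CODES.items():
            if text.startswith(morph, i):
                out += code             # one byte for a whole morpheme
                i += len(morph)
                break
        else:
            out += text[i].encode("utf-8")
            i += 1
    return out

print(len("walking".encode("utf-8")), len(myte_like_encode("walking")))  # 7 vs 2
```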
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published at ACL 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.03187">arXiv:2403.03187</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.03187">pdf</a>, <a href="https://arxiv.org/format/2403.03187">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Reliable, Adaptable, and Attributable Language Models with Retrieval </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Asai%2C+A">Akari Asai</a>, <a href="/search/cs?searchtype=author&amp;query=Zhong%2C+Z">Zexuan Zhong</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+D">Danqi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Koh%2C+P+W">Pang Wei Koh</a>, <a href="/search/cs?searchtype=author&amp;query=Zettlemoyer%2C+L">Luke Zettlemoyer</a>, <a href="/search/cs?searchtype=author&amp;query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a>, <a href="/search/cs?searchtype=author&amp;query=Yih%2C+W">Wen-tau Yih</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.03187v1-abstract-short" style="display: inline;"> Parametric language models (LMs), which are trained on vast amounts of web data, exhibit remarkable flexibility and capability. However, they still face practical challenges such as hallucinations, difficulty in adapting to new data distributions, and a lack of verifiability. In this position paper, we advocate for retrieval-augmented LMs to replace parametric LMs as the next generation of LMs. By&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.03187v1-abstract-full').style.display = 'inline'; document.getElementById('2403.03187v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.03187v1-abstract-full" style="display: none;"> Parametric language models (LMs), which are trained on vast amounts of web data, exhibit remarkable flexibility and capability. However, they still face practical challenges such as hallucinations, difficulty in adapting to new data distributions, and a lack of verifiability. In this position paper, we advocate for retrieval-augmented LMs to replace parametric LMs as the next generation of LMs. By incorporating large-scale datastores during inference, retrieval-augmented LMs can be more reliable, adaptable, and attributable. Despite their potential, retrieval-augmented LMs have yet to be widely adopted due to several obstacles: specifically, current retrieval-augmented LMs struggle to leverage helpful text beyond knowledge-intensive tasks such as question answering, have limited interaction between retrieval and LM components, and lack the infrastructure for scaling. To address these, we propose a roadmap for developing general-purpose retrieval-augmented LMs. 

arXiv:2402.10496 (https://arxiv.org/abs/2402.10496) [cs.CL, cs.AI]
Comparing Hallucination Detection Metrics for Multilingual Generation
Authors: Haoqiang Kang, Terra Blevins, Luke Zettlemoyer
Abstract: While many hallucination detection techniques have been evaluated on English text, their effectiveness in multilingual contexts remains unknown. This paper assesses how well various factual hallucination detection metrics (lexical metrics like ROUGE and Named Entity Overlap, and Natural Language Inference (NLI)-based metrics) identify hallucinations in generated biographical summaries across languages. We compare how well the automatic metrics correlate with each other and whether they agree with human judgments of factuality. Our analysis reveals that while the lexical metrics are ineffective, NLI-based metrics perform well, correlating with human annotations in many settings and often outperforming supervised models. However, NLI metrics are still limited, as they do not detect single-fact hallucinations well and fail for lower-resource languages. Therefore, our findings highlight the gaps in existing hallucination detection methods for non-English languages and motivate future research to develop more robust multilingual detection methods for LLM hallucinations.
Submitted 15 June, 2024; v1 submitted 16 February, 2024; originally announced February 2024.
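A minimal sketch of the two metric families the paper compares; `fake_entail_prob` is a hypothetical stand-in for a trained NLI model, and the reference and summary strings are invented:

```python
# Lexical overlap vs. NLI-based factuality scoring (toy sketch).
def lexical_overlap(reference: str, generated: str) -> float:
    """Crude unigram overlap, standing in for ROUGE / Named Entity Overlap."""
    ref, gen = set(reference.lower().split()), set(generated.lower().split())
    return len(ref & gen) / max(len(gen), 1)

def nli_support(reference: str, generated: str, entail_prob) -> float:
    """Average P(entailment) of each generated sentence vs. the reference."""
    sents = [s.strip() for s in generated.split(".") if s.strip()]
    return sum(entail_prob(reference, s) for s in sents) / max(len(sents), 1)

def fake_entail_prob(premise: str, hypothesis: str) -> float:
    return 0.9 if "1937" in hypothesis else 0.1   # toy NLI behaviour

ref = "Maria Lopez, born 1937, was a Chilean poet."
gen = "Maria Lopez was born in 1937. She won the Nobel Prize in physics."
print(lexical_overlap(ref, gen), nli_support(ref, gen, fake_entail_prob))
```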

arXiv:2402.07841 (https://arxiv.org/abs/2402.07841) [cs.CL]
Do Membership Inference Attacks Work on Large Language Models?
Authors: Michael Duan, Anshuman Suri, Niloofar Mireshghallah, Sewon Min, Weijia Shi, Luke Zettlemoyer, Yulia Tsvetkov, Yejin Choi, David Evans, Hannaneh Hajishirzi
Abstract: Membership inference attacks (MIAs) attempt to predict whether a particular datapoint is a member of a target model's training data. Despite extensive research on traditional machine learning models, there has been limited work studying MIAs on the pre-training data of large language models (LLMs). We perform a large-scale evaluation of MIAs over a suite of language models (LMs) trained on the Pile, ranging from 160M to 12B parameters. We find that MIAs barely outperform random guessing in most settings across varying LLM sizes and domains. Our further analyses reveal that this poor performance can be attributed to (1) the combination of a large dataset and few training iterations, and (2) an inherently fuzzy boundary between members and non-members. We identify specific settings where LLMs have been shown to be vulnerable to membership inference and show that the apparent success in such settings can be attributed to a distribution shift, such as when members and non-members are drawn from seemingly identical domains but with different temporal ranges. We release our code and data as a unified benchmark package that includes all existing MIAs, supporting future work.
Submitted 16 September, 2024; v1 submitted 12 February, 2024; originally announced February 2024.
Comments: Accepted at Conference on Language Modeling (COLM), 2024
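As a minimal example of the attack family being evaluated, here is a classic loss-thresholding MIA sketch, assuming access only to per-token log-probabilities; this is not the paper's benchmark code, and the numbers are invented:

```python
# Loss-thresholding MIA: predict "member" when the model's average per-token
# negative log-likelihood on the text falls below a tuned threshold.
def loss_attack(token_logprobs: list[float], threshold: float = 3.0) -> bool:
    nll = -sum(token_logprobs) / len(token_logprobs)
    return nll < threshold  # low loss suggests the text was seen in training

member_text = [-0.8, -1.2, -0.5, -0.9]     # toy log-probs: well predicted
nonmember_text = [-4.1, -3.6, -5.0, -2.9]  # toy log-probs: poorly predicted
print(loss_attack(member_text), loss_attack(nonmember_text))  # True, False
```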
href="/search/cs?searchtype=author&amp;query=Dumas%2C+J">Jennifer Dumas</a>, <a href="/search/cs?searchtype=author&amp;query=Elazar%2C+Y">Yanai Elazar</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+Y">Yuling Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Hessel%2C+J">Jack Hessel</a>, <a href="/search/cs?searchtype=author&amp;query=Khot%2C+T">Tushar Khot</a>, <a href="/search/cs?searchtype=author&amp;query=Merrill%2C+W">William Merrill</a>, <a href="/search/cs?searchtype=author&amp;query=Morrison%2C+J">Jacob Morrison</a>, <a href="/search/cs?searchtype=author&amp;query=Muennighoff%2C+N">Niklas Muennighoff</a>, <a href="/search/cs?searchtype=author&amp;query=Naik%2C+A">Aakanksha Naik</a>, <a href="/search/cs?searchtype=author&amp;query=Nam%2C+C">Crystal Nam</a> , et al. (18 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.00838v4-abstract-short" style="display: inline;"> Language models (LMs) have become ubiquitous in both NLP research and in commercial product offerings. As their commercial importance has surged, the most powerful models have become closed off, gated behind proprietary interfaces, with important details of their training data, architectures, and development undisclosed. Given the importance of these details in scientifically studying these models&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.00838v4-abstract-full').style.display = 'inline'; document.getElementById('2402.00838v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.00838v4-abstract-full" style="display: none;"> Language models (LMs) have become ubiquitous in both NLP research and in commercial product offerings. As their commercial importance has surged, the most powerful models have become closed off, gated behind proprietary interfaces, with important details of their training data, architectures, and development undisclosed. Given the importance of these details in scientifically studying these models, including their biases and potential risks, we believe it is essential for the research community to have access to powerful, truly open LMs. To this end, we have built OLMo, a competitive, truly Open Language Model, to enable the scientific study of language models. Unlike most prior efforts that have only released model weights and inference code, we release OLMo alongside open training data and training and evaluation code. We hope this release will empower the open research community and inspire a new wave of innovation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.00838v4-abstract-full').style.display = 'none'; document.getElementById('2402.00838v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. 

arXiv:2402.00159 (https://arxiv.org/abs/2402.00159) [cs.CL]
Dolma: an Open Corpus of Three Trillion Tokens for Language Model Pretraining Research
Authors: Luca Soldaini, Rodney Kinney, Akshita Bhagia, Dustin Schwenk, David Atkinson, Russell Authur, Ben Bogin, Khyathi Chandu, Jennifer Dumas, Yanai Elazar, Valentin Hofmann, Ananya Harsh Jha, Sachin Kumar, Li Lucy, Xinxi Lyu, Nathan Lambert, Ian Magnusson, Jacob Morrison, Niklas Muennighoff, Aakanksha Naik, Crystal Nam, Matthew E. Peters, Abhilasha Ravichander, Kyle Richardson, Zejiang Shen, et al. (11 additional authors not shown)
Abstract: Information about the pretraining corpora used to train the current best-performing language models is seldom discussed: commercial models rarely detail their data, and even open models are often released without accompanying training data or recipes to reproduce them. As a result, it is challenging to conduct and advance scientific research on language modeling, such as understanding how training data impacts model capabilities and limitations. To facilitate scientific research on language model pretraining, we curate and release Dolma, a three-trillion-token English corpus, built from a diverse mixture of web content, scientific papers, code, public-domain books, social media, and encyclopedic materials. We extensively document Dolma, including its design principles, details about its construction, and a summary of its contents. We present analyses and experimental results on intermediate states of Dolma to share what we have learned about important data curation practices. Finally, we open-source our data curation toolkit to enable reproduction of our work and to support further research in large-scale data curation.
Submitted 6 June, 2024; v1 submitted 31 January, 2024; originally announced February 2024.
Comments: Accepted at ACL 2024; Dataset: https://hf.co/datasets/allenai/dolma; Code: https://github.com/allenai/dolma

arXiv:2401.17377 (https://arxiv.org/abs/2401.17377) [cs.CL, cs.AI, cs.IR]
Infini-gram: Scaling Unbounded n-gram Language Models to a Trillion Tokens
Authors: Jiacheng Liu, Sewon Min, Luke Zettlemoyer, Yejin Choi, Hannaneh Hajishirzi
Abstract: Are $n$-gram language models still relevant in this era of neural large language models (LLMs)? Our answer is yes, and we showcase their value in both text analysis and improving neural LLMs. We do this by modernizing $n$-gram LMs in two ways. First, we train them at the same data scale as neural LLMs -- 5 trillion tokens -- the largest $n$-gram LM ever built. Second, existing $n$-gram LMs use small $n$, which hinders their performance; we instead allow $n$ to be arbitrarily large by introducing a new $\infty$-gram LM with backoff. Instead of pre-computing $n$-gram count tables (which would be very expensive), we develop an engine named infini-gram -- powered by suffix arrays -- that can compute $\infty$-gram (as well as $n$-gram with arbitrary $n$) probabilities with millisecond-level latency. The $\infty$-gram framework and the infini-gram engine enable many novel and interesting analyses of human-written and machine-generated text: we find that the $\infty$-gram LM has fairly high accuracy for next-token prediction (47%) and can complement neural LLMs to greatly reduce their perplexity. When analyzing machine-generated text, we also observe irregularities in the machine--$\infty$-gram agreement level with respect to the suffix length, which indicates deficiencies in neural LLM pretraining and in the positional embeddings of Transformers.
Submitted 4 April, 2024; v1 submitted 30 January, 2024; originally announced January 2024.
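A toy sketch of the suffix-array mechanism the abstract describes, under our own simplifying assumptions (whitespace tokens, a nine-token corpus, Python 3.10+ for `bisect`'s `key` argument); the real engine works at trillion-token scale:

```python
from bisect import bisect_left, bisect_right

corpus = "the cat sat on the mat the cat ate".split()
sa = sorted(range(len(corpus)), key=lambda i: corpus[i:])  # suffix array

def count(ngram: list[str]) -> int:
    """Occurrences of `ngram`, via binary search over sorted suffixes."""
    key = lambda i: corpus[i:i + len(ngram)]
    return bisect_right(sa, ngram, key=key) - bisect_left(sa, ngram, key=key)

def infty_gram(context: list[str], token: str) -> float:
    """P(token | longest suffix of context that occurs in the corpus)."""
    for start in range(len(context)):              # try longest suffix first
        denom = count(context[start:])
        if denom:
            return count(context[start:] + [token]) / denom
    return 0.0

# "ate the" never occurs, so the model backs off to "the": P(cat|the) = 2/3.
print(infty_gram("ate the".split(), "cat"))
```

The backoff mirrors the $\infty$-gram definition: use the longest context suffix that occurs in the corpus, then estimate the next-token probability from its continuation counts.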

arXiv:2401.10440 (https://arxiv.org/abs/2401.10440) [cs.CL]
Breaking the Curse of Multilinguality with Cross-lingual Expert Language Models
Authors: Terra Blevins, Tomasz Limisiewicz, Suchin Gururangan, Margaret Li, Hila Gonen, Noah A. Smith, Luke Zettlemoyer
Abstract: Despite their popularity in non-English NLP, multilingual language models often underperform monolingual ones due to inter-language competition for model parameters. We propose Cross-lingual Expert Language Models (X-ELM), which mitigate this competition by independently training language models on subsets of the multilingual corpus. This process specializes the X-ELMs to different languages while keeping them effective as a multilingual ensemble. Our experiments show that, given the same compute budget, X-ELM outperforms jointly trained multilingual models across all considered languages and that these gains transfer to downstream tasks. X-ELM offers benefits beyond these performance improvements: new experts can be added iteratively, adapting X-ELM to new languages without catastrophic forgetting, and training is asynchronous, reducing the hardware requirements for multilingual training and democratizing multilingual modeling.
Submitted 8 October, 2024; v1 submitted 18 January, 2024; originally announced January 2024.
Comments: EMNLP 2024
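One way independently trained experts might be combined at inference time, sketched under the assumption of simple uniform averaging of next-token distributions (the paper's actual ensembling recipe may differ, and the distributions are invented):

```python
# Toy uniform mixture of per-expert next-token distributions.
def ensemble(expert_dists: list[dict[str, float]]) -> dict[str, float]:
    vocab = {tok for dist in expert_dists for tok in dist}
    k = len(expert_dists)
    return {tok: sum(d.get(tok, 0.0) for d in expert_dists) / k
            for tok in vocab}

en_expert = {"dog": 0.7, "perro": 0.1, "chien": 0.2}  # invented distributions
es_expert = {"dog": 0.1, "perro": 0.8, "chien": 0.1}
print(ensemble([en_expert, es_expert]))
```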

arXiv:2312.05180 (https://arxiv.org/abs/2312.05180) [cs.CL]
PathFinder: Guided Search over Multi-Step Reasoning Paths
Authors: Olga Golovneva, Sean O'Brien, Ramakanth Pasunuru, Tianlu Wang, Luke Zettlemoyer, Maryam Fazel-Zarandi, Asli Celikyilmaz
Abstract: With recent advancements in large language models, methods like chain-of-thought prompting to elicit reasoning chains have been shown to improve results on reasoning tasks. However, tasks that require multiple steps of reasoning still pose significant challenges to state-of-the-art models. Drawing inspiration from the beam search algorithm, we propose PathFinder, a tree-search-based reasoning path generation approach. It enhances diverse branching and multi-hop reasoning through the integration of dynamic decoding, enabled by varying sampling methods and parameters. Using constrained reasoning, PathFinder integrates novel quality constraints, pruning, and exploration methods to enhance the efficiency and quality of generation. Moreover, it includes scoring and ranking features to improve candidate selection. Our approach outperforms competitive baselines on three complex arithmetic and commonsense reasoning tasks by 6% on average. Our model generalizes well to longer, unseen reasoning chains, reflecting similar complexities to beam search with large branching factors.
Submitted 12 December, 2023; v1 submitted 8 December, 2023; originally announced December 2023.
Comments: NeurIPS 2023 R0-FoMo Workshop
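A hedged sketch of tree-search-based reasoning path generation in the spirit described above; `propose_step` and `score` are hypothetical stand-ins for an LM sampler with varied temperatures and a path-quality scorer:

```python
import random

def propose_step(path: list[str], temperature: float) -> str:
    return f"step{len(path)}@T{temperature}"   # stand-in for LM sampling

def score(path: list[str]) -> float:
    return random.random()                     # stand-in for a quality scorer

def tree_search(depth: int = 3, branch: int = 3, keep: int = 2) -> list[str]:
    temps = (0.4, 0.7, 1.0)                    # varied sampling parameters
    paths = [[]]
    for _ in range(depth):
        candidates = [p + [propose_step(p, random.choice(temps))]
                      for p in paths for _ in range(branch)]
        paths = sorted(candidates, key=score, reverse=True)[:keep]  # prune
    return max(paths, key=score)               # rank leaves, pick the best

random.seed(0)
print(tree_search())
```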

arXiv:2310.16789 (https://arxiv.org/abs/2310.16789) [cs.CL, cs.CR, cs.LG]
Detecting Pretraining Data from Large Language Models
Authors: Weijia Shi, Anirudh Ajith, Mengzhou Xia, Yangsibo Huang, Daogao Liu, Terra Blevins, Danqi Chen, Luke Zettlemoyer
Abstract: Although large language models (LLMs) are widely deployed, the data used to train them is rarely disclosed. Given the incredible scale of this data, up to trillions of tokens, it is all but certain that it includes potentially problematic text such as copyrighted materials, personally identifiable information, and test data for widely reported reference benchmarks. However, we currently have no way to know which data of these types is included or in what proportions. In this paper, we study the pretraining data detection problem: given a piece of text and black-box access to an LLM, without knowing its pretraining data, can we determine whether the model was trained on the provided text? To facilitate this study, we introduce a dynamic benchmark, WIKIMIA, that uses data created before and after model training to support gold-truth detection. We also introduce a new detection method, Min-K% Prob, based on a simple hypothesis: an unseen example is likely to contain a few outlier words with low probabilities under the LLM, while a seen example is less likely to have words with such low probabilities. Min-K% Prob can be applied without any knowledge of the pretraining corpus or any additional training, departing from previous detection methods that require training a reference model on data similar to the pretraining data. Moreover, our experiments demonstrate that Min-K% Prob achieves a 7.4% improvement on WIKIMIA over these previous methods. We apply Min-K% Prob to three real-world scenarios (copyrighted book detection, contaminated downstream example detection, and privacy auditing of machine unlearning) and find it a consistently effective solution.
Submitted 9 March, 2024; v1 submitted 25 October, 2023; originally announced October 2023.
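The Min-K% Prob statistic is simple enough to sketch directly from the hypothesis stated in the abstract; the token log-probabilities below are invented toy numbers:

```python
# Min-K% Prob: average the k% lowest token log-probabilities of the text.
def min_k_prob(token_logprobs: list[float], k: float = 0.2) -> float:
    n = max(1, int(len(token_logprobs) * k))
    worst = sorted(token_logprobs)[:n]
    return sum(worst) / n

seen   = [-0.4, -0.9, -1.1, -0.7, -1.3, -0.6, -0.8, -1.0, -0.5, -0.9]
unseen = [-0.4, -0.9, -7.2, -0.7, -6.5, -0.6, -0.8, -1.0, -0.5, -0.9]
print(min_k_prob(seen), min_k_prob(unseen))   # unseen text scores far lower
```

A text whose Min-K% score falls above a tuned threshold is flagged as likely seen during pretraining.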

arXiv:2310.11564 (https://arxiv.org/abs/2310.11564) [cs.CL]
Personalized Soups: Personalized Large Language Model Alignment via Post-hoc Parameter Merging
Authors: Joel Jang, Seungone Kim, Bill Yuchen Lin, Yizhong Wang, Jack Hessel, Luke Zettlemoyer, Hannaneh Hajishirzi, Yejin Choi, Prithviraj Ammanabrolu
Abstract: While Reinforcement Learning from Human Feedback (RLHF) aligns Large Language Models (LLMs) with general, aggregate human preferences, it is suboptimal for learning diverse, individual perspectives. In this work, we study the Reinforcement Learning from Personalized Human Feedback (RLPHF) problem, wherein LLMs are aligned to multiple (sometimes conflicting) preferences by modeling alignment as a Multi-Objective Reinforcement Learning (MORL) problem. Compared to strong single-objective baselines, we show that we can achieve personalized alignment by decomposing preferences into multiple dimensions, defined according to the personalizations a user declares as desirable. We show that these dimensions can be trained independently and efficiently in a distributed manner, and then combined effectively post hoc through parameter merging. The code is available at https://github.com/joeljang/RLPHF.
Submitted 17 October, 2023; originally announced October 2023.
Comments: Preprint
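Post-hoc parameter merging can be sketched as a weighted average of expert parameters; this toy version uses flat dictionaries in place of real model state dicts, and the weighting scheme is our assumption:

```python
# Weighted average of per-preference expert "parameters" (toy flat dicts).
def merge(experts: list[dict[str, float]],
          weights: list[float]) -> dict[str, float]:
    total = sum(weights)
    return {name: sum(w * e[name] for w, e in zip(weights, experts)) / total
            for name in experts[0]}

helpful = {"w1": 0.2, "w2": -1.0}   # invented parameters of two experts
concise = {"w1": 1.4, "w2": 0.3}
print(merge([helpful, concise], weights=[0.7, 0.3]))  # a user-specific blend
```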

arXiv:2310.10638 (https://arxiv.org/abs/2310.10638) [cs.CL, cs.AI, cs.LG]
In-context Pretraining: Language Modeling Beyond Document Boundaries
Authors: Weijia Shi, Sewon Min, Maria Lomeli, Chunting Zhou, Margaret Li, Gergely Szilvasy, Rich James, Xi Victoria Lin, Noah A. Smith, Luke Zettlemoyer, Scott Yih, Mike Lewis
Abstract: Large language models (LMs) are currently trained to predict tokens given document prefixes, enabling them to directly perform long-form generation and prompting-style tasks that can be reduced to document completion. Existing pretraining pipelines train LMs by concatenating random sets of short documents to create input contexts, but the prior documents provide no signal for predicting the next document. We instead present In-Context Pretraining, a new approach in which language models are pretrained on sequences of related documents, thereby explicitly encouraging them to read and reason across document boundaries. In-Context Pretraining requires only changing the document ordering so that each context contains related documents, and it applies directly to existing pretraining pipelines. However, this document-sorting problem is challenging: there are billions of documents, and we would like the sort to maximize contextual similarity for every document without repeating any data. To do this, we introduce approximate algorithms for finding related documents with efficient nearest-neighbor search and for constructing coherent input contexts with a graph traversal algorithm. Our experiments show that In-Context Pretraining offers a simple and scalable approach to significantly enhance LMs' performance: we see notable improvements on tasks that require more complex contextual reasoning, including in-context learning (+8%), reading comprehension (+15%), faithfulness to previous contexts (+16%), long-context reasoning (+5%), and retrieval augmentation (+9%).
Submitted 24 June, 2024; v1 submitted 16 October, 2023; originally announced October 2023.
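The document-ordering step lends itself to a small sketch: a greedy traversal that always hops to the most similar unvisited document, so related documents become adjacent within a context. The similarity matrix is toy data, and the paper's approximate, billion-document algorithms are far more involved:

```python
# Greedy nearest-neighbor traversal over pairwise document similarities.
def order_documents(sim: list[list[float]]) -> list[int]:
    n, current = len(sim), 0
    visited, order = {0}, [0]
    while len(order) < n:
        current = max((j for j in range(n) if j not in visited),
                      key=lambda j: sim[current][j])
        visited.add(current)
        order.append(current)
    return order

sim = [[1.0, 0.9, 0.1, 0.2],   # invented pairwise document similarities
       [0.9, 1.0, 0.3, 0.1],
       [0.1, 0.3, 1.0, 0.8],
       [0.2, 0.1, 0.8, 1.0]]
print(order_documents(sim))    # [0, 1, 2, 3]: related docs end up adjacent
```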
id="2310.01352v4-abstract-short" style="display: inline;"> Retrieval-augmented language models (RALMs) improve performance by accessing long-tail and up-to-date knowledge from external data stores, but are challenging to build. Existing approaches require either expensive retrieval-specific modifications to LM pre-training or use post-hoc integration of the data store that leads to suboptimal performance. We introduce Retrieval-Augmented Dual Instruction&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.01352v4-abstract-full').style.display = 'inline'; document.getElementById('2310.01352v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.01352v4-abstract-full" style="display: none;"> Retrieval-augmented language models (RALMs) improve performance by accessing long-tail and up-to-date knowledge from external data stores, but are challenging to build. Existing approaches require either expensive retrieval-specific modifications to LM pre-training or use post-hoc integration of the data store that leads to suboptimal performance. We introduce Retrieval-Augmented Dual Instruction Tuning (RA-DIT), a lightweight fine-tuning methodology that provides a third option by retrofitting any LLM with retrieval capabilities. Our approach operates in two distinct fine-tuning steps: (1) one updates a pre-trained LM to better use retrieved information, while (2) the other updates the retriever to return more relevant results, as preferred by the LM. By fine-tuning over tasks that require both knowledge utilization and contextual awareness, we demonstrate that each stage yields significant performance improvements, and using both leads to additional gains. Our best model, RA-DIT 65B, achieves state-of-the-art performance across a range of knowledge-intensive zero- and few-shot learning benchmarks, significantly outperforming existing in-context RALM approaches by up to +8.9% in 0-shot setting and +1.4% in 5-shot setting on average. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.01352v4-abstract-full').style.display = 'none'; document.getElementById('2310.01352v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. 
Comments: v4: ICLR 2024 camera-ready version

arXiv:2309.16671 (https://arxiv.org/abs/2309.16671) [cs.CV, cs.CL]
Demystifying CLIP Data
Authors: Hu Xu, Saining Xie, Xiaoqing Ellen Tan, Po-Yao Huang, Russell Howes, Vasu Sharma, Shang-Wen Li, Gargi Ghosh, Luke Zettlemoyer, Christoph Feichtenhofer
Abstract: Contrastive Language-Image Pre-training (CLIP) is an approach that has advanced research and applications in computer vision, fueling modern recognition systems and generative models. We believe that the main ingredient in the success of CLIP is its data, not the model architecture or pre-training objective. However, CLIP provides only very limited information about its data and how it was collected, leading to works that aim to reproduce CLIP's data by filtering with its model parameters. In this work, we reveal CLIP's data curation approach and, in our pursuit of making it open to the community, introduce Metadata-Curated Language-Image Pre-training (MetaCLIP). MetaCLIP takes a raw data pool and metadata (derived from CLIP's concepts) and yields a balanced subset over the metadata distribution. Our experimental study rigorously isolates the model and training settings, concentrating solely on the data. MetaCLIP applied to CommonCrawl with 400M image-text pairs outperforms CLIP's data on multiple standard benchmarks. In zero-shot ImageNet classification, MetaCLIP achieves 70.8% accuracy, surpassing CLIP's 68.3% on ViT-B models. Scaling to 1B pairs, while maintaining the same training budget, attains 72.4%. Our observations hold across various model sizes, exemplified by ViT-H achieving 80.5%, without any bells and whistles. Curation code and the training data distribution over metadata are made available at https://github.com/facebookresearch/MetaCLIP.
Submitted 7 April, 2024; v1 submitted 28 September, 2023; originally announced September 2023.
Comments: 17 pages. arXiv admin note: text overlap with arXiv:2103.00020 by other authors
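The balancing idea can be sketched as sub-sampling image-text pairs whose matched metadata entry is over-represented; the cap `t` and the entry counts below are illustrative assumptions, not the released configuration:

```python
import random

# Keep a pair with probability min(1, t / count(entry)), flattening the head
# of the metadata distribution while preserving the tail in full.
def keep_probability(entry_count: int, t: int = 20_000) -> float:
    return min(1.0, t / entry_count)

random.seed(0)
counts = {"photo": 400_000, "dog": 50_000, "astrolabe": 900}
kept = {entry: sum(random.random() < keep_probability(c) for _ in range(1000))
        for entry, c in counts.items()}   # survivors per 1000 candidate pairs
print(kept)   # head entries shrink sharply; tail entries are kept in full
```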
href="/search/cs?searchtype=author&amp;query=Tamoyan%2C+H">Hovhannes Tamoyan</a>, <a href="/search/cs?searchtype=author&amp;query=Ashual%2C+O">Oron Ashual</a>, <a href="/search/cs?searchtype=author&amp;query=Singer%2C+U">Uriel Singer</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+S">Shang-Wen Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+S">Susan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=James%2C+R">Richard James</a>, <a href="/search/cs?searchtype=author&amp;query=Ghosh%2C+G">Gargi Ghosh</a>, <a href="/search/cs?searchtype=author&amp;query=Taigman%2C+Y">Yaniv Taigman</a>, <a href="/search/cs?searchtype=author&amp;query=Fazel-Zarandi%2C+M">Maryam Fazel-Zarandi</a>, <a href="/search/cs?searchtype=author&amp;query=Celikyilmaz%2C+A">Asli Celikyilmaz</a> , et al. (2 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.02591v1-abstract-short" style="display: inline;"> We present CM3Leon (pronounced &#34;Chameleon&#34;), a retrieval-augmented, token-based, decoder-only multi-modal language model capable of generating and infilling both text and images. CM3Leon uses the CM3 multi-modal architecture but additionally shows the extreme benefits of scaling up and tuning on more diverse instruction-style data. It is the first multi-modal model trained with a recipe adapted fr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.02591v1-abstract-full').style.display = 'inline'; document.getElementById('2309.02591v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.02591v1-abstract-full" style="display: none;"> We present CM3Leon (pronounced &#34;Chameleon&#34;), a retrieval-augmented, token-based, decoder-only multi-modal language model capable of generating and infilling both text and images. CM3Leon uses the CM3 multi-modal architecture but additionally shows the extreme benefits of scaling up and tuning on more diverse instruction-style data. It is the first multi-modal model trained with a recipe adapted from text-only language models, including a large-scale retrieval-augmented pre-training stage and a second multi-task supervised fine-tuning (SFT) stage. It is also a general-purpose model that can do both text-to-image and image-to-text generation, allowing us to introduce self-contained contrastive decoding methods that produce high-quality outputs. Extensive experiments demonstrate that this recipe is highly effective for multi-modal models. CM3Leon achieves state-of-the-art performance in text-to-image generation with 5x less training compute than comparable methods (zero-shot MS-COCO FID of 4.88). After SFT, CM3Leon can also demonstrate unprecedented levels of controllability in tasks ranging from language-guided image editing to image-controlled generation and segmentation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.02591v1-abstract-full').style.display = 'none'; document.getElementById('2309.02591v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. 
arXiv:2308.16884 [pdf, other] cs.CL cs.AI cs.LG
DOI: 10.18653/v1/2024.acl-long.44
The Belebele Benchmark: a Parallel Reading Comprehension Dataset in 122 Language Variants
Authors: Lucas Bandarkar, Davis Liang, Benjamin Muller, Mikel Artetxe, Satya Narayan Shukla, Donald Husa, Naman Goyal, Abhinandan Krishnan, Luke Zettlemoyer, Madian Khabsa
Abstract: We present Belebele, a multiple-choice machine reading comprehension (MRC) dataset spanning 122 language variants. Significantly expanding the language coverage of natural language understanding (NLU) benchmarks, this dataset enables the evaluation of text models in high-, medium-, and low-resource languages. Each question is based on a short passage from the Flores-200 dataset and has four multiple-choice answers. The questions were carefully curated to discriminate between models with different levels of general language comprehension. The English dataset on its own proves difficult enough to challenge state-of-the-art language models. Being fully parallel, this dataset enables direct comparison of model performance across all languages. We use this dataset to evaluate the capabilities of multilingual masked language models (MLMs) and large language models (LLMs). We present extensive results and find that despite significant cross-lingual transfer in English-centric LLMs, much smaller MLMs pretrained on balanced multilingual data still understand far more languages. We also observe that larger vocabulary size and conscious vocabulary construction correlate with better performance on low-resource languages. Overall, Belebele opens up new avenues for evaluating and analyzing the multilingual capabilities of NLP systems.
Submitted 25 July, 2024; v1 submitted 31 August, 2023; originally announced August 2023.
Comments: ACL 2024
ACM Class: I.2.7
Journal ref: Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics, 749-775, 2024

arXiv:2308.16871 [pdf, other] cs.CL cs.AI
The Gender-GAP Pipeline: A Gender-Aware Polyglot Pipeline for Gender Characterisation in 55 Languages
Authors: Benjamin Muller, Belen Alastruey, Prangthip Hansanti, Elahe Kalbassi, Christophe Ropers, Eric Michael Smith, Adina Williams, Luke Zettlemoyer, Pierre Andrews, Marta R. Costa-jussà
Abstract: Gender biases in language generation systems are challenging to mitigate. One possible source for these biases is gender representation disparities in the training and evaluation data. Despite recent progress in documenting this problem and many attempts at mitigating it, we still lack shared methodology and tooling to report gender representation in large datasets. Such quantitative reporting will enable further mitigation, e.g., via data augmentation. This paper describes the Gender-GAP Pipeline (for Gender-Aware Polyglot Pipeline), an automatic pipeline to characterize gender representation in large-scale datasets for 55 languages. The pipeline uses a multilingual lexicon of gendered person-nouns to quantify the gender representation in text. We showcase it to report gender representation in WMT training data and development data for the News task, confirming that current data is skewed towards masculine representation. Unbalanced datasets may indirectly optimize our systems to perform better for one gender than for others. We suggest introducing our gender quantification pipeline in current datasets and, ideally, modifying them toward a balanced representation.
Submitted 31 August, 2023; originally announced August 2023.
Comments: 15 pages
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.06259">arXiv:2308.06259</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2308.06259">pdf</a>, <a href="https://arxiv.org/format/2308.06259">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Self-Alignment with Instruction Backtranslation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xian Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+P">Ping Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+C">Chunting Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Schick%2C+T">Timo Schick</a>, <a href="/search/cs?searchtype=author&amp;query=Levy%2C+O">Omer Levy</a>, <a href="/search/cs?searchtype=author&amp;query=Zettlemoyer%2C+L">Luke Zettlemoyer</a>, <a href="/search/cs?searchtype=author&amp;query=Weston%2C+J">Jason Weston</a>, <a href="/search/cs?searchtype=author&amp;query=Lewis%2C+M">Mike Lewis</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.06259v3-abstract-short" style="display: inline;"> We present a scalable method to build a high quality instruction following language model by automatically labelling human-written text with corresponding instructions. Our approach, named instruction backtranslation, starts with a language model finetuned on a small amount of seed data, and a given web corpus. The seed model is used to construct training examples by generating instruction prompts&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.06259v3-abstract-full').style.display = 'inline'; document.getElementById('2308.06259v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.06259v3-abstract-full" style="display: none;"> We present a scalable method to build a high quality instruction following language model by automatically labelling human-written text with corresponding instructions. Our approach, named instruction backtranslation, starts with a language model finetuned on a small amount of seed data, and a given web corpus. The seed model is used to construct training examples by generating instruction prompts for web documents (self-augmentation), and then selecting high quality examples from among these candidates (self-curation). This data is then used to finetune a stronger model. Finetuning LLaMa on two iterations of our approach yields a model that outperforms all other LLaMa-based models on the Alpaca leaderboard not relying on distillation data, demonstrating highly effective self-alignment. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.06259v3-abstract-full').style.display = 'none'; document.getElementById('2308.06259v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICLR2024 camera ready</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.04592">arXiv:2308.04592</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2308.04592">pdf</a>, <a href="https://arxiv.org/format/2308.04592">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Shepherd: A Critic for Language Model Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+T">Tianlu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+P">Ping Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Tan%2C+X+E">Xiaoqing Ellen Tan</a>, <a href="/search/cs?searchtype=author&amp;query=O%27Brien%2C+S">Sean O&#39;Brien</a>, <a href="/search/cs?searchtype=author&amp;query=Pasunuru%2C+R">Ramakanth Pasunuru</a>, <a href="/search/cs?searchtype=author&amp;query=Dwivedi-Yu%2C+J">Jane Dwivedi-Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Golovneva%2C+O">Olga Golovneva</a>, <a href="/search/cs?searchtype=author&amp;query=Zettlemoyer%2C+L">Luke Zettlemoyer</a>, <a href="/search/cs?searchtype=author&amp;query=Fazel-Zarandi%2C+M">Maryam Fazel-Zarandi</a>, <a href="/search/cs?searchtype=author&amp;query=Celikyilmaz%2C+A">Asli Celikyilmaz</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.04592v1-abstract-short" style="display: inline;"> As large language models improve, there is increasing interest in techniques that leverage these models&#39; capabilities to refine their own outputs. In this work, we introduce Shepherd, a language model specifically tuned to critique responses and suggest refinements, extending beyond the capabilities of an untuned model to identify diverse errors and provide suggestions to remedy them. At the core&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.04592v1-abstract-full').style.display = 'inline'; document.getElementById('2308.04592v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.04592v1-abstract-full" style="display: none;"> As large language models improve, there is increasing interest in techniques that leverage these models&#39; capabilities to refine their own outputs. 
arXiv:2308.04592 [pdf, other] cs.CL cs.AI
Shepherd: A Critic for Language Model Generation
Authors: Tianlu Wang, Ping Yu, Xiaoqing Ellen Tan, Sean O'Brien, Ramakanth Pasunuru, Jane Dwivedi-Yu, Olga Golovneva, Luke Zettlemoyer, Maryam Fazel-Zarandi, Asli Celikyilmaz
Abstract: As large language models improve, there is increasing interest in techniques that leverage these models' capabilities to refine their own outputs. In this work, we introduce Shepherd, a language model specifically tuned to critique responses and suggest refinements, extending beyond the capabilities of an untuned model to identify diverse errors and provide suggestions to remedy them. At the core of our approach is a high quality feedback dataset, which we curate from community feedback and human annotations. Even though Shepherd is small (7B parameters), its critiques are either equivalent to or preferred over those from established models including ChatGPT. Using GPT-4 for evaluation, Shepherd reaches an average win-rate of 53-87% compared to competitive alternatives. In human evaluation, Shepherd strictly outperforms other models and on average closely ties with ChatGPT.
Submitted 8 August, 2023; originally announced August 2023.
Comments: 7 figures, 7 tables

arXiv:2308.04430 [pdf, other] cs.CL cs.AI cs.LG
SILO Language Models: Isolating Legal Risk In a Nonparametric Datastore
Authors: Sewon Min, Suchin Gururangan, Eric Wallace, Weijia Shi, Hannaneh Hajishirzi, Noah A. Smith, Luke Zettlemoyer
Abstract: The legality of training language models (LMs) on copyrighted or otherwise restricted data is under intense debate. However, as we show, model performance significantly degrades if trained only on low-risk text (e.g., out-of-copyright books or government documents), due to its limited size and domain coverage. We present SILO, a new language model that manages this risk-performance tradeoff during inference. SILO is built by (1) training a parametric LM on Open License Corpus (OLC), a new corpus we curate with 228B tokens of public domain and permissively licensed text and (2) augmenting it with a more general and easily modifiable nonparametric datastore (e.g., containing copyrighted books or news) that is only queried during inference. The datastore allows use of high-risk data without training on it, supports sentence-level data attribution, and enables data producers to opt out from the model by removing content from the store. These capabilities can foster compliance with data-use regulations such as the fair use doctrine in the United States and the GDPR in the European Union. Our experiments show that the parametric LM struggles on domains not covered by OLC. However, access to the datastore greatly improves out-of-domain performance, closing 90% of the performance gap with an LM trained on the Pile, a more diverse corpus with mostly high-risk text. We also analyze which nonparametric approach works best, where the remaining errors lie, and how performance scales with datastore size. Our results suggest that it is possible to build high quality language models while mitigating their legal risk.
Submitted 30 July, 2024; v1 submitted 8 August, 2023; originally announced August 2023.
Comments: 29 pages; 7 figures. Published as a conference paper at ICLR 2024 (spotlight). Code, models, and data available at https://github.com/kernelmachine/silo-lm
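The abstract's key mechanic, a datastore queried only at inference, is commonly realized with a kNN-LM-style interpolation, one of the nonparametric approaches the paper compares. Below is a rough numpy sketch under assumptions: the datastore contents, distance kernel, and mixing weight `lam` are placeholders, not SILO's tuned settings.

```python
import numpy as np

def knn_next_token_probs(query_vec, keys, next_tokens, vocab_size, k=8, temp=1.0):
    """Nonparametric next-token distribution from the k nearest datastore entries.

    keys: (N, d) stored context vectors; next_tokens: (N,) token observed
    after each stored context.
    """
    dists = np.linalg.norm(keys - query_vec, axis=1)
    nearest = np.argsort(dists)[:k]
    weights = np.exp(-dists[nearest] / temp)
    probs = np.zeros(vocab_size)
    for idx, w in zip(nearest, weights):
        probs[next_tokens[idx]] += w
    return probs / probs.sum()

def interpolated_probs(lm_probs, query_vec, keys, next_tokens, lam=0.3):
    """Mix the parametric LM with the datastore; the store can be edited
    (e.g., opted-out content removed) without retraining the LM."""
    knn = knn_next_token_probs(query_vec, keys, next_tokens, len(lm_probs))
    return (1 - lam) * lm_probs + lam * knn

rng = np.random.default_rng(0)
keys, next_tokens = rng.normal(size=(100, 16)), rng.integers(0, 50, 100)
print(interpolated_probs(np.full(50, 0.02), rng.normal(size=16),
                         keys, next_tokens).sum())  # ~1.0
```

Because the high-risk text lives only in `keys`/`next_tokens`, deleting rows from the store is all it takes to honor an opt-out, which is the attribution and removal property the abstract highlights.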
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.14739">arXiv:2305.14739</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.14739">pdf</a>, <a href="https://arxiv.org/format/2305.14739">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Trusting Your Evidence: Hallucinate Less with Context-aware Decoding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Shi%2C+W">Weijia Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+X">Xiaochuang Han</a>, <a href="/search/cs?searchtype=author&amp;query=Lewis%2C+M">Mike Lewis</a>, <a href="/search/cs?searchtype=author&amp;query=Tsvetkov%2C+Y">Yulia Tsvetkov</a>, <a href="/search/cs?searchtype=author&amp;query=Zettlemoyer%2C+L">Luke Zettlemoyer</a>, <a href="/search/cs?searchtype=author&amp;query=Yih%2C+S+W">Scott Wen-tau Yih</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.14739v1-abstract-short" style="display: inline;"> Language models (LMs) often struggle to pay enough attention to the input context, and generate texts that are unfaithful or contain hallucinations. To mitigate this issue, we present context-aware decoding (CAD), which follows a contrastive output distribution that amplifies the difference between the output probabilities when a model is used with and without context. Our experiments show that CA&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.14739v1-abstract-full').style.display = 'inline'; document.getElementById('2305.14739v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.14739v1-abstract-full" style="display: none;"> Language models (LMs) often struggle to pay enough attention to the input context, and generate texts that are unfaithful or contain hallucinations. To mitigate this issue, we present context-aware decoding (CAD), which follows a contrastive output distribution that amplifies the difference between the output probabilities when a model is used with and without context. Our experiments show that CAD, without additional training, significantly improves the faithfulness of different LM families, including OPT, GPT, LLaMA and FLAN-T5 for summarization tasks (e.g., 14.3% gain for LLaMA in factuality metrics). Furthermore, CAD is particularly effective in overriding a model&#39;s prior knowledge when it contradicts the provided context, leading to substantial improvements in tasks where resolving the knowledge conflict is essential. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.14739v1-abstract-full').style.display = 'none'; document.getElementById('2305.14739v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.14628">arXiv:2305.14628</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.14628">pdf</a>, <a href="https://arxiv.org/format/2305.14628">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Getting MoRE out of Mixture of Language Model Reasoning Experts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Si%2C+C">Chenglei Si</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+W">Weijia Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+C">Chen Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Zettlemoyer%2C+L">Luke Zettlemoyer</a>, <a href="/search/cs?searchtype=author&amp;query=Boyd-Graber%2C+J">Jordan Boyd-Graber</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.14628v2-abstract-short" style="display: inline;"> While recent large language models (LLMs) improve on various question answering (QA) datasets, it remains difficult for a single model to generalize across question types that require distinct reasoning abilities. We provide empirical evidence that state-of-the-art LLMs suffer from poor generalizability on reasoning types beyond those seen in the prompt. To remedy this, we propose a Mixture-of-Rea&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.14628v2-abstract-full').style.display = 'inline'; document.getElementById('2305.14628v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.14628v2-abstract-full" style="display: none;"> While recent large language models (LLMs) improve on various question answering (QA) datasets, it remains difficult for a single model to generalize across question types that require distinct reasoning abilities. We provide empirical evidence that state-of-the-art LLMs suffer from poor generalizability on reasoning types beyond those seen in the prompt. To remedy this, we propose a Mixture-of-Reasoning-Experts (MoRE) framework that ensembles diverse specialized language models. We specialize the backbone language model with prompts optimized for different reasoning categories, including factual, multihop, mathematical, and commonsense reasoning. Our key insight is to leverage agreement among the specialized experts to select the best answer for each question, or to abstain from answering. This gives MoRE higher accuracy than any single specialized model on a collection of 12 QA datasets from four reasoning types. 
arXiv:2305.14628 [pdf, other] cs.CL cs.AI
Getting MoRE out of Mixture of Language Model Reasoning Experts
Authors: Chenglei Si, Weijia Shi, Chen Zhao, Luke Zettlemoyer, Jordan Boyd-Graber
Abstract: While recent large language models (LLMs) improve on various question answering (QA) datasets, it remains difficult for a single model to generalize across question types that require distinct reasoning abilities. We provide empirical evidence that state-of-the-art LLMs suffer from poor generalizability on reasoning types beyond those seen in the prompt. To remedy this, we propose a Mixture-of-Reasoning-Experts (MoRE) framework that ensembles diverse specialized language models. We specialize the backbone language model with prompts optimized for different reasoning categories, including factual, multihop, mathematical, and commonsense reasoning. Our key insight is to leverage agreement among the specialized experts to select the best answer for each question, or to abstain from answering. This gives MoRE higher accuracy than any single specialized model on a collection of 12 QA datasets from four reasoning types. Beyond generalizability, the interpretable design of MoRE improves selective question answering results compared to baselines without incorporating inter-expert agreement. This framework is also more interpretable and useful to human consumers of QA outputs. Our human study confirms that presenting expert predictions and the answer selection process helps annotators more accurately calibrate when to trust the system's output. We release all code and data to facilitate future work.
Submitted 20 October, 2023; v1 submitted 23 May, 2023; originally announced May 2023.
Comments: EMNLP 2023 Findings
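The selection rule the abstract states, pick the answer the experts agree on or abstain, reduces to a vote with a threshold. A toy sketch with stand-in experts; the lambdas below replace the paper's prompt-specialized models, and `min_votes` is a placeholder for whatever agreement criterion one tunes:

```python
from collections import Counter

def more_select(question, experts, min_votes=2):
    """Return the majority answer across experts, or None to abstain."""
    answers = [expert(question) for expert in experts]
    best, votes = Counter(answers).most_common(1)[0]
    return best if votes >= min_votes else None

# Stand-in "experts"; the paper specializes one backbone LM with different
# reasoning prompts (factual, multihop, mathematical, commonsense).
experts = [lambda q: "4", lambda q: "4", lambda q: "5"]
print(more_select("What is 2 + 2?", experts))  # "4"
```

Abstention is what makes the design useful for selective QA: low agreement signals the system should not be trusted on that question.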
arXiv:2305.14314 [pdf, other] cs.LG
QLoRA: Efficient Finetuning of Quantized LLMs
Authors: Tim Dettmers, Artidoro Pagnoni, Ari Holtzman, Luke Zettlemoyer
Abstract: We present QLoRA, an efficient finetuning approach that reduces memory usage enough to finetune a 65B parameter model on a single 48GB GPU while preserving full 16-bit finetuning task performance. QLoRA backpropagates gradients through a frozen, 4-bit quantized pretrained language model into Low Rank Adapters (LoRA). Our best model family, which we name Guanaco, outperforms all previous openly released models on the Vicuna benchmark, reaching 99.3% of the performance level of ChatGPT while only requiring 24 hours of finetuning on a single GPU. QLoRA introduces a number of innovations to save memory without sacrificing performance: (a) 4-bit NormalFloat (NF4), a new data type that is information-theoretically optimal for normally distributed weights, (b) double quantization to reduce the average memory footprint by quantizing the quantization constants, and (c) paged optimizers to manage memory spikes. We use QLoRA to finetune more than 1,000 models, providing a detailed analysis of instruction following and chatbot performance across 8 instruction datasets, multiple model types (LLaMA, T5), and model scales that would be infeasible to run with regular finetuning (e.g. 33B and 65B parameter models). Our results show that QLoRA finetuning on a small high-quality dataset leads to state-of-the-art results, even when using smaller models than the previous SoTA. We provide a detailed analysis of chatbot performance based on both human and GPT-4 evaluations, showing that GPT-4 evaluations are a cheap and reasonable alternative to human evaluation. Furthermore, we find that current chatbot benchmarks are not trustworthy for accurately evaluating the performance levels of chatbots. A lemon-picked analysis demonstrates where Guanaco fails compared to ChatGPT. We release all of our models and code, including CUDA kernels for 4-bit training.
Submitted 23 May, 2023; originally announced May 2023.
Comments: Extended NeurIPS submission
arXiv:2305.14251 [pdf, other] cs.CL cs.AI cs.LG
FActScore: Fine-grained Atomic Evaluation of Factual Precision in Long Form Text Generation
Authors: Sewon Min, Kalpesh Krishna, Xinxi Lyu, Mike Lewis, Wen-tau Yih, Pang Wei Koh, Mohit Iyyer, Luke Zettlemoyer, Hannaneh Hajishirzi
Abstract: Evaluating the factuality of long-form text generated by large language models (LMs) is non-trivial because (1) generations often contain a mixture of supported and unsupported pieces of information, making binary judgments of quality inadequate, and (2) human evaluation is time-consuming and costly. In this paper, we introduce FACTSCORE, a new evaluation that breaks a generation into a series of atomic facts and computes the percentage of atomic facts supported by a reliable knowledge source. We conduct an extensive human evaluation to obtain FACTSCOREs of people biographies generated by several state-of-the-art commercial LMs -- InstructGPT, ChatGPT, and the retrieval-augmented PerplexityAI -- and report new analysis demonstrating the need for such a fine-grained score (e.g., ChatGPT only achieves 58%). Since human evaluation is costly, we also introduce an automated model that estimates FACTSCORE using retrieval and a strong language model, with less than a 2% error rate. Finally, we use this automated metric to evaluate 6,500 generations from a new set of 13 recent LMs that would have cost $26K if evaluated by humans, with various findings: GPT-4 and ChatGPT are more factual than public models, and Vicuna and Alpaca are some of the best public models. FACTSCORE is available for public use via `pip install factscore`.
Submitted 11 October, 2023; v1 submitted 23 May, 2023; originally announced May 2023.
Comments: 25 pages; 7 figures. Published as a main conference paper at EMNLP 2023. Code available at https://github.com/shmsw25/FActScore
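The metric itself is just the fraction of atomic facts supported by a knowledge source. A toy version follows; the naive substring `is_supported` check is a deliberate simplification standing in for the paper's retrieval-plus-LM judge.

```python
def is_supported(fact, knowledge_source):
    """Naive support check: substring match against a reference text."""
    return fact.lower() in knowledge_source.lower()

def factscore(atomic_facts, knowledge_source):
    """Fraction of atomic facts supported by the knowledge source."""
    supported = sum(is_supported(f, knowledge_source) for f in atomic_facts)
    return supported / len(atomic_facts)

facts = ["born in 1970", "won a nobel prize"]
print(factscore(facts, "She was born in 1970 in Lima."))  # 0.5
```

Scoring per atomic fact rather than per generation is what lets the metric credit a half-correct biography instead of forcing a binary judgment.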
arXiv:2305.14240 [pdf, other] cs.CL cs.AI cs.LG
Revisiting Machine Translation for Cross-lingual Classification
Authors: Mikel Artetxe, Vedanuj Goswami, Shruti Bhosale, Angela Fan, Luke Zettlemoyer
Abstract: Machine Translation (MT) has been widely used for cross-lingual classification, either by translating the test set into English and running inference with a monolingual model (translate-test), or translating the training set into the target languages and finetuning a multilingual model (translate-train). However, most research in the area focuses on the multilingual models rather than the MT component. We show that, by using a stronger MT system and mitigating the mismatch between training on original text and running inference on machine translated text, translate-test can do substantially better than previously assumed. The optimal approach, however, is highly task dependent, as we identify various sources of cross-lingual transfer gap that affect different tasks and approaches differently. Our work calls into question the dominance of multilingual models for cross-lingual classification, and argues for paying more attention to MT-based baselines.
Submitted 23 May, 2023; originally announced May 2023.
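The translate-test recipe the abstract contrasts with translate-train is a two-stage pipeline: machine-translate each test example into English, then run a monolingual English classifier. A skeletal sketch with assumed `mt_translate` and `classifier` interfaces (the identity stand-ins at the bottom only make the example runnable):

```python
def translate_test(examples, mt_translate, classifier, src_lang):
    """Translate each test example into English, then classify in English."""
    return [classifier(mt_translate(text, src=src_lang, tgt="en"))
            for text in examples]

# Identity stand-ins so the sketch runs end to end.
fake_mt = lambda text, src, tgt: text
fake_clf = lambda text: "positive"
print(translate_test(["hola mundo"], fake_mt, fake_clf, "es"))  # ['positive']
```

The paper's point is that the quality of `mt_translate`, and whether the classifier was also trained on machine-translated text, matters as much as the choice of multilingual model.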
arXiv:2305.11206 [pdf, other] cs.CL cs.AI cs.LG
LIMA: Less Is More for Alignment
Authors: Chunting Zhou, Pengfei Liu, Puxin Xu, Srini Iyer, Jiao Sun, Yuning Mao, Xuezhe Ma, Avia Efrat, Ping Yu, Lili Yu, Susan Zhang, Gargi Ghosh, Mike Lewis, Luke Zettlemoyer, Omer Levy
Abstract: Large language models are trained in two stages: (1) unsupervised pretraining from raw text, to learn general-purpose representations, and (2) large scale instruction tuning and reinforcement learning, to better align to end tasks and user preferences. We measure the relative importance of these two stages by training LIMA, a 65B parameter LLaMa language model fine-tuned with the standard supervised loss on only 1,000 carefully curated prompts and responses, without any reinforcement learning or human preference modeling. LIMA demonstrates remarkably strong performance, learning to follow specific response formats from only a handful of examples in the training data, including complex queries that range from planning trip itineraries to speculating about alternate history. Moreover, the model tends to generalize well to unseen tasks that did not appear in the training data. In a controlled human study, responses from LIMA are either equivalent to or strictly preferred over GPT-4 responses in 43% of cases; this statistic is as high as 58% when compared to Bard and 65% versus DaVinci003, which was trained with human feedback. Taken together, these results strongly suggest that almost all knowledge in large language models is learned during pretraining, and only limited instruction tuning data is necessary to teach models to produce high quality output.
Submitted 18 May, 2023; originally announced May 2023.

arXiv:2305.07185 [pdf, other] cs.LG
MEGABYTE: Predicting Million-byte Sequences with Multiscale Transformers
Authors: Lili Yu, Dániel Simig, Colin Flaherty, Armen Aghajanyan, Luke Zettlemoyer, Mike Lewis
Abstract: Autoregressive transformers are spectacular models for short sequences but scale poorly to long sequences such as high-resolution images, podcasts, code, or books. We propose Megabyte, a multi-scale decoder architecture that enables end-to-end differentiable modeling of sequences of over one million bytes. Megabyte segments sequences into patches and uses a local submodel within patches and a global model between patches. This enables sub-quadratic self-attention, much larger feedforward layers for the same compute, and improved parallelism during decoding -- unlocking better performance at reduced cost for both training and generation. Extensive experiments show that Megabyte allows byte-level models to perform competitively with subword models on long context language modeling, achieve state-of-the-art density estimation on ImageNet, and model audio from raw files. Together, these results establish the viability of tokenization-free autoregressive sequence modeling at scale.
Submitted 19 May, 2023; v1 submitted 11 May, 2023; originally announced May 2023.
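At a shape level, the patching scheme can be illustrated with placeholder layers: bytes are embedded, concatenated into fixed-size patches for a global model, and a local model produces per-byte logits within each patch. Untrained random projections stand in for the transformers here; only the data flow, not the architecture details, follows the description.

```python
import numpy as np

PATCH, D = 8, 64  # illustrative patch size and embedding width

def embed(bytes_seq):
    """(T,) byte ids -> (T/PATCH, PATCH*D): concat byte embeddings per patch."""
    table = np.random.randn(256, D) * 0.02
    return table[bytes_seq].reshape(-1, PATCH * D)

def global_model(patches):
    """Stand-in for the large transformer that runs across patches."""
    return patches @ np.random.randn(patches.shape[1], PATCH * D) * 0.02

def local_logits(patch_states):
    """Stand-in for the small per-patch transformer predicting each byte."""
    h = patch_states.reshape(-1, PATCH, D)
    return h @ np.random.randn(D, 256) * 0.02  # logits over 256 byte values

seq = np.frombuffer(b"hello world, ok!", dtype=np.uint8)  # 16 bytes = 2 patches
print(local_logits(global_model(embed(seq))).shape)       # (2, 8, 256)
```

Because self-attention in the global model operates over T/PATCH positions rather than T, its cost drops quadratically with the patch size, which is the source of the sub-quadratic scaling claim.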
However, they often struggle at disambiguating word sense in a zero-shot setting. To better understand this contrast, we present a new study investigating how well PLMs capture cross-lingual word sense with Contextual Word-Level Translation (C-WLT), an extension of word-level translation that prompts the model to translate a given word in context. We find that as the model size increases, PLMs encode more cross-lingual word sense knowledge and better use context to improve WLT performance. Building on C-WLT, we introduce a zero-shot approach for WSD, tested on 18 languages from the XL-WSD dataset. Our method outperforms fully supervised baselines on recall for many evaluation languages without additional training or finetuning. This study presents a first step towards understanding how to best leverage the cross-lingual knowledge inside PLMs for robust zero-shot reasoning in any language. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.13803v1-abstract-full').style.display = 'none'; document.getElementById('2304.13803v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2023. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Zettlemoyer%2C+L&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Zettlemoyer%2C+L&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zettlemoyer%2C+L&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zettlemoyer%2C+L&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zettlemoyer%2C+L&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zettlemoyer%2C+L&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 
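Editor's note: the abstract describes C-WLT only at a high level -- prompt the PLM to translate one word given its sentence context, so the model can pick the sense-appropriate target word. As a purely hypothetical illustration (the template wording, the cwlt_prompt helper, and the bigscience/bloom-560m checkpoint are assumptions, not the paper's setup), such a prompt might look like:

# Hypothetical C-WLT-style prompt: ask for the translation of one word
# *in context*. Template and checkpoint are illustrative assumptions.
from transformers import AutoModelForCausalLM, AutoTokenizer

def cwlt_prompt(sentence: str, word: str, tgt_lang: str = "French") -> str:
    # Including the full sentence lets the model condition its word choice
    # on the sense the word takes in this context.
    return (f'Sentence: "{sentence}"\n'
            f'In this sentence, the word "{word}" translates into '
            f'{tgt_lang} as: "')

tok = AutoTokenizer.from_pretrained("bigscience/bloom-560m")  # any multilingual causal LM
model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-560m")

# The financial sense of "bank" should yield a different translation
# than the riverside sense would.
inputs = tok(cwlt_prompt("He deposited cash at the bank.", "bank"),
             return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=5, do_sample=False)
print(tok.decode(out[0][inputs["input_ids"].shape[1]:]))

For zero-shot WSD, one would then map the generated translation back onto a sense inventory, e.g. by checking which candidate sense's known translations match the model's output.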
