CINXE.COM
Search | arXiv e-print repository
<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–18 of 18 results for author: <span class="mathjax">Soares, L B</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Soares%2C+L+B">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Soares, L B"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Soares%2C+L+B&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Soares, L B"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.00118">arXiv:2408.00118</a> <span> [<a href="https://arxiv.org/pdf/2408.00118">pdf</a>, <a href="https://arxiv.org/format/2408.00118">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Gemma 2: Improving Open Language Models at a Practical Size </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gemma+Team"> Gemma Team</a>, <a href="/search/cs?searchtype=author&query=Riviere%2C+M">Morgane Riviere</a>, <a href="/search/cs?searchtype=author&query=Pathak%2C+S">Shreya Pathak</a>, <a href="/search/cs?searchtype=author&query=Sessa%2C+P+G">Pier Giuseppe Sessa</a>, <a href="/search/cs?searchtype=author&query=Hardin%2C+C">Cassidy Hardin</a>, <a href="/search/cs?searchtype=author&query=Bhupatiraju%2C+S">Surya Bhupatiraju</a>, <a href="/search/cs?searchtype=author&query=Hussenot%2C+L">L茅onard Hussenot</a>, <a href="/search/cs?searchtype=author&query=Mesnard%2C+T">Thomas Mesnard</a>, <a href="/search/cs?searchtype=author&query=Shahriari%2C+B">Bobak Shahriari</a>, <a href="/search/cs?searchtype=author&query=Ram%C3%A9%2C+A">Alexandre Ram茅</a>, <a href="/search/cs?searchtype=author&query=Ferret%2C+J">Johan Ferret</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+P">Peter Liu</a>, <a href="/search/cs?searchtype=author&query=Tafti%2C+P">Pouya Tafti</a>, <a href="/search/cs?searchtype=author&query=Friesen%2C+A">Abe Friesen</a>, <a href="/search/cs?searchtype=author&query=Casbon%2C+M">Michelle Casbon</a>, <a href="/search/cs?searchtype=author&query=Ramos%2C+S">Sabela Ramos</a>, <a href="/search/cs?searchtype=author&query=Kumar%2C+R">Ravin Kumar</a>, <a href="/search/cs?searchtype=author&query=Lan%2C+C+L">Charline Le Lan</a>, <a href="/search/cs?searchtype=author&query=Jerome%2C+S">Sammy Jerome</a>, <a href="/search/cs?searchtype=author&query=Tsitsulin%2C+A">Anton Tsitsulin</a>, <a href="/search/cs?searchtype=author&query=Vieillard%2C+N">Nino Vieillard</a>, <a href="/search/cs?searchtype=author&query=Stanczyk%2C+P">Piotr Stanczyk</a>, <a href="/search/cs?searchtype=author&query=Girgin%2C+S">Sertan Girgin</a>, <a href="/search/cs?searchtype=author&query=Momchev%2C+N">Nikola Momchev</a>, <a href="/search/cs?searchtype=author&query=Hoffman%2C+M">Matt Hoffman</a> , et al. (173 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.00118v3-abstract-short" style="display: inline;"> In this work, we introduce Gemma 2, a new addition to the Gemma family of lightweight, state-of-the-art open models, ranging in scale from 2 billion to 27 billion parameters. In this new version, we apply several known technical modifications to the Transformer architecture, such as interleaving local-global attentions (Beltagy et al., 2020a) and group-query attention (Ainslie et al., 2023). We al… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.00118v3-abstract-full').style.display = 'inline'; document.getElementById('2408.00118v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.00118v3-abstract-full" style="display: none;"> In this work, we introduce Gemma 2, a new addition to the Gemma family of lightweight, state-of-the-art open models, ranging in scale from 2 billion to 27 billion parameters. In this new version, we apply several known technical modifications to the Transformer architecture, such as interleaving local-global attentions (Beltagy et al., 2020a) and group-query attention (Ainslie et al., 2023). We also train the 2B and 9B models with knowledge distillation (Hinton et al., 2015) instead of next token prediction. The resulting models deliver the best performance for their size, and even offer competitive alternatives to models that are 2-3 times bigger. We release all our models to the community. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.00118v3-abstract-full').style.display = 'none'; document.getElementById('2408.00118v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.00361">arXiv:2407.00361</a> <span> [<a href="https://arxiv.org/pdf/2407.00361">pdf</a>, <a href="https://arxiv.org/format/2407.00361">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> From RAG to RICHES: Retrieval Interlaced with Sequence Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jain%2C+P">Palak Jain</a>, <a href="/search/cs?searchtype=author&query=Soares%2C+L+B">Livio Baldini Soares</a>, <a href="/search/cs?searchtype=author&query=Kwiatkowski%2C+T">Tom Kwiatkowski</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.00361v1-abstract-short" style="display: inline;"> We present RICHES, a novel approach that interleaves retrieval with sequence generation tasks. RICHES offers an alternative to conventional RAG systems by eliminating the need for separate retriever and generator. It retrieves documents by directly decoding their contents, constrained on the corpus. Unifying retrieval with generation allows us to adapt to diverse new tasks via prompting alone. RIC… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.00361v1-abstract-full').style.display = 'inline'; document.getElementById('2407.00361v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.00361v1-abstract-full" style="display: none;"> We present RICHES, a novel approach that interleaves retrieval with sequence generation tasks. RICHES offers an alternative to conventional RAG systems by eliminating the need for separate retriever and generator. It retrieves documents by directly decoding their contents, constrained on the corpus. Unifying retrieval with generation allows us to adapt to diverse new tasks via prompting alone. RICHES can work with any Instruction-tuned model, without additional training. It provides attributed evidence, supports multi-hop retrievals and interleaves thoughts to plan on what to retrieve next, all within a single decoding pass of the LLM. We demonstrate the strong performance of RICHES across ODQA tasks including attributed and multi-hop QA. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.00361v1-abstract-full').style.display = 'none'; document.getElementById('2407.00361v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 3 figures, Preprint</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.05530">arXiv:2403.05530</a> <span> [<a href="https://arxiv.org/pdf/2403.05530">pdf</a>, <a href="https://arxiv.org/format/2403.05530">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gemini+Team"> Gemini Team</a>, <a href="/search/cs?searchtype=author&query=Georgiev%2C+P">Petko Georgiev</a>, <a href="/search/cs?searchtype=author&query=Lei%2C+V+I">Ving Ian Lei</a>, <a href="/search/cs?searchtype=author&query=Burnell%2C+R">Ryan Burnell</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+L">Libin Bai</a>, <a href="/search/cs?searchtype=author&query=Gulati%2C+A">Anmol Gulati</a>, <a href="/search/cs?searchtype=author&query=Tanzer%2C+G">Garrett Tanzer</a>, <a href="/search/cs?searchtype=author&query=Vincent%2C+D">Damien Vincent</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+Z">Zhufeng Pan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shibo Wang</a>, <a href="/search/cs?searchtype=author&query=Mariooryad%2C+S">Soroosh Mariooryad</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+Y">Yifan Ding</a>, <a href="/search/cs?searchtype=author&query=Geng%2C+X">Xinyang Geng</a>, <a href="/search/cs?searchtype=author&query=Alcober%2C+F">Fred Alcober</a>, <a href="/search/cs?searchtype=author&query=Frostig%2C+R">Roy Frostig</a>, <a href="/search/cs?searchtype=author&query=Omernick%2C+M">Mark Omernick</a>, <a href="/search/cs?searchtype=author&query=Walker%2C+L">Lexi Walker</a>, <a href="/search/cs?searchtype=author&query=Paduraru%2C+C">Cosmin Paduraru</a>, <a href="/search/cs?searchtype=author&query=Sorokin%2C+C">Christina Sorokin</a>, <a href="/search/cs?searchtype=author&query=Tacchetti%2C+A">Andrea Tacchetti</a>, <a href="/search/cs?searchtype=author&query=Gaffney%2C+C">Colin Gaffney</a>, <a href="/search/cs?searchtype=author&query=Daruki%2C+S">Samira Daruki</a>, <a href="/search/cs?searchtype=author&query=Sercinoglu%2C+O">Olcan Sercinoglu</a>, <a href="/search/cs?searchtype=author&query=Gleicher%2C+Z">Zach Gleicher</a>, <a href="/search/cs?searchtype=author&query=Love%2C+J">Juliette Love</a> , et al. (1110 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.05530v4-abstract-short" style="display: inline;"> In this report, we introduce the Gemini 1.5 family of models, representing the next generation of highly compute-efficient multimodal models capable of recalling and reasoning over fine-grained information from millions of tokens of context, including multiple long documents and hours of video and audio. The family includes two new models: (1) an updated Gemini 1.5 Pro, which exceeds the February… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.05530v4-abstract-full').style.display = 'inline'; document.getElementById('2403.05530v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.05530v4-abstract-full" style="display: none;"> In this report, we introduce the Gemini 1.5 family of models, representing the next generation of highly compute-efficient multimodal models capable of recalling and reasoning over fine-grained information from millions of tokens of context, including multiple long documents and hours of video and audio. The family includes two new models: (1) an updated Gemini 1.5 Pro, which exceeds the February version on the great majority of capabilities and benchmarks; (2) Gemini 1.5 Flash, a more lightweight variant designed for efficiency with minimal regression in quality. Gemini 1.5 models achieve near-perfect recall on long-context retrieval tasks across modalities, improve the state-of-the-art in long-document QA, long-video QA and long-context ASR, and match or surpass Gemini 1.0 Ultra's state-of-the-art performance across a broad set of benchmarks. Studying the limits of Gemini 1.5's long-context ability, we find continued improvement in next-token prediction and near-perfect retrieval (>99%) up to at least 10M tokens, a generational leap over existing models such as Claude 3.0 (200k) and GPT-4 Turbo (128k). Finally, we highlight real-world use cases, such as Gemini 1.5 collaborating with professionals on completing their tasks achieving 26 to 75% time savings across 10 different job categories, as well as surprising new capabilities of large language models at the frontier; when given a grammar manual for Kalamang, a language with fewer than 200 speakers worldwide, the model learns to translate English to Kalamang at a similar level to a person who learned from the same content. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.05530v4-abstract-full').style.display = 'none'; document.getElementById('2403.05530v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.11805">arXiv:2312.11805</a> <span> [<a href="https://arxiv.org/pdf/2312.11805">pdf</a>, <a href="https://arxiv.org/format/2312.11805">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Gemini: A Family of Highly Capable Multimodal Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gemini+Team"> Gemini Team</a>, <a href="/search/cs?searchtype=author&query=Anil%2C+R">Rohan Anil</a>, <a href="/search/cs?searchtype=author&query=Borgeaud%2C+S">Sebastian Borgeaud</a>, <a href="/search/cs?searchtype=author&query=Alayrac%2C+J">Jean-Baptiste Alayrac</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+J">Jiahui Yu</a>, <a href="/search/cs?searchtype=author&query=Soricut%2C+R">Radu Soricut</a>, <a href="/search/cs?searchtype=author&query=Schalkwyk%2C+J">Johan Schalkwyk</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+A+M">Andrew M. Dai</a>, <a href="/search/cs?searchtype=author&query=Hauth%2C+A">Anja Hauth</a>, <a href="/search/cs?searchtype=author&query=Millican%2C+K">Katie Millican</a>, <a href="/search/cs?searchtype=author&query=Silver%2C+D">David Silver</a>, <a href="/search/cs?searchtype=author&query=Johnson%2C+M">Melvin Johnson</a>, <a href="/search/cs?searchtype=author&query=Antonoglou%2C+I">Ioannis Antonoglou</a>, <a href="/search/cs?searchtype=author&query=Schrittwieser%2C+J">Julian Schrittwieser</a>, <a href="/search/cs?searchtype=author&query=Glaese%2C+A">Amelia Glaese</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jilin Chen</a>, <a href="/search/cs?searchtype=author&query=Pitler%2C+E">Emily Pitler</a>, <a href="/search/cs?searchtype=author&query=Lillicrap%2C+T">Timothy Lillicrap</a>, <a href="/search/cs?searchtype=author&query=Lazaridou%2C+A">Angeliki Lazaridou</a>, <a href="/search/cs?searchtype=author&query=Firat%2C+O">Orhan Firat</a>, <a href="/search/cs?searchtype=author&query=Molloy%2C+J">James Molloy</a>, <a href="/search/cs?searchtype=author&query=Isard%2C+M">Michael Isard</a>, <a href="/search/cs?searchtype=author&query=Barham%2C+P+R">Paul R. Barham</a>, <a href="/search/cs?searchtype=author&query=Hennigan%2C+T">Tom Hennigan</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+B">Benjamin Lee</a> , et al. (1325 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.11805v4-abstract-short" style="display: inline;"> This report introduces a new family of multimodal models, Gemini, that exhibit remarkable capabilities across image, audio, video, and text understanding. The Gemini family consists of Ultra, Pro, and Nano sizes, suitable for applications ranging from complex reasoning tasks to on-device memory-constrained use-cases. Evaluation on a broad range of benchmarks shows that our most-capable Gemini Ultr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.11805v4-abstract-full').style.display = 'inline'; document.getElementById('2312.11805v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.11805v4-abstract-full" style="display: none;"> This report introduces a new family of multimodal models, Gemini, that exhibit remarkable capabilities across image, audio, video, and text understanding. The Gemini family consists of Ultra, Pro, and Nano sizes, suitable for applications ranging from complex reasoning tasks to on-device memory-constrained use-cases. Evaluation on a broad range of benchmarks shows that our most-capable Gemini Ultra model advances the state of the art in 30 of 32 of these benchmarks - notably being the first model to achieve human-expert performance on the well-studied exam benchmark MMLU, and improving the state of the art in every one of the 20 multimodal benchmarks we examined. We believe that the new capabilities of the Gemini family in cross-modal reasoning and language understanding will enable a wide variety of use cases. We discuss our approach toward post-training and deploying Gemini models responsibly to users through services including Gemini, Gemini Advanced, Google AI Studio, and Cloud Vertex AI. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.11805v4-abstract-full').style.display = 'none'; document.getElementById('2312.11805v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.16568">arXiv:2310.16568</a> <span> [<a href="https://arxiv.org/pdf/2310.16568">pdf</a>, <a href="https://arxiv.org/format/2310.16568">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> 1-PAGER: One Pass Answer Generation and Evidence Retrieval </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jain%2C+P">Palak Jain</a>, <a href="/search/cs?searchtype=author&query=Soares%2C+L+B">Livio Baldini Soares</a>, <a href="/search/cs?searchtype=author&query=Kwiatkowski%2C+T">Tom Kwiatkowski</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.16568v1-abstract-short" style="display: inline;"> We present 1-Pager the first system that answers a question and retrieves evidence using a single Transformer-based model and decoding process. 1-Pager incrementally partitions the retrieval corpus using constrained decoding to select a document and answer string, and we show that this is competitive with comparable retrieve-and-read alternatives according to both retrieval and answer accuracy met… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.16568v1-abstract-full').style.display = 'inline'; document.getElementById('2310.16568v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.16568v1-abstract-full" style="display: none;"> We present 1-Pager the first system that answers a question and retrieves evidence using a single Transformer-based model and decoding process. 1-Pager incrementally partitions the retrieval corpus using constrained decoding to select a document and answer string, and we show that this is competitive with comparable retrieve-and-read alternatives according to both retrieval and answer accuracy metrics. 1-Pager also outperforms the equivalent closed-book question answering model, by grounding predictions in an evidence corpus. While 1-Pager is not yet on-par with more expensive systems that read many more documents before generating an answer, we argue that it provides an important step toward attributed generation by folding retrieval into the sequence-to-sequence paradigm that is currently dominant in NLP. We also show that the search paths used to partition the corpus are easy to read and understand, paving a way forward for interpretable neural retrieval. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.16568v1-abstract-full').style.display = 'none'; document.getElementById('2310.16568v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at EMNLP 2023 (Findings)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.08764">arXiv:2310.08764</a> <span> [<a href="https://arxiv.org/pdf/2310.08764">pdf</a>, <a href="https://arxiv.org/format/2310.08764">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Calibrating Likelihoods towards Consistency in Summarization Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zablotskaia%2C+P">Polina Zablotskaia</a>, <a href="/search/cs?searchtype=author&query=Khalman%2C+M">Misha Khalman</a>, <a href="/search/cs?searchtype=author&query=Joshi%2C+R">Rishabh Joshi</a>, <a href="/search/cs?searchtype=author&query=Soares%2C+L+B">Livio Baldini Soares</a>, <a href="/search/cs?searchtype=author&query=Jakobovits%2C+S">Shoshana Jakobovits</a>, <a href="/search/cs?searchtype=author&query=Maynez%2C+J">Joshua Maynez</a>, <a href="/search/cs?searchtype=author&query=Narayan%2C+S">Shashi Narayan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.08764v1-abstract-short" style="display: inline;"> Despite the recent advances in abstractive text summarization, current summarization models still suffer from generating factually inconsistent summaries, reducing their utility for real-world application. We argue that the main reason for such behavior is that the summarization models trained with maximum likelihood objective assign high probability to plausible sequences given the context, but t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.08764v1-abstract-full').style.display = 'inline'; document.getElementById('2310.08764v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.08764v1-abstract-full" style="display: none;"> Despite the recent advances in abstractive text summarization, current summarization models still suffer from generating factually inconsistent summaries, reducing their utility for real-world application. We argue that the main reason for such behavior is that the summarization models trained with maximum likelihood objective assign high probability to plausible sequences given the context, but they often do not accurately rank sequences by their consistency. In this work, we solve this problem by calibrating the likelihood of model generated sequences to better align with a consistency metric measured by natural language inference (NLI) models. The human evaluation study and automatic metrics show that the calibrated models generate more consistent and higher-quality summaries. We also show that the models trained using our method return probabilities that are better aligned with the NLI scores, which significantly increase reliability of summarization models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.08764v1-abstract-full').style.display = 'none'; document.getElementById('2310.08764v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.14499">arXiv:2305.14499</a> <span> [<a href="https://arxiv.org/pdf/2305.14499">pdf</a>, <a href="https://arxiv.org/format/2305.14499">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> NAIL: Lexical Retrieval Indices with Efficient Non-Autoregressive Decoders </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Soares%2C+L+B">Livio Baldini Soares</a>, <a href="/search/cs?searchtype=author&query=Gillick%2C+D">Daniel Gillick</a>, <a href="/search/cs?searchtype=author&query=Cole%2C+J+R">Jeremy R. Cole</a>, <a href="/search/cs?searchtype=author&query=Kwiatkowski%2C+T">Tom Kwiatkowski</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.14499v2-abstract-short" style="display: inline;"> Neural document rerankers are extremely effective in terms of accuracy. However, the best models require dedicated hardware for serving, which is costly and often not feasible. To avoid this serving-time requirement, we present a method of capturing up to 86% of the gains of a Transformer cross-attention model with a lexicalized scoring function that only requires 10-6% of the Transformer's FLOPs… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.14499v2-abstract-full').style.display = 'inline'; document.getElementById('2305.14499v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.14499v2-abstract-full" style="display: none;"> Neural document rerankers are extremely effective in terms of accuracy. However, the best models require dedicated hardware for serving, which is costly and often not feasible. To avoid this serving-time requirement, we present a method of capturing up to 86% of the gains of a Transformer cross-attention model with a lexicalized scoring function that only requires 10-6% of the Transformer's FLOPs per document and can be served using commodity CPUs. When combined with a BM25 retriever, this approach matches the quality of a state-of-the art dual encoder retriever, that still requires an accelerator for query encoding. We introduce NAIL (Non-Autoregressive Indexing with Language models) as a model architecture that is compatible with recent encoder-decoder and decoder-only large language models, such as T5, GPT-3 and PaLM. This model architecture can leverage existing pre-trained checkpoints and can be fine-tuned for efficiently constructing document representations that do not require neural processing of queries. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.14499v2-abstract-full').style.display = 'none'; document.getElementById('2305.14499v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear at EMNLP 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.14332">arXiv:2305.14332</a> <span> [<a href="https://arxiv.org/pdf/2305.14332">pdf</a>, <a href="https://arxiv.org/format/2305.14332">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Evaluating and Modeling Attribution for Cross-Lingual Question Answering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Muller%2C+B">Benjamin Muller</a>, <a href="/search/cs?searchtype=author&query=Wieting%2C+J">John Wieting</a>, <a href="/search/cs?searchtype=author&query=Clark%2C+J+H">Jonathan H. Clark</a>, <a href="/search/cs?searchtype=author&query=Kwiatkowski%2C+T">Tom Kwiatkowski</a>, <a href="/search/cs?searchtype=author&query=Ruder%2C+S">Sebastian Ruder</a>, <a href="/search/cs?searchtype=author&query=Soares%2C+L+B">Livio Baldini Soares</a>, <a href="/search/cs?searchtype=author&query=Aharoni%2C+R">Roee Aharoni</a>, <a href="/search/cs?searchtype=author&query=Herzig%2C+J">Jonathan Herzig</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xinyi Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.14332v2-abstract-short" style="display: inline;"> Trustworthy answer content is abundant in many high-resource languages and is instantly accessible through question answering systems, yet this content can be hard to access for those that do not speak these languages. The leap forward in cross-lingual modeling quality offered by generative language models offers much promise, yet their raw generations often fall short in factuality. To improve tr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.14332v2-abstract-full').style.display = 'inline'; document.getElementById('2305.14332v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.14332v2-abstract-full" style="display: none;"> Trustworthy answer content is abundant in many high-resource languages and is instantly accessible through question answering systems, yet this content can be hard to access for those that do not speak these languages. The leap forward in cross-lingual modeling quality offered by generative language models offers much promise, yet their raw generations often fall short in factuality. To improve trustworthiness in these systems, a promising direction is to attribute the answer to a retrieved source, possibly in a content-rich language different from the query. Our work is the first to study attribution for cross-lingual question answering. First, we collect data in 5 languages to assess the attribution level of a state-of-the-art cross-lingual QA system. To our surprise, we find that a substantial portion of the answers is not attributable to any retrieved passages (up to 50% of answers exactly matching a gold reference) despite the system being able to attend directly to the retrieved text. Second, to address this poor attribution level, we experiment with a wide range of attribution detection techniques. We find that Natural Language Inference models and PaLM 2 fine-tuned on a very small amount of attribution data can accurately detect attribution. Based on these models, we improve the attribution level of a cross-lingual question-answering system. Overall, we show that current academic generative cross-lingual QA systems have substantial shortcomings in attribution and we build tooling to mitigate these issues. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.14332v2-abstract-full').style.display = 'none'; document.getElementById('2305.14332v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published as a long paper at EMNLP 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2212.08037">arXiv:2212.08037</a> <span> [<a href="https://arxiv.org/pdf/2212.08037">pdf</a>, <a href="https://arxiv.org/format/2212.08037">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Attributed Question Answering: Evaluation and Modeling for Attributed Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bohnet%2C+B">Bernd Bohnet</a>, <a href="/search/cs?searchtype=author&query=Tran%2C+V+Q">Vinh Q. Tran</a>, <a href="/search/cs?searchtype=author&query=Verga%2C+P">Pat Verga</a>, <a href="/search/cs?searchtype=author&query=Aharoni%2C+R">Roee Aharoni</a>, <a href="/search/cs?searchtype=author&query=Andor%2C+D">Daniel Andor</a>, <a href="/search/cs?searchtype=author&query=Soares%2C+L+B">Livio Baldini Soares</a>, <a href="/search/cs?searchtype=author&query=Ciaramita%2C+M">Massimiliano Ciaramita</a>, <a href="/search/cs?searchtype=author&query=Eisenstein%2C+J">Jacob Eisenstein</a>, <a href="/search/cs?searchtype=author&query=Ganchev%2C+K">Kuzman Ganchev</a>, <a href="/search/cs?searchtype=author&query=Herzig%2C+J">Jonathan Herzig</a>, <a href="/search/cs?searchtype=author&query=Hui%2C+K">Kai Hui</a>, <a href="/search/cs?searchtype=author&query=Kwiatkowski%2C+T">Tom Kwiatkowski</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+J">Ji Ma</a>, <a href="/search/cs?searchtype=author&query=Ni%2C+J">Jianmo Ni</a>, <a href="/search/cs?searchtype=author&query=Saralegui%2C+L+S">Lierni Sestorain Saralegui</a>, <a href="/search/cs?searchtype=author&query=Schuster%2C+T">Tal Schuster</a>, <a href="/search/cs?searchtype=author&query=Cohen%2C+W+W">William W. Cohen</a>, <a href="/search/cs?searchtype=author&query=Collins%2C+M">Michael Collins</a>, <a href="/search/cs?searchtype=author&query=Das%2C+D">Dipanjan Das</a>, <a href="/search/cs?searchtype=author&query=Metzler%2C+D">Donald Metzler</a>, <a href="/search/cs?searchtype=author&query=Petrov%2C+S">Slav Petrov</a>, <a href="/search/cs?searchtype=author&query=Webster%2C+K">Kellie Webster</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2212.08037v2-abstract-short" style="display: inline;"> Large language models (LLMs) have shown impressive results while requiring little or no direct supervision. Further, there is mounting evidence that LLMs may have potential in information-seeking scenarios. We believe the ability of an LLM to attribute the text that it generates is likely to be crucial in this setting. We formulate and study Attributed QA as a key first step in the development of… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.08037v2-abstract-full').style.display = 'inline'; document.getElementById('2212.08037v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2212.08037v2-abstract-full" style="display: none;"> Large language models (LLMs) have shown impressive results while requiring little or no direct supervision. Further, there is mounting evidence that LLMs may have potential in information-seeking scenarios. We believe the ability of an LLM to attribute the text that it generates is likely to be crucial in this setting. We formulate and study Attributed QA as a key first step in the development of attributed LLMs. We propose a reproducible evaluation framework for the task and benchmark a broad set of architectures. We take human annotations as a gold standard and show that a correlated automatic metric is suitable for development. Our experimental work gives concrete answers to two key questions (How to measure attribution?, and How well do current state-of-the-art methods perform on attribution?), and give some hints as to how to address a third (How to build LLMs with attribution?). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.08037v2-abstract-full').style.display = 'none'; document.getElementById('2212.08037v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2203.17189">arXiv:2203.17189</a> <span> [<a href="https://arxiv.org/pdf/2203.17189">pdf</a>, <a href="https://arxiv.org/format/2203.17189">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Scaling Up Models and Data with $\texttt{t5x}$ and $\texttt{seqio}$ </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Roberts%2C+A">Adam Roberts</a>, <a href="/search/cs?searchtype=author&query=Chung%2C+H+W">Hyung Won Chung</a>, <a href="/search/cs?searchtype=author&query=Levskaya%2C+A">Anselm Levskaya</a>, <a href="/search/cs?searchtype=author&query=Mishra%2C+G">Gaurav Mishra</a>, <a href="/search/cs?searchtype=author&query=Bradbury%2C+J">James Bradbury</a>, <a href="/search/cs?searchtype=author&query=Andor%2C+D">Daniel Andor</a>, <a href="/search/cs?searchtype=author&query=Narang%2C+S">Sharan Narang</a>, <a href="/search/cs?searchtype=author&query=Lester%2C+B">Brian Lester</a>, <a href="/search/cs?searchtype=author&query=Gaffney%2C+C">Colin Gaffney</a>, <a href="/search/cs?searchtype=author&query=Mohiuddin%2C+A">Afroz Mohiuddin</a>, <a href="/search/cs?searchtype=author&query=Hawthorne%2C+C">Curtis Hawthorne</a>, <a href="/search/cs?searchtype=author&query=Lewkowycz%2C+A">Aitor Lewkowycz</a>, <a href="/search/cs?searchtype=author&query=Salcianu%2C+A">Alex Salcianu</a>, <a href="/search/cs?searchtype=author&query=van+Zee%2C+M">Marc van Zee</a>, <a href="/search/cs?searchtype=author&query=Austin%2C+J">Jacob Austin</a>, <a href="/search/cs?searchtype=author&query=Goodman%2C+S">Sebastian Goodman</a>, <a href="/search/cs?searchtype=author&query=Soares%2C+L+B">Livio Baldini Soares</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+H">Haitang Hu</a>, <a href="/search/cs?searchtype=author&query=Tsvyashchenko%2C+S">Sasha Tsvyashchenko</a>, <a href="/search/cs?searchtype=author&query=Chowdhery%2C+A">Aakanksha Chowdhery</a>, <a href="/search/cs?searchtype=author&query=Bastings%2C+J">Jasmijn Bastings</a>, <a href="/search/cs?searchtype=author&query=Bulian%2C+J">Jannis Bulian</a>, <a href="/search/cs?searchtype=author&query=Garcia%2C+X">Xavier Garcia</a>, <a href="/search/cs?searchtype=author&query=Ni%2C+J">Jianmo Ni</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+A">Andrew Chen</a> , et al. (18 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2203.17189v1-abstract-short" style="display: inline;"> Recent neural network-based language models have benefited greatly from scaling up the size of training datasets and the number of parameters in the models themselves. Scaling can be complicated due to various factors including the need to distribute computation on supercomputer clusters (e.g., TPUs), prevent bottlenecks when infeeding data, and ensure reproducible results. In this work, we presen… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.17189v1-abstract-full').style.display = 'inline'; document.getElementById('2203.17189v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2203.17189v1-abstract-full" style="display: none;"> Recent neural network-based language models have benefited greatly from scaling up the size of training datasets and the number of parameters in the models themselves. Scaling can be complicated due to various factors including the need to distribute computation on supercomputer clusters (e.g., TPUs), prevent bottlenecks when infeeding data, and ensure reproducible results. In this work, we present two software libraries that ease these issues: $\texttt{t5x}$ simplifies the process of building and training large language models at scale while maintaining ease of use, and $\texttt{seqio}$ provides a task-based API for simple creation of fast and reproducible training data and evaluation pipelines. These open-source libraries have been used to train models with hundreds of billions of parameters on datasets with multiple terabytes of training data. Along with the libraries, we release configurations and instructions for T5-like encoder-decoder models as well as GPT-like decoder-only architectures. $\texttt{t5x}$ and $\texttt{seqio}$ are open source and available at https://github.com/google-research/t5x and https://github.com/google/seqio, respectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.17189v1-abstract-full').style.display = 'none'; document.getElementById('2203.17189v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2012.00893">arXiv:2012.00893</a> <span> [<a href="https://arxiv.org/pdf/2012.00893">pdf</a>, <a href="https://arxiv.org/format/2012.00893">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Evaluating Explanations: How much do explanations from the teacher aid students? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Pruthi%2C+D">Danish Pruthi</a>, <a href="/search/cs?searchtype=author&query=Bansal%2C+R">Rachit Bansal</a>, <a href="/search/cs?searchtype=author&query=Dhingra%2C+B">Bhuwan Dhingra</a>, <a href="/search/cs?searchtype=author&query=Soares%2C+L+B">Livio Baldini Soares</a>, <a href="/search/cs?searchtype=author&query=Collins%2C+M">Michael Collins</a>, <a href="/search/cs?searchtype=author&query=Lipton%2C+Z+C">Zachary C. Lipton</a>, <a href="/search/cs?searchtype=author&query=Neubig%2C+G">Graham Neubig</a>, <a href="/search/cs?searchtype=author&query=Cohen%2C+W+W">William W. Cohen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2012.00893v2-abstract-short" style="display: inline;"> While many methods purport to explain predictions by highlighting salient features, what aims these explanations serve and how they ought to be evaluated often go unstated. In this work, we introduce a framework to quantify the value of explanations via the accuracy gains that they confer on a student model trained to simulate a teacher model. Crucially, the explanations are available to the stude… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2012.00893v2-abstract-full').style.display = 'inline'; document.getElementById('2012.00893v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2012.00893v2-abstract-full" style="display: none;"> While many methods purport to explain predictions by highlighting salient features, what aims these explanations serve and how they ought to be evaluated often go unstated. In this work, we introduce a framework to quantify the value of explanations via the accuracy gains that they confer on a student model trained to simulate a teacher model. Crucially, the explanations are available to the student during training, but are not available at test time. Compared to prior proposals, our approach is less easily gamed, enabling principled, automatic, model-agnostic evaluation of attributions. Using our framework, we compare numerous attribution methods for text classification and question answering, and observe quantitative differences that are consistent (to a moderate to high degree) across different student model architectures and learning strategies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2012.00893v2-abstract-full').style.display = 'none'; document.getElementById('2012.00893v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 December, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 December, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">TACL 2021 (pre-MIT Press publication version)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2009.06354">arXiv:2009.06354</a> <span> [<a href="https://arxiv.org/pdf/2009.06354">pdf</a>, <a href="https://arxiv.org/format/2009.06354">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> QED: A Framework and Dataset for Explanations in Question Answering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lamm%2C+M">Matthew Lamm</a>, <a href="/search/cs?searchtype=author&query=Palomaki%2C+J">Jennimaria Palomaki</a>, <a href="/search/cs?searchtype=author&query=Alberti%2C+C">Chris Alberti</a>, <a href="/search/cs?searchtype=author&query=Andor%2C+D">Daniel Andor</a>, <a href="/search/cs?searchtype=author&query=Choi%2C+E">Eunsol Choi</a>, <a href="/search/cs?searchtype=author&query=Soares%2C+L+B">Livio Baldini Soares</a>, <a href="/search/cs?searchtype=author&query=Collins%2C+M">Michael Collins</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2009.06354v1-abstract-short" style="display: inline;"> A question answering system that in addition to providing an answer provides an explanation of the reasoning that leads to that answer has potential advantages in terms of debuggability, extensibility and trust. To this end, we propose QED, a linguistically informed, extensible framework for explanations in question answering. A QED explanation specifies the relationship between a question and ans… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2009.06354v1-abstract-full').style.display = 'inline'; document.getElementById('2009.06354v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2009.06354v1-abstract-full" style="display: none;"> A question answering system that in addition to providing an answer provides an explanation of the reasoning that leads to that answer has potential advantages in terms of debuggability, extensibility and trust. To this end, we propose QED, a linguistically informed, extensible framework for explanations in question answering. A QED explanation specifies the relationship between a question and answer according to formal semantic notions such as referential equality, sentencehood, and entailment. We describe and publicly release an expert-annotated dataset of QED explanations built upon a subset of the Google Natural Questions dataset, and report baseline models on two tasks -- post-hoc explanation generation given an answer, and joint question answering and explanation generation. In the joint setting, a promising result suggests that training on a relatively small amount of QED data can improve question answering. In addition to describing the formal, language-theoretic motivations for the QED approach, we describe a large user study showing that the presence of QED explanations significantly improves the ability of untrained raters to spot errors made by a strong neural QA baseline. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2009.06354v1-abstract-full').style.display = 'none'; document.getElementById('2009.06354v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 September, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2020. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2007.00849">arXiv:2007.00849</a> <span> [<a href="https://arxiv.org/pdf/2007.00849">pdf</a>, <a href="https://arxiv.org/format/2007.00849">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Facts as Experts: Adaptable and Interpretable Neural Memory over Symbolic Knowledge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Verga%2C+P">Pat Verga</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+H">Haitian Sun</a>, <a href="/search/cs?searchtype=author&query=Soares%2C+L+B">Livio Baldini Soares</a>, <a href="/search/cs?searchtype=author&query=Cohen%2C+W+W">William W. Cohen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2007.00849v1-abstract-short" style="display: inline;"> Massive language models are the core of modern NLP modeling and have been shown to encode impressive amounts of commonsense and factual information. However, that knowledge exists only within the latent parameters of the model, inaccessible to inspection and interpretation, and even worse, factual information memorized from the training corpora is likely to become stale as the world changes. Knowl… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2007.00849v1-abstract-full').style.display = 'inline'; document.getElementById('2007.00849v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2007.00849v1-abstract-full" style="display: none;"> Massive language models are the core of modern NLP modeling and have been shown to encode impressive amounts of commonsense and factual information. However, that knowledge exists only within the latent parameters of the model, inaccessible to inspection and interpretation, and even worse, factual information memorized from the training corpora is likely to become stale as the world changes. Knowledge stored as parameters will also inevitably exhibit all of the biases inherent in the source materials. To address these problems, we develop a neural language model that includes an explicit interface between symbolically interpretable factual information and subsymbolic neural knowledge. We show that this model dramatically improves performance on two knowledge-intensive question-answering tasks. More interestingly, the model can be updated without re-training by manipulating its symbolic representations. In particular this model allows us to add new facts and overwrite existing ones in ways that are not possible for earlier models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2007.00849v1-abstract-full').style.display = 'none'; document.getElementById('2007.00849v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 July, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2020. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2005.14253">arXiv:2005.14253</a> <span> [<a href="https://arxiv.org/pdf/2005.14253">pdf</a>, <a href="https://arxiv.org/ps/2005.14253">ps</a>, <a href="https://arxiv.org/format/2005.14253">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Empirical Evaluation of Pretraining Strategies for Supervised Entity Linking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=F%C3%A9vry%2C+T">Thibault F茅vry</a>, <a href="/search/cs?searchtype=author&query=FitzGerald%2C+N">Nicholas FitzGerald</a>, <a href="/search/cs?searchtype=author&query=Soares%2C+L+B">Livio Baldini Soares</a>, <a href="/search/cs?searchtype=author&query=Kwiatkowski%2C+T">Tom Kwiatkowski</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2005.14253v1-abstract-short" style="display: inline;"> In this work, we present an entity linking model which combines a Transformer architecture with large scale pretraining from Wikipedia links. Our model achieves the state-of-the-art on two commonly used entity linking datasets: 96.7% on CoNLL and 94.9% on TAC-KBP. We present detailed analyses to understand what design choices are important for entity linking, including choices of negative entity c… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2005.14253v1-abstract-full').style.display = 'inline'; document.getElementById('2005.14253v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2005.14253v1-abstract-full" style="display: none;"> In this work, we present an entity linking model which combines a Transformer architecture with large scale pretraining from Wikipedia links. Our model achieves the state-of-the-art on two commonly used entity linking datasets: 96.7% on CoNLL and 94.9% on TAC-KBP. We present detailed analyses to understand what design choices are important for entity linking, including choices of negative entity candidates, Transformer architecture, and input perturbations. Lastly, we present promising results on more challenging settings such as end-to-end entity linking and entity linking without in-domain training data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2005.14253v1-abstract-full').style.display = 'none'; document.getElementById('2005.14253v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 May, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 8 figures, appearing at AKBC 2020</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2004.11997">arXiv:2004.11997</a> <span> [<a href="https://arxiv.org/pdf/2004.11997">pdf</a>, <a href="https://arxiv.org/format/2004.11997">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> New Protocols and Negative Results for Textual Entailment Data Collection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bowman%2C+S+R">Samuel R. Bowman</a>, <a href="/search/cs?searchtype=author&query=Palomaki%2C+J">Jennimaria Palomaki</a>, <a href="/search/cs?searchtype=author&query=Soares%2C+L+B">Livio Baldini Soares</a>, <a href="/search/cs?searchtype=author&query=Pitler%2C+E">Emily Pitler</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2004.11997v2-abstract-short" style="display: inline;"> Natural language inference (NLI) data has proven useful in benchmarking and, especially, as pretraining data for tasks requiring language understanding. However, the crowdsourcing protocol that was used to collect this data has known issues and was not explicitly optimized for either of these purposes, so it is likely far from ideal. We propose four alternative protocols, each aimed at improving e… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2004.11997v2-abstract-full').style.display = 'inline'; document.getElementById('2004.11997v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2004.11997v2-abstract-full" style="display: none;"> Natural language inference (NLI) data has proven useful in benchmarking and, especially, as pretraining data for tasks requiring language understanding. However, the crowdsourcing protocol that was used to collect this data has known issues and was not explicitly optimized for either of these purposes, so it is likely far from ideal. We propose four alternative protocols, each aimed at improving either the ease with which annotators can produce sound training examples or the quality and diversity of those examples. Using these alternatives and a fifth baseline protocol, we collect and compare five new 8.5k-example training sets. In evaluations focused on transfer learning applications, our results are solidly negative, with models trained on our baseline dataset yielding good transfer performance to downstream tasks, but none of our four new methods (nor the recent ANLI) showing any improvements over that baseline. In a small silver lining, we observe that all four new protocols, especially those where annotators edit pre-filled text boxes, reduce previously observed issues with annotation artifacts. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2004.11997v2-abstract-full').style.display = 'none'; document.getElementById('2004.11997v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 September, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 April, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear at EMNLP 2020</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2004.07202">arXiv:2004.07202</a> <span> [<a href="https://arxiv.org/pdf/2004.07202">pdf</a>, <a href="https://arxiv.org/format/2004.07202">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Entities as Experts: Sparse Memory Access with Entity Supervision </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=F%C3%A9vry%2C+T">Thibault F茅vry</a>, <a href="/search/cs?searchtype=author&query=Soares%2C+L+B">Livio Baldini Soares</a>, <a href="/search/cs?searchtype=author&query=FitzGerald%2C+N">Nicholas FitzGerald</a>, <a href="/search/cs?searchtype=author&query=Choi%2C+E">Eunsol Choi</a>, <a href="/search/cs?searchtype=author&query=Kwiatkowski%2C+T">Tom Kwiatkowski</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2004.07202v2-abstract-short" style="display: inline;"> We focus on the problem of capturing declarative knowledge about entities in the learned parameters of a language model. We introduce a new model - Entities as Experts (EAE) - that can access distinct memories of the entities mentioned in a piece of text. Unlike previous efforts to integrate entity knowledge into sequence models, EAE's entity representations are learned directly from text. We show… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2004.07202v2-abstract-full').style.display = 'inline'; document.getElementById('2004.07202v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2004.07202v2-abstract-full" style="display: none;"> We focus on the problem of capturing declarative knowledge about entities in the learned parameters of a language model. We introduce a new model - Entities as Experts (EAE) - that can access distinct memories of the entities mentioned in a piece of text. Unlike previous efforts to integrate entity knowledge into sequence models, EAE's entity representations are learned directly from text. We show that EAE's learned representations capture sufficient knowledge to answer TriviaQA questions such as "Which Dr. Who villain has been played by Roger Delgado, Anthony Ainley, Eric Roberts?", outperforming an encoder-generator Transformer model with 10x the parameters. According to the LAMA knowledge probes, EAE contains more factual knowledge than a similarly sized BERT, as well as previous approaches that integrate external sources of entity knowledge. Because EAE associates parameters with specific entities, it only needs to access a fraction of its parameters at inference time, and we show that the correct identification and representation of entities is essential to EAE's performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2004.07202v2-abstract-full').style.display = 'none'; document.getElementById('2004.07202v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 October, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 April, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2020. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2001.03765">arXiv:2001.03765</a> <span> [<a href="https://arxiv.org/pdf/2001.03765">pdf</a>, <a href="https://arxiv.org/format/2001.03765">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Learning Cross-Context Entity Representations from Text </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ling%2C+J">Jeffrey Ling</a>, <a href="/search/cs?searchtype=author&query=FitzGerald%2C+N">Nicholas FitzGerald</a>, <a href="/search/cs?searchtype=author&query=Shan%2C+Z">Zifei Shan</a>, <a href="/search/cs?searchtype=author&query=Soares%2C+L+B">Livio Baldini Soares</a>, <a href="/search/cs?searchtype=author&query=F%C3%A9vry%2C+T">Thibault F茅vry</a>, <a href="/search/cs?searchtype=author&query=Weiss%2C+D">David Weiss</a>, <a href="/search/cs?searchtype=author&query=Kwiatkowski%2C+T">Tom Kwiatkowski</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2001.03765v1-abstract-short" style="display: inline;"> Language modeling tasks, in which words, or word-pieces, are predicted on the basis of a local context, have been very effective for learning word embeddings and context dependent representations of phrases. Motivated by the observation that efforts to code world knowledge into machine readable knowledge bases or human readable encyclopedias tend to be entity-centric, we investigate the use of a f… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2001.03765v1-abstract-full').style.display = 'inline'; document.getElementById('2001.03765v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2001.03765v1-abstract-full" style="display: none;"> Language modeling tasks, in which words, or word-pieces, are predicted on the basis of a local context, have been very effective for learning word embeddings and context dependent representations of phrases. Motivated by the observation that efforts to code world knowledge into machine readable knowledge bases or human readable encyclopedias tend to be entity-centric, we investigate the use of a fill-in-the-blank task to learn context independent representations of entities from the text contexts in which those entities were mentioned. We show that large scale training of neural models allows us to learn high quality entity representations, and we demonstrate successful results on four domains: (1) existing entity-level typing benchmarks, including a 64% error reduction over previous work on TypeNet (Murty et al., 2018); (2) a novel few-shot category reconstruction task; (3) existing entity linking benchmarks, where we match the state-of-the-art on CoNLL-Aida without linking-specific features and obtain a score of 89.8% on TAC-KBP 2010 without using any alias table, external knowledge base or in domain training data and (4) answering trivia questions, which uniquely identify entities. Our global entity representations encode fine-grained type categories, such as Scottish footballers, and can answer trivia questions such as: Who was the last inmate of Spandau jail in Berlin? <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2001.03765v1-abstract-full').style.display = 'none'; document.getElementById('2001.03765v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 January, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2020. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1906.03158">arXiv:1906.03158</a> <span> [<a href="https://arxiv.org/pdf/1906.03158">pdf</a>, <a href="https://arxiv.org/format/1906.03158">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Matching the Blanks: Distributional Similarity for Relation Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Soares%2C+L+B">Livio Baldini Soares</a>, <a href="/search/cs?searchtype=author&query=FitzGerald%2C+N">Nicholas FitzGerald</a>, <a href="/search/cs?searchtype=author&query=Ling%2C+J">Jeffrey Ling</a>, <a href="/search/cs?searchtype=author&query=Kwiatkowski%2C+T">Tom Kwiatkowski</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1906.03158v1-abstract-short" style="display: inline;"> General purpose relation extractors, which can model arbitrary relations, are a core aspiration in information extraction. Efforts have been made to build general purpose extractors that represent relations with their surface forms, or which jointly embed surface forms with relations from an existing knowledge graph. However, both of these approaches are limited in their ability to generalize. In… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1906.03158v1-abstract-full').style.display = 'inline'; document.getElementById('1906.03158v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1906.03158v1-abstract-full" style="display: none;"> General purpose relation extractors, which can model arbitrary relations, are a core aspiration in information extraction. Efforts have been made to build general purpose extractors that represent relations with their surface forms, or which jointly embed surface forms with relations from an existing knowledge graph. However, both of these approaches are limited in their ability to generalize. In this paper, we build on extensions of Harris' distributional hypothesis to relations, as well as recent advances in learning text representations (specifically, BERT), to build task agnostic relation representations solely from entity-linked text. We show that these representations significantly outperform previous work on exemplar based relation extraction (FewRel) even without using any of that task's training data. We also show that models initialized with our task agnostic representations, and then tuned on supervised relation extraction datasets, significantly outperform the previous methods on SemEval 2010 Task 8, KBP37, and TACRED. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1906.03158v1-abstract-full').style.display = 'none'; document.getElementById('1906.03158v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 June, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear at ACL 2019</span> </p> </li> </ol> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>