CINXE.COM

Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;18 of 18 results for author: <span class="mathjax">Tran, V Q</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Tran%2C+V+Q">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Tran, V Q"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Tran%2C+V+Q&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Tran, V Q"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04530">arXiv:2411.04530</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.04530">pdf</a>, <a href="https://arxiv.org/format/2411.04530">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Tomato, Tomahto, Tomate: Measuring the Role of Shared Semantics among Subwords in Multilingual Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xinyu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+J">Jing Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Tran%2C+V+Q">Vinh Q. Tran</a>, <a href="/search/cs?searchtype=author&amp;query=Schuster%2C+T">Tal Schuster</a>, <a href="/search/cs?searchtype=author&amp;query=Metzler%2C+D">Donald Metzler</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+J">Jimmy Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04530v1-abstract-short" style="display: inline;"> Human understanding of language is robust to different word choices as far as they represent similar semantic concepts. To what extent does our human intuition transfer to language models, which represent all subwords as distinct embeddings? In this work, we take an initial step on measuring the role of shared semantics among subwords in the encoder-only multilingual language models (mLMs). To thi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04530v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04530v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04530v1-abstract-full" style="display: none;"> Human understanding of language is robust to different word choices as far as they represent similar semantic concepts. To what extent does our human intuition transfer to language models, which represent all subwords as distinct embeddings? In this work, we take an initial step on measuring the role of shared semantics among subwords in the encoder-only multilingual language models (mLMs). To this end, we form &#34;semantic tokens&#34; by merging the semantically similar subwords and their embeddings, and evaluate the updated mLMs on 5 heterogeneous multilingual downstream tasks. Results show that the general shared semantics could get the models a long way in making the predictions on mLMs with different tokenizers and model sizes. Inspections on the grouped subwords show that they exhibit a wide range of semantic similarities, including synonyms and translations across many languages and scripts. Lastly, we found the zero-shot results with semantic tokens are on par or even better than the original models on certain classification tasks, suggesting that the shared subword-level semantics may serve as the anchors for cross-lingual transferring. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04530v1-abstract-full').style.display = 'none'; document.getElementById('2411.04530v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 9 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.16737">arXiv:2408.16737</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.16737">pdf</a>, <a href="https://arxiv.org/format/2408.16737">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Smaller, Weaker, Yet Better: Training LLM Reasoners via Compute-Optimal Sampling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bansal%2C+H">Hritik Bansal</a>, <a href="/search/cs?searchtype=author&amp;query=Hosseini%2C+A">Arian Hosseini</a>, <a href="/search/cs?searchtype=author&amp;query=Agarwal%2C+R">Rishabh Agarwal</a>, <a href="/search/cs?searchtype=author&amp;query=Tran%2C+V+Q">Vinh Q. Tran</a>, <a href="/search/cs?searchtype=author&amp;query=Kazemi%2C+M">Mehran Kazemi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.16737v2-abstract-short" style="display: inline;"> Training on high-quality synthetic data from strong language models (LMs) is a common strategy to improve the reasoning performance of LMs. In this work, we revisit whether this strategy is compute-optimal under a fixed inference budget (e.g., FLOPs). To do so, we investigate the trade-offs between generating synthetic data using a stronger but more expensive (SE) model versus a weaker but cheaper&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.16737v2-abstract-full').style.display = 'inline'; document.getElementById('2408.16737v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.16737v2-abstract-full" style="display: none;"> Training on high-quality synthetic data from strong language models (LMs) is a common strategy to improve the reasoning performance of LMs. In this work, we revisit whether this strategy is compute-optimal under a fixed inference budget (e.g., FLOPs). To do so, we investigate the trade-offs between generating synthetic data using a stronger but more expensive (SE) model versus a weaker but cheaper (WC) model. We evaluate the generated data across three key metrics: coverage, diversity, and false positive rate, and show that the data from WC models may have higher coverage and diversity, but also exhibit higher false positive rates. We then finetune LMs on data from SE and WC models in different settings: knowledge distillation, self-improvement, and a novel weak-to-strong improvement setup where a weaker LM teaches reasoning to a stronger LM. Our findings reveal that models finetuned on WC-generated data consistently outperform those trained on SE-generated data across multiple benchmarks and multiple choices of WC and SE models. These results challenge the prevailing practice of relying on SE models for synthetic data generation, suggesting that WC may be the compute-optimal approach for training advanced LM reasoners. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.16737v2-abstract-full').style.display = 'none'; document.getElementById('2408.16737v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.01825">arXiv:2402.01825</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.01825">pdf</a>, <a href="https://arxiv.org/format/2402.01825">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Fractal Patterns May Illuminate the Success of Next-Token Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Alabdulmohsin%2C+I">Ibrahim Alabdulmohsin</a>, <a href="/search/cs?searchtype=author&amp;query=Tran%2C+V+Q">Vinh Q. Tran</a>, <a href="/search/cs?searchtype=author&amp;query=Dehghani%2C+M">Mostafa Dehghani</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.01825v2-abstract-short" style="display: inline;"> We study the fractal structure of language, aiming to provide a precise formalism for quantifying properties that may have been previously suspected but not formally shown. We establish that language is: (1) self-similar, exhibiting complexities at all levels of granularity, with no particular characteristic context length, and (2) long-range dependent (LRD), with a Hurst parameter of approximatel&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.01825v2-abstract-full').style.display = 'inline'; document.getElementById('2402.01825v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.01825v2-abstract-full" style="display: none;"> We study the fractal structure of language, aiming to provide a precise formalism for quantifying properties that may have been previously suspected but not formally shown. We establish that language is: (1) self-similar, exhibiting complexities at all levels of granularity, with no particular characteristic context length, and (2) long-range dependent (LRD), with a Hurst parameter of approximately H=0.7. Based on these findings, we argue that short-term patterns/dependencies in language, such as in paragraphs, mirror the patterns/dependencies over larger scopes, like entire documents. This may shed some light on how next-token prediction can capture the structure of text across multiple levels of granularity, from words and clauses to broader contexts and intents. In addition, we carry out an extensive analysis across different domains and architectures, showing that fractal parameters are robust. Finally, we demonstrate that the tiny variations in fractal parameters seen across LLMs improve upon perplexity-based bits-per-byte (BPB) in predicting their downstream performance. We hope these findings offer a fresh perspective on language and the mechanisms underlying the success of LLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.01825v2-abstract-full').style.display = 'none'; document.getElementById('2402.01825v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 10 tables, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.11841">arXiv:2305.11841</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.11841">pdf</a>, <a href="https://arxiv.org/format/2305.11841">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> How Does Generative Retrieval Scale to Millions of Passages? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Pradeep%2C+R">Ronak Pradeep</a>, <a href="/search/cs?searchtype=author&amp;query=Hui%2C+K">Kai Hui</a>, <a href="/search/cs?searchtype=author&amp;query=Gupta%2C+J">Jai Gupta</a>, <a href="/search/cs?searchtype=author&amp;query=Lelkes%2C+A+D">Adam D. Lelkes</a>, <a href="/search/cs?searchtype=author&amp;query=Zhuang%2C+H">Honglei Zhuang</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+J">Jimmy Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Metzler%2C+D">Donald Metzler</a>, <a href="/search/cs?searchtype=author&amp;query=Tran%2C+V+Q">Vinh Q. Tran</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.11841v1-abstract-short" style="display: inline;"> Popularized by the Differentiable Search Index, the emerging paradigm of generative retrieval re-frames the classic information retrieval problem into a sequence-to-sequence modeling task, forgoing external indices and encoding an entire document corpus within a single Transformer. Although many different approaches have been proposed to improve the effectiveness of generative retrieval, they have&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.11841v1-abstract-full').style.display = 'inline'; document.getElementById('2305.11841v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.11841v1-abstract-full" style="display: none;"> Popularized by the Differentiable Search Index, the emerging paradigm of generative retrieval re-frames the classic information retrieval problem into a sequence-to-sequence modeling task, forgoing external indices and encoding an entire document corpus within a single Transformer. Although many different approaches have been proposed to improve the effectiveness of generative retrieval, they have only been evaluated on document corpora on the order of 100k in size. We conduct the first empirical study of generative retrieval techniques across various corpus scales, ultimately scaling up to the entire MS MARCO passage ranking task with a corpus of 8.8M passages and evaluating model sizes up to 11B parameters. We uncover several findings about scaling generative retrieval to millions of passages; notably, the central importance of using synthetic queries as document representations during indexing, the ineffectiveness of existing proposed architecture modifications when accounting for compute cost, and the limits of naively scaling model parameters with respect to retrieval performance. While we find that generative retrieval is competitive with state-of-the-art dual encoders on small corpora, scaling to millions of passages remains an important and unsolved challenge. We believe these findings will be valuable for the community to clarify the current state of generative retrieval, highlight the unique challenges, and inspire new research directions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.11841v1-abstract-full').style.display = 'none'; document.getElementById('2305.11841v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.05065">arXiv:2305.05065</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.05065">pdf</a>, <a href="https://arxiv.org/format/2305.05065">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Recommender Systems with Generative Retrieval </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Rajput%2C+S">Shashank Rajput</a>, <a href="/search/cs?searchtype=author&amp;query=Mehta%2C+N">Nikhil Mehta</a>, <a href="/search/cs?searchtype=author&amp;query=Singh%2C+A">Anima Singh</a>, <a href="/search/cs?searchtype=author&amp;query=Keshavan%2C+R+H">Raghunandan H. Keshavan</a>, <a href="/search/cs?searchtype=author&amp;query=Vu%2C+T">Trung Vu</a>, <a href="/search/cs?searchtype=author&amp;query=Heldt%2C+L">Lukasz Heldt</a>, <a href="/search/cs?searchtype=author&amp;query=Hong%2C+L">Lichan Hong</a>, <a href="/search/cs?searchtype=author&amp;query=Tay%2C+Y">Yi Tay</a>, <a href="/search/cs?searchtype=author&amp;query=Tran%2C+V+Q">Vinh Q. Tran</a>, <a href="/search/cs?searchtype=author&amp;query=Samost%2C+J">Jonah Samost</a>, <a href="/search/cs?searchtype=author&amp;query=Kula%2C+M">Maciej Kula</a>, <a href="/search/cs?searchtype=author&amp;query=Chi%2C+E+H">Ed H. Chi</a>, <a href="/search/cs?searchtype=author&amp;query=Sathiamoorthy%2C+M">Maheswaran Sathiamoorthy</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.05065v3-abstract-short" style="display: inline;"> Modern recommender systems perform large-scale retrieval by first embedding queries and item candidates in the same unified space, followed by approximate nearest neighbor search to select top candidates given a query embedding. In this paper, we propose a novel generative retrieval approach, where the retrieval model autoregressively decodes the identifiers of the target candidates. To that end,&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.05065v3-abstract-full').style.display = 'inline'; document.getElementById('2305.05065v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.05065v3-abstract-full" style="display: none;"> Modern recommender systems perform large-scale retrieval by first embedding queries and item candidates in the same unified space, followed by approximate nearest neighbor search to select top candidates given a query embedding. In this paper, we propose a novel generative retrieval approach, where the retrieval model autoregressively decodes the identifiers of the target candidates. To that end, we create semantically meaningful tuple of codewords to serve as a Semantic ID for each item. Given Semantic IDs for items in a user session, a Transformer-based sequence-to-sequence model is trained to predict the Semantic ID of the next item that the user will interact with. To the best of our knowledge, this is the first Semantic ID-based generative model for recommendation tasks. We show that recommender systems trained with the proposed paradigm significantly outperform the current SOTA models on various datasets. In addition, we show that incorporating Semantic IDs into the sequence-to-sequence model enhances its ability to generalize, as evidenced by the improved retrieval performance observed for items with no prior interaction history. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.05065v3-abstract-full').style.display = 'none'; document.getElementById('2305.05065v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear in The 37th Conference on Neural Information Processing Systems (NeurIPS 2023)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2212.13898">arXiv:2212.13898</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2212.13898">pdf</a>, <a href="https://arxiv.org/format/2212.13898">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Dense Feature Memory Augmented Transformers for COVID-19 Vaccination Search Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gupta%2C+J">Jai Gupta</a>, <a href="/search/cs?searchtype=author&amp;query=Tay%2C+Y">Yi Tay</a>, <a href="/search/cs?searchtype=author&amp;query=Kamath%2C+C">Chaitanya Kamath</a>, <a href="/search/cs?searchtype=author&amp;query=Tran%2C+V+Q">Vinh Q. Tran</a>, <a href="/search/cs?searchtype=author&amp;query=Metzler%2C+D">Donald Metzler</a>, <a href="/search/cs?searchtype=author&amp;query=Bavadekar%2C+S">Shailesh Bavadekar</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Mimi Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Gabrilovich%2C+E">Evgeniy Gabrilovich</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2212.13898v1-abstract-short" style="display: inline;"> With the devastating outbreak of COVID-19, vaccines are one of the crucial lines of defense against mass infection in this global pandemic. Given the protection they provide, vaccines are becoming mandatory in certain social and professional settings. This paper presents a classification model for detecting COVID-19 vaccination related search queries, a machine learning model that is used to gener&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.13898v1-abstract-full').style.display = 'inline'; document.getElementById('2212.13898v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2212.13898v1-abstract-full" style="display: none;"> With the devastating outbreak of COVID-19, vaccines are one of the crucial lines of defense against mass infection in this global pandemic. Given the protection they provide, vaccines are becoming mandatory in certain social and professional settings. This paper presents a classification model for detecting COVID-19 vaccination related search queries, a machine learning model that is used to generate search insights for COVID-19 vaccinations. The proposed method combines and leverages advancements from modern state-of-the-art (SOTA) natural language understanding (NLU) techniques such as pretrained Transformers with traditional dense features. We propose a novel approach of considering dense features as memory tokens that the model can attend to. We show that this new modeling approach enables a significant improvement to the Vaccine Search Insights (VSI) task, improving a strong well-established gradient-boosting baseline by relative +15% improvement in F1 score and +14% in precision. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.13898v1-abstract-full').style.display = 'none'; document.getElementById('2212.13898v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">EMNLP 2022</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2212.09744">arXiv:2212.09744</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2212.09744">pdf</a>, <a href="https://arxiv.org/format/2212.09744">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> DSI++: Updating Transformer Memory with New Documents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Mehta%2C+S+V">Sanket Vaibhav Mehta</a>, <a href="/search/cs?searchtype=author&amp;query=Gupta%2C+J">Jai Gupta</a>, <a href="/search/cs?searchtype=author&amp;query=Tay%2C+Y">Yi Tay</a>, <a href="/search/cs?searchtype=author&amp;query=Dehghani%2C+M">Mostafa Dehghani</a>, <a href="/search/cs?searchtype=author&amp;query=Tran%2C+V+Q">Vinh Q. Tran</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+J">Jinfeng Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Najork%2C+M">Marc Najork</a>, <a href="/search/cs?searchtype=author&amp;query=Strubell%2C+E">Emma Strubell</a>, <a href="/search/cs?searchtype=author&amp;query=Metzler%2C+D">Donald Metzler</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2212.09744v3-abstract-short" style="display: inline;"> Differentiable Search Indices (DSIs) encode a corpus of documents in model parameters and use the same model to answer user queries directly. Despite the strong performance of DSI models, deploying them in situations where the corpus changes over time is computationally expensive because reindexing the corpus requires re-training the model. In this work, we introduce DSI++, a continual learning ch&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.09744v3-abstract-full').style.display = 'inline'; document.getElementById('2212.09744v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2212.09744v3-abstract-full" style="display: none;"> Differentiable Search Indices (DSIs) encode a corpus of documents in model parameters and use the same model to answer user queries directly. Despite the strong performance of DSI models, deploying them in situations where the corpus changes over time is computationally expensive because reindexing the corpus requires re-training the model. In this work, we introduce DSI++, a continual learning challenge for DSI to incrementally index new documents while being able to answer queries related to both previously and newly indexed documents. Across different model scales and document identifier representations, we show that continual indexing of new documents leads to considerable forgetting of previously indexed documents. We also hypothesize and verify that the model experiences forgetting events during training, leading to unstable learning. To mitigate these issues, we investigate two approaches. The first focuses on modifying the training dynamics. Flatter minima implicitly alleviate forgetting, so we optimize for flatter loss basins and show that the model stably memorizes more documents ($+12\%$). Next, we introduce a generative memory to sample pseudo-queries for documents and supplement them during continual indexing to prevent forgetting for the retrieval task. Extensive experiments on novel continual indexing benchmarks based on Natural Questions (NQ) and MS MARCO demonstrate that our proposed solution mitigates forgetting significantly. Concretely, it improves the average Hits@10 by $+21.1\%$ over competitive baselines for NQ and requires $6$ times fewer model updates compared to re-training the DSI model for incrementally indexing five corpora in a sequence. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.09744v3-abstract-full').style.display = 'none'; document.getElementById('2212.09744v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at EMNLP 2023 main conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2212.08037">arXiv:2212.08037</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2212.08037">pdf</a>, <a href="https://arxiv.org/format/2212.08037">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Attributed Question Answering: Evaluation and Modeling for Attributed Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bohnet%2C+B">Bernd Bohnet</a>, <a href="/search/cs?searchtype=author&amp;query=Tran%2C+V+Q">Vinh Q. Tran</a>, <a href="/search/cs?searchtype=author&amp;query=Verga%2C+P">Pat Verga</a>, <a href="/search/cs?searchtype=author&amp;query=Aharoni%2C+R">Roee Aharoni</a>, <a href="/search/cs?searchtype=author&amp;query=Andor%2C+D">Daniel Andor</a>, <a href="/search/cs?searchtype=author&amp;query=Soares%2C+L+B">Livio Baldini Soares</a>, <a href="/search/cs?searchtype=author&amp;query=Ciaramita%2C+M">Massimiliano Ciaramita</a>, <a href="/search/cs?searchtype=author&amp;query=Eisenstein%2C+J">Jacob Eisenstein</a>, <a href="/search/cs?searchtype=author&amp;query=Ganchev%2C+K">Kuzman Ganchev</a>, <a href="/search/cs?searchtype=author&amp;query=Herzig%2C+J">Jonathan Herzig</a>, <a href="/search/cs?searchtype=author&amp;query=Hui%2C+K">Kai Hui</a>, <a href="/search/cs?searchtype=author&amp;query=Kwiatkowski%2C+T">Tom Kwiatkowski</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+J">Ji Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Ni%2C+J">Jianmo Ni</a>, <a href="/search/cs?searchtype=author&amp;query=Saralegui%2C+L+S">Lierni Sestorain Saralegui</a>, <a href="/search/cs?searchtype=author&amp;query=Schuster%2C+T">Tal Schuster</a>, <a href="/search/cs?searchtype=author&amp;query=Cohen%2C+W+W">William W. Cohen</a>, <a href="/search/cs?searchtype=author&amp;query=Collins%2C+M">Michael Collins</a>, <a href="/search/cs?searchtype=author&amp;query=Das%2C+D">Dipanjan Das</a>, <a href="/search/cs?searchtype=author&amp;query=Metzler%2C+D">Donald Metzler</a>, <a href="/search/cs?searchtype=author&amp;query=Petrov%2C+S">Slav Petrov</a>, <a href="/search/cs?searchtype=author&amp;query=Webster%2C+K">Kellie Webster</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2212.08037v2-abstract-short" style="display: inline;"> Large language models (LLMs) have shown impressive results while requiring little or no direct supervision. Further, there is mounting evidence that LLMs may have potential in information-seeking scenarios. We believe the ability of an LLM to attribute the text that it generates is likely to be crucial in this setting. We formulate and study Attributed QA as a key first step in the development of&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.08037v2-abstract-full').style.display = 'inline'; document.getElementById('2212.08037v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2212.08037v2-abstract-full" style="display: none;"> Large language models (LLMs) have shown impressive results while requiring little or no direct supervision. Further, there is mounting evidence that LLMs may have potential in information-seeking scenarios. We believe the ability of an LLM to attribute the text that it generates is likely to be crucial in this setting. We formulate and study Attributed QA as a key first step in the development of attributed LLMs. We propose a reproducible evaluation framework for the task and benchmark a broad set of architectures. We take human annotations as a gold standard and show that a correlated automatic metric is suitable for development. Our experimental work gives concrete answers to two key questions (How to measure attribution?, and How well do current state-of-the-art methods perform on attribution?), and give some hints as to how to address a third (How to build LLMs with attribution?). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.08037v2-abstract-full').style.display = 'none'; document.getElementById('2212.08037v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2210.11399">arXiv:2210.11399</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2210.11399">pdf</a>, <a href="https://arxiv.org/format/2210.11399">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Transcending Scaling Laws with 0.1% Extra Compute </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tay%2C+Y">Yi Tay</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+J">Jason Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Chung%2C+H+W">Hyung Won Chung</a>, <a href="/search/cs?searchtype=author&amp;query=Tran%2C+V+Q">Vinh Q. Tran</a>, <a href="/search/cs?searchtype=author&amp;query=So%2C+D+R">David R. So</a>, <a href="/search/cs?searchtype=author&amp;query=Shakeri%2C+S">Siamak Shakeri</a>, <a href="/search/cs?searchtype=author&amp;query=Garcia%2C+X">Xavier Garcia</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+H+S">Huaixiu Steven Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+J">Jinfeng Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Chowdhery%2C+A">Aakanksha Chowdhery</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+D">Denny Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Metzler%2C+D">Donald Metzler</a>, <a href="/search/cs?searchtype=author&amp;query=Petrov%2C+S">Slav Petrov</a>, <a href="/search/cs?searchtype=author&amp;query=Houlsby%2C+N">Neil Houlsby</a>, <a href="/search/cs?searchtype=author&amp;query=Le%2C+Q+V">Quoc V. Le</a>, <a href="/search/cs?searchtype=author&amp;query=Dehghani%2C+M">Mostafa Dehghani</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2210.11399v2-abstract-short" style="display: inline;"> Scaling language models improves performance but comes with significant computational costs. This paper proposes UL2R, a method that substantially improves existing language models and their scaling curves with a relatively tiny amount of extra compute. The key idea is to continue training a state-of-the-art large language model (e.g., PaLM) on a few more steps with UL2&#39;s mixture-of-denoiser objec&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.11399v2-abstract-full').style.display = 'inline'; document.getElementById('2210.11399v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2210.11399v2-abstract-full" style="display: none;"> Scaling language models improves performance but comes with significant computational costs. This paper proposes UL2R, a method that substantially improves existing language models and their scaling curves with a relatively tiny amount of extra compute. The key idea is to continue training a state-of-the-art large language model (e.g., PaLM) on a few more steps with UL2&#39;s mixture-of-denoiser objective. We show that, with almost negligible extra computational costs and no new sources of data, we are able to substantially improve the scaling properties of large language models on downstream metrics. In this paper, we continue training PaLM with UL2R, introducing a new set of models at 8B, 62B, and 540B scale which we call U-PaLM. Impressively, at 540B scale, we show an approximately 2x computational savings rate where U-PaLM achieves the same performance as the final PaLM 540B model at around half its computational budget (i.e., saving $\sim$4.4 million TPUv4 hours). We further show that this improved scaling curve leads to &#39;emergent abilities&#39; on challenging BIG-Bench tasks -- for instance, U-PaLM does much better than PaLM on some tasks or demonstrates better quality at much smaller scale (62B as opposed to 540B). Overall, we show that U-PaLM outperforms PaLM on many few-shot setups, i.e., English NLP tasks (e.g., commonsense reasoning, question answering), reasoning tasks with chain-of-thought (e.g., GSM8K), multilingual tasks (MGSM, TydiQA), MMLU and challenging BIG-Bench tasks. Finally, we provide qualitative examples showing the new capabilities of U-PaLM for single and multi-span infilling. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.11399v2-abstract-full').style.display = 'none'; document.getElementById('2210.11399v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">V2 has updated references/related work</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2207.10551">arXiv:2207.10551</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2207.10551">pdf</a>, <a href="https://arxiv.org/format/2207.10551">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Scaling Laws vs Model Architectures: How does Inductive Bias Influence Scaling? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tay%2C+Y">Yi Tay</a>, <a href="/search/cs?searchtype=author&amp;query=Dehghani%2C+M">Mostafa Dehghani</a>, <a href="/search/cs?searchtype=author&amp;query=Abnar%2C+S">Samira Abnar</a>, <a href="/search/cs?searchtype=author&amp;query=Chung%2C+H+W">Hyung Won Chung</a>, <a href="/search/cs?searchtype=author&amp;query=Fedus%2C+W">William Fedus</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+J">Jinfeng Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Narang%2C+S">Sharan Narang</a>, <a href="/search/cs?searchtype=author&amp;query=Tran%2C+V+Q">Vinh Q. Tran</a>, <a href="/search/cs?searchtype=author&amp;query=Yogatama%2C+D">Dani Yogatama</a>, <a href="/search/cs?searchtype=author&amp;query=Metzler%2C+D">Donald Metzler</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2207.10551v1-abstract-short" style="display: inline;"> There have been a lot of interest in the scaling properties of Transformer models. However, not much has been done on the front of investigating the effect of scaling properties of different inductive biases and model architectures. Do model architectures scale differently? If so, how does inductive bias affect scaling behaviour? How does this influence upstream (pretraining) and downstream (trans&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.10551v1-abstract-full').style.display = 'inline'; document.getElementById('2207.10551v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2207.10551v1-abstract-full" style="display: none;"> There have been a lot of interest in the scaling properties of Transformer models. However, not much has been done on the front of investigating the effect of scaling properties of different inductive biases and model architectures. Do model architectures scale differently? If so, how does inductive bias affect scaling behaviour? How does this influence upstream (pretraining) and downstream (transfer)? This paper conducts a systematic study of scaling behaviour of ten diverse model architectures such as Transformers, Switch Transformers, Universal Transformers, Dynamic convolutions, Performers, and recently proposed MLP-Mixers. Via extensive experiments, we show that (1) architecture is an indeed an important consideration when performing scaling and (2) the best performing model can fluctuate at different scales. We believe that the findings outlined in this work has significant implications to how model architectures are currently evaluated in the community. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.10551v1-abstract-full').style.display = 'none'; document.getElementById('2207.10551v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 July, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2207.07061">arXiv:2207.07061</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2207.07061">pdf</a>, <a href="https://arxiv.org/format/2207.07061">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Confident Adaptive Language Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Schuster%2C+T">Tal Schuster</a>, <a href="/search/cs?searchtype=author&amp;query=Fisch%2C+A">Adam Fisch</a>, <a href="/search/cs?searchtype=author&amp;query=Gupta%2C+J">Jai Gupta</a>, <a href="/search/cs?searchtype=author&amp;query=Dehghani%2C+M">Mostafa Dehghani</a>, <a href="/search/cs?searchtype=author&amp;query=Bahri%2C+D">Dara Bahri</a>, <a href="/search/cs?searchtype=author&amp;query=Tran%2C+V+Q">Vinh Q. Tran</a>, <a href="/search/cs?searchtype=author&amp;query=Tay%2C+Y">Yi Tay</a>, <a href="/search/cs?searchtype=author&amp;query=Metzler%2C+D">Donald Metzler</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2207.07061v2-abstract-short" style="display: inline;"> Recent advances in Transformer-based large language models (LLMs) have led to significant performance improvements across many tasks. These gains come with a drastic increase in the models&#39; size, potentially leading to slow and costly use at inference time. In practice, however, the series of generations made by LLMs is composed of varying levels of difficulty. While certain predictions truly bene&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.07061v2-abstract-full').style.display = 'inline'; document.getElementById('2207.07061v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2207.07061v2-abstract-full" style="display: none;"> Recent advances in Transformer-based large language models (LLMs) have led to significant performance improvements across many tasks. These gains come with a drastic increase in the models&#39; size, potentially leading to slow and costly use at inference time. In practice, however, the series of generations made by LLMs is composed of varying levels of difficulty. While certain predictions truly benefit from the models&#39; full capacity, other continuations are more trivial and can be solved with reduced compute. In this work, we introduce Confident Adaptive Language Modeling (CALM), a framework for dynamically allocating different amounts of compute per input and generation timestep. Early exit decoding involves several challenges that we address here, such as: (1) what confidence measure to use; (2) connecting sequence-level constraints to local per-token exit decisions; and (3) attending back to missing hidden representations due to early exits in previous tokens. Through theoretical analysis and empirical experiments on three diverse text generation tasks, we demonstrate the efficacy of our framework in reducing compute -- potential speedup of up to $\times 3$ -- while provably maintaining high performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.07061v2-abstract-full').style.display = 'none'; document.getElementById('2207.07061v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 July, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2022 (selected as Oral)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2205.05131">arXiv:2205.05131</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2205.05131">pdf</a>, <a href="https://arxiv.org/format/2205.05131">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> UL2: Unifying Language Learning Paradigms </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tay%2C+Y">Yi Tay</a>, <a href="/search/cs?searchtype=author&amp;query=Dehghani%2C+M">Mostafa Dehghani</a>, <a href="/search/cs?searchtype=author&amp;query=Tran%2C+V+Q">Vinh Q. Tran</a>, <a href="/search/cs?searchtype=author&amp;query=Garcia%2C+X">Xavier Garcia</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+J">Jason Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xuezhi Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Chung%2C+H+W">Hyung Won Chung</a>, <a href="/search/cs?searchtype=author&amp;query=Shakeri%2C+S">Siamak Shakeri</a>, <a href="/search/cs?searchtype=author&amp;query=Bahri%2C+D">Dara Bahri</a>, <a href="/search/cs?searchtype=author&amp;query=Schuster%2C+T">Tal Schuster</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+H+S">Huaixiu Steven Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+D">Denny Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Houlsby%2C+N">Neil Houlsby</a>, <a href="/search/cs?searchtype=author&amp;query=Metzler%2C+D">Donald Metzler</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2205.05131v3-abstract-short" style="display: inline;"> Existing pre-trained models are generally geared towards a particular class of problems. To date, there seems to be still no consensus on what the right architecture and pre-training setup should be. This paper presents a unified framework for pre-training models that are universally effective across datasets and setups. We begin by disentangling architectural archetypes with pre-training objectiv&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.05131v3-abstract-full').style.display = 'inline'; document.getElementById('2205.05131v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2205.05131v3-abstract-full" style="display: none;"> Existing pre-trained models are generally geared towards a particular class of problems. To date, there seems to be still no consensus on what the right architecture and pre-training setup should be. This paper presents a unified framework for pre-training models that are universally effective across datasets and setups. We begin by disentangling architectural archetypes with pre-training objectives -- two concepts that are commonly conflated. Next, we present a generalized &amp; unified perspective for self-supervision in NLP and show how different pre-training objectives can be cast as one another and how interpolating between different objectives can be effective. We then propose Mixture-of-Denoisers (MoD), a pre-training objective that combines diverse pre-training paradigms together. We furthermore introduce a notion of mode switching, wherein downstream fine-tuning is associated with specific pre-training schemes. We conduct extensive ablative experiments to compare multiple pre-training objectives and find that our method pushes the Pareto-frontier by outperforming T5 &amp; GPT-like models across multiple diverse setups. By scaling our model up to 20B parameters, we achieve SOTA performance on 50 well-established supervised finetuning based NLP tasks. Our model also achieve strong results at in-context learning, outperforming 175B GPT-3 on zero-shot SuperGLUE and tripling the performance of T5-XXL on one-shot summarization. On 0-shot MMLU, UL2 20B outperforms T0 and T5 models. UL2 20B also works well with chain-of-thought prompting and reasoning, making it an appealing choice for research into reasoning at a small to medium scale of 20B parameters. Finally, we apply FLAN instruction tuning to the UL2 20B model, achieving MMLU and Big-Bench scores competitive to FLAN-PaLM 62B. We release Flax-based T5X checkpoints for the UL2 20B &amp; Flan-UL2 20B. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.05131v3-abstract-full').style.display = 'none'; document.getElementById('2205.05131v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Updated Q1 2023 with Flan-UL2 20B release! :)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2202.11176">arXiv:2202.11176</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2202.11176">pdf</a>, <a href="https://arxiv.org/format/2202.11176">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> A New Generation of Perspective API: Efficient Multilingual Character-level Transformers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lees%2C+A">Alyssa Lees</a>, <a href="/search/cs?searchtype=author&amp;query=Tran%2C+V+Q">Vinh Q. Tran</a>, <a href="/search/cs?searchtype=author&amp;query=Tay%2C+Y">Yi Tay</a>, <a href="/search/cs?searchtype=author&amp;query=Sorensen%2C+J">Jeffrey Sorensen</a>, <a href="/search/cs?searchtype=author&amp;query=Gupta%2C+J">Jai Gupta</a>, <a href="/search/cs?searchtype=author&amp;query=Metzler%2C+D">Donald Metzler</a>, <a href="/search/cs?searchtype=author&amp;query=Vasserman%2C+L">Lucy Vasserman</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2202.11176v1-abstract-short" style="display: inline;"> On the world wide web, toxic content detectors are a crucial line of defense against potentially hateful and offensive messages. As such, building highly effective classifiers that enable a safer internet is an important research area. Moreover, the web is a highly multilingual, cross-cultural community that develops its own lingo over time. As such, it is crucial to develop models that are effect&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.11176v1-abstract-full').style.display = 'inline'; document.getElementById('2202.11176v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2202.11176v1-abstract-full" style="display: none;"> On the world wide web, toxic content detectors are a crucial line of defense against potentially hateful and offensive messages. As such, building highly effective classifiers that enable a safer internet is an important research area. Moreover, the web is a highly multilingual, cross-cultural community that develops its own lingo over time. As such, it is crucial to develop models that are effective across a diverse range of languages, usages, and styles. In this paper, we present the fundamentals behind the next version of the Perspective API from Google Jigsaw. At the heart of the approach is a single multilingual token-free Charformer model that is applicable across a range of languages, domains, and tasks. We demonstrate that by forgoing static vocabularies, we gain flexibility across a variety of settings. We additionally outline the techniques employed to make such a byte-level model efficient and feasible for productionization. Through extensive experiments on multilingual toxic comment classification benchmarks derived from real API traffic and evaluation on an array of code-switching, covert toxicity, emoji-based hate, human-readable obfuscation, distribution shift, and bias evaluation settings, we show that our proposed approach outperforms strong baselines. Finally, we present our findings from deploying this system in production. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.11176v1-abstract-full').style.display = 'none'; document.getElementById('2202.11176v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 February, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2202.06991">arXiv:2202.06991</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2202.06991">pdf</a>, <a href="https://arxiv.org/format/2202.06991">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Transformer Memory as a Differentiable Search Index </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tay%2C+Y">Yi Tay</a>, <a href="/search/cs?searchtype=author&amp;query=Tran%2C+V+Q">Vinh Q. Tran</a>, <a href="/search/cs?searchtype=author&amp;query=Dehghani%2C+M">Mostafa Dehghani</a>, <a href="/search/cs?searchtype=author&amp;query=Ni%2C+J">Jianmo Ni</a>, <a href="/search/cs?searchtype=author&amp;query=Bahri%2C+D">Dara Bahri</a>, <a href="/search/cs?searchtype=author&amp;query=Mehta%2C+H">Harsh Mehta</a>, <a href="/search/cs?searchtype=author&amp;query=Qin%2C+Z">Zhen Qin</a>, <a href="/search/cs?searchtype=author&amp;query=Hui%2C+K">Kai Hui</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Z">Zhe Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Gupta%2C+J">Jai Gupta</a>, <a href="/search/cs?searchtype=author&amp;query=Schuster%2C+T">Tal Schuster</a>, <a href="/search/cs?searchtype=author&amp;query=Cohen%2C+W+W">William W. Cohen</a>, <a href="/search/cs?searchtype=author&amp;query=Metzler%2C+D">Donald Metzler</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2202.06991v3-abstract-short" style="display: inline;"> In this paper, we demonstrate that information retrieval can be accomplished with a single Transformer, in which all information about the corpus is encoded in the parameters of the model. To this end, we introduce the Differentiable Search Index (DSI), a new paradigm that learns a text-to-text model that maps string queries directly to relevant docids; in other words, a DSI model answers queries&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.06991v3-abstract-full').style.display = 'inline'; document.getElementById('2202.06991v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2202.06991v3-abstract-full" style="display: none;"> In this paper, we demonstrate that information retrieval can be accomplished with a single Transformer, in which all information about the corpus is encoded in the parameters of the model. To this end, we introduce the Differentiable Search Index (DSI), a new paradigm that learns a text-to-text model that maps string queries directly to relevant docids; in other words, a DSI model answers queries directly using only its parameters, dramatically simplifying the whole retrieval process. We study variations in how documents and their identifiers are represented, variations in training procedures, and the interplay between models and corpus sizes. Experiments demonstrate that given appropriate design choices, DSI significantly outperforms strong baselines such as dual encoder models. Moreover, DSI demonstrates strong generalization capabilities, outperforming a BM25 baseline in a zero-shot setup. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.06991v3-abstract-full').style.display = 'none'; document.getElementById('2202.06991v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 February, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2111.10952">arXiv:2111.10952</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2111.10952">pdf</a>, <a href="https://arxiv.org/format/2111.10952">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> ExT5: Towards Extreme Multi-Task Scaling for Transfer Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Aribandi%2C+V">Vamsi Aribandi</a>, <a href="/search/cs?searchtype=author&amp;query=Tay%2C+Y">Yi Tay</a>, <a href="/search/cs?searchtype=author&amp;query=Schuster%2C+T">Tal Schuster</a>, <a href="/search/cs?searchtype=author&amp;query=Rao%2C+J">Jinfeng Rao</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+H+S">Huaixiu Steven Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Mehta%2C+S+V">Sanket Vaibhav Mehta</a>, <a href="/search/cs?searchtype=author&amp;query=Zhuang%2C+H">Honglei Zhuang</a>, <a href="/search/cs?searchtype=author&amp;query=Tran%2C+V+Q">Vinh Q. Tran</a>, <a href="/search/cs?searchtype=author&amp;query=Bahri%2C+D">Dara Bahri</a>, <a href="/search/cs?searchtype=author&amp;query=Ni%2C+J">Jianmo Ni</a>, <a href="/search/cs?searchtype=author&amp;query=Gupta%2C+J">Jai Gupta</a>, <a href="/search/cs?searchtype=author&amp;query=Hui%2C+K">Kai Hui</a>, <a href="/search/cs?searchtype=author&amp;query=Ruder%2C+S">Sebastian Ruder</a>, <a href="/search/cs?searchtype=author&amp;query=Metzler%2C+D">Donald Metzler</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2111.10952v2-abstract-short" style="display: inline;"> Despite the recent success of multi-task learning and transfer learning for natural language processing (NLP), few works have systematically studied the effect of scaling up the number of tasks during pre-training. Towards this goal, this paper introduces ExMix (Extreme Mixture): a massive collection of 107 supervised NLP tasks across diverse domains and task-families. Using ExMix, we study the ef&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.10952v2-abstract-full').style.display = 'inline'; document.getElementById('2111.10952v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2111.10952v2-abstract-full" style="display: none;"> Despite the recent success of multi-task learning and transfer learning for natural language processing (NLP), few works have systematically studied the effect of scaling up the number of tasks during pre-training. Towards this goal, this paper introduces ExMix (Extreme Mixture): a massive collection of 107 supervised NLP tasks across diverse domains and task-families. Using ExMix, we study the effect of multi-task pre-training at the largest scale to date, and analyze co-training transfer amongst common families of tasks. Through this analysis, we show that manually curating an ideal set of tasks for multi-task pre-training is not straightforward, and that multi-task scaling can vastly improve models on its own. Finally, we propose ExT5: a model pre-trained using a multi-task objective of self-supervised span denoising and supervised ExMix. Via extensive experiments, we show that ExT5 outperforms strong T5 baselines on SuperGLUE, GEM, Rainbow, Closed-Book QA tasks, and several tasks outside of ExMix. ExT5 also significantly improves sample efficiency while pre-training. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.10952v2-abstract-full').style.display = 'none'; document.getElementById('2111.10952v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 January, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 November, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICLR 2022; see https://youtu.be/FbRcbM4T-50 for a video overview of the paper</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2106.12672">arXiv:2106.12672</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2106.12672">pdf</a>, <a href="https://arxiv.org/format/2106.12672">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Charformer: Fast Character Transformers via Gradient-based Subword Tokenization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tay%2C+Y">Yi Tay</a>, <a href="/search/cs?searchtype=author&amp;query=Tran%2C+V+Q">Vinh Q. Tran</a>, <a href="/search/cs?searchtype=author&amp;query=Ruder%2C+S">Sebastian Ruder</a>, <a href="/search/cs?searchtype=author&amp;query=Gupta%2C+J">Jai Gupta</a>, <a href="/search/cs?searchtype=author&amp;query=Chung%2C+H+W">Hyung Won Chung</a>, <a href="/search/cs?searchtype=author&amp;query=Bahri%2C+D">Dara Bahri</a>, <a href="/search/cs?searchtype=author&amp;query=Qin%2C+Z">Zhen Qin</a>, <a href="/search/cs?searchtype=author&amp;query=Baumgartner%2C+S">Simon Baumgartner</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+C">Cong Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Metzler%2C+D">Donald Metzler</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2106.12672v3-abstract-short" style="display: inline;"> State-of-the-art models in natural language processing rely on separate rigid subword tokenization algorithms, which limit their generalization ability and adaptation to new settings. In this paper, we propose a new model inductive bias that learns a subword tokenization end-to-end as part of the model. To this end, we introduce a soft gradient-based subword tokenization module (GBST) that automat&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.12672v3-abstract-full').style.display = 'inline'; document.getElementById('2106.12672v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2106.12672v3-abstract-full" style="display: none;"> State-of-the-art models in natural language processing rely on separate rigid subword tokenization algorithms, which limit their generalization ability and adaptation to new settings. In this paper, we propose a new model inductive bias that learns a subword tokenization end-to-end as part of the model. To this end, we introduce a soft gradient-based subword tokenization module (GBST) that automatically learns latent subword representations from characters in a data-driven fashion. Concretely, GBST enumerates candidate subword blocks and learns to score them in a position-wise fashion using a block scoring network. We additionally introduce Charformer, a deep Transformer model that integrates GBST and operates on the byte level. Via extensive experiments on English GLUE, multilingual, and noisy text datasets, we show that Charformer outperforms a series of competitive byte-level baselines while generally performing on par and sometimes outperforming subword-based models. Additionally, Charformer is fast, improving the speed of both vanilla byte-level and subword-level Transformers by 28%-100% while maintaining competitive quality. We believe this work paves the way for highly performant token-free models that are trained completely end-to-end. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.12672v3-abstract-full').style.display = 'none'; document.getElementById('2106.12672v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 February, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICLR 2022 Camera Ready</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2106.02278">arXiv:2106.02278</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2106.02278">pdf</a>, <a href="https://arxiv.org/format/2106.02278">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> AgreeSum: Agreement-Oriented Multi-Document Summarization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Pang%2C+R+Y">Richard Yuanzhe Pang</a>, <a href="/search/cs?searchtype=author&amp;query=Lelkes%2C+A+D">Adam D. Lelkes</a>, <a href="/search/cs?searchtype=author&amp;query=Tran%2C+V+Q">Vinh Q. Tran</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+C">Cong Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2106.02278v1-abstract-short" style="display: inline;"> We aim to renew interest in a particular multi-document summarization (MDS) task which we call AgreeSum: agreement-oriented multi-document summarization. Given a cluster of articles, the goal is to provide abstractive summaries that represent information common and faithful to all input articles. Given the lack of existing datasets, we create a dataset for AgreeSum, and provide annotations on arti&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.02278v1-abstract-full').style.display = 'inline'; document.getElementById('2106.02278v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2106.02278v1-abstract-full" style="display: none;"> We aim to renew interest in a particular multi-document summarization (MDS) task which we call AgreeSum: agreement-oriented multi-document summarization. Given a cluster of articles, the goal is to provide abstractive summaries that represent information common and faithful to all input articles. Given the lack of existing datasets, we create a dataset for AgreeSum, and provide annotations on article-summary entailment relations for a subset of the clusters in the dataset. We aim to create strong baselines for the task by applying the top-performing pretrained single-document summarization model PEGASUS onto AgreeSum, leveraging both annotated clusters by supervised losses, and unannotated clusters by T5-based entailment-related and language-related losses. Compared to other baselines, both automatic evaluation and human evaluation show better article-summary and cluster-summary entailment in generated summaries. On a separate note, we hope that our article-summary entailment annotations contribute to the community&#39;s effort in improving abstractive summarization faithfulness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.02278v1-abstract-full').style.display = 'none'; document.getElementById('2106.02278v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Findings of ACL 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2102.09094">arXiv:2102.09094</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2102.09094">pdf</a>, <a href="https://arxiv.org/format/2102.09094">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Quiz-Style Question Generation for News Stories </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lelkes%2C+A+D">Adam D. Lelkes</a>, <a href="/search/cs?searchtype=author&amp;query=Tran%2C+V+Q">Vinh Q. Tran</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+C">Cong Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2102.09094v1-abstract-short" style="display: inline;"> A large majority of American adults get at least some of their news from the Internet. Even though many online news products have the goal of informing their users about the news, they lack scalable and reliable tools for measuring how well they are achieving this goal, and therefore have to resort to noisy proxy metrics (e.g., click-through rates or reading time) to track their performance. As&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.09094v1-abstract-full').style.display = 'inline'; document.getElementById('2102.09094v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2102.09094v1-abstract-full" style="display: none;"> A large majority of American adults get at least some of their news from the Internet. Even though many online news products have the goal of informing their users about the news, they lack scalable and reliable tools for measuring how well they are achieving this goal, and therefore have to resort to noisy proxy metrics (e.g., click-through rates or reading time) to track their performance. As a first step towards measuring news informedness at a scale, we study the problem of quiz-style multiple-choice question generation, which may be used to survey users about their knowledge of recent news. In particular, we formulate the problem as two sequence-to-sequence tasks: question-answer generation (QAG) and distractor, or incorrect answer, generation (DG). We introduce NewsQuizQA, the first dataset intended for quiz-style question-answer generation, containing 20K human written question-answer pairs from 5K news article summaries. Using this dataset, we propose a series of novel techniques for applying large pre-trained Transformer encoder-decoder models, namely PEGASUS and T5, to the tasks of question-answer generation and distractor generation. We show that our models outperform strong baselines using both automated metrics and human raters. We provide a case study of running weekly quizzes on real-world users via the Google Surveys platform over the course of two months. We found that users generally found the automatically generated questions to be educational and enjoyable. Finally, to serve the research community, we are releasing the NewsQuizQA dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.09094v1-abstract-full').style.display = 'none'; document.getElementById('2102.09094v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2021. </p> </li> </ol> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

Pages: 1 2 3 4 5 6 7 8 9 10