<!-- CINXE.COM -->

<!-- Search | arXiv e-print repository -->

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script 
src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. 
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" 
role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;21 of 21 results for author: <span class="mathjax">Sanh, V</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Sanh%2C+V">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Sanh, V"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Sanh%2C+V&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option 
value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Sanh, V"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.12637">arXiv:2408.12637</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.12637">pdf</a>, <a href="https://arxiv.org/format/2408.12637">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Building and better understanding vision-language models: insights and future directions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lauren%C3%A7on%2C+H">Hugo Laurençon</a>, <a href="/search/cs?searchtype=author&amp;query=Marafioti%2C+A">Andrés Marafioti</a>, <a href="/search/cs?searchtype=author&amp;query=Sanh%2C+V">Victor Sanh</a>, <a href="/search/cs?searchtype=author&amp;query=Tronchon%2C+L">Léo Tronchon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.12637v1-abstract-short" style="display: inline;"> The field of 
vision-language models (VLMs), which take images and texts as inputs and output texts, is rapidly evolving and has yet to reach consensus on several key aspects of the development pipeline, including data, architecture, and training methods. This paper can be seen as a tutorial for building a VLM. We begin by providing a comprehensive overview of the current state-of-the-art approache&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.12637v1-abstract-full').style.display = 'inline'; document.getElementById('2408.12637v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.12637v1-abstract-full" style="display: none;"> The field of vision-language models (VLMs), which take images and texts as inputs and output texts, is rapidly evolving and has yet to reach consensus on several key aspects of the development pipeline, including data, architecture, and training methods. This paper can be seen as a tutorial for building a VLM. We begin by providing a comprehensive overview of the current state-of-the-art approaches, highlighting the strengths and weaknesses of each, addressing the major challenges in the field, and suggesting promising research directions for underexplored areas. We then walk through the practical steps to build Idefics3-8B, a powerful VLM that significantly outperforms its predecessor Idefics2-8B, while being trained efficiently, exclusively on open datasets, and using a straightforward pipeline. These steps include the creation of Docmatix, a dataset for improving document understanding capabilities, which is 240 times larger than previously available datasets. We release the model along with the datasets created for its training. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.12637v1-abstract-full').style.display = 'none'; document.getElementById('2408.12637v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.16746">arXiv:2406.16746</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.16746">pdf</a>, <a href="https://arxiv.org/format/2406.16746">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> The Responsible Foundation Model Development Cheatsheet: A Review of Tools &amp; Resources </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Longpre%2C+S">Shayne Longpre</a>, <a href="/search/cs?searchtype=author&amp;query=Biderman%2C+S">Stella Biderman</a>, <a href="/search/cs?searchtype=author&amp;query=Albalak%2C+A">Alon Albalak</a>, <a href="/search/cs?searchtype=author&amp;query=Schoelkopf%2C+H">Hailey Schoelkopf</a>, <a href="/search/cs?searchtype=author&amp;query=McDuff%2C+D">Daniel McDuff</a>, <a href="/search/cs?searchtype=author&amp;query=Kapoor%2C+S">Sayash Kapoor</a>, <a href="/search/cs?searchtype=author&amp;query=Klyman%2C+K">Kevin Klyman</a>, <a href="/search/cs?searchtype=author&amp;query=Lo%2C+K">Kyle Lo</a>, <a 
href="/search/cs?searchtype=author&amp;query=Ilharco%2C+G">Gabriel Ilharco</a>, <a href="/search/cs?searchtype=author&amp;query=San%2C+N">Nay San</a>, <a href="/search/cs?searchtype=author&amp;query=Rauh%2C+M">Maribeth Rauh</a>, <a href="/search/cs?searchtype=author&amp;query=Skowron%2C+A">Aviya Skowron</a>, <a href="/search/cs?searchtype=author&amp;query=Vidgen%2C+B">Bertie Vidgen</a>, <a href="/search/cs?searchtype=author&amp;query=Weidinger%2C+L">Laura Weidinger</a>, <a href="/search/cs?searchtype=author&amp;query=Narayanan%2C+A">Arvind Narayanan</a>, <a href="/search/cs?searchtype=author&amp;query=Sanh%2C+V">Victor Sanh</a>, <a href="/search/cs?searchtype=author&amp;query=Adelani%2C+D">David Adelani</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+P">Percy Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Bommasani%2C+R">Rishi Bommasani</a>, <a href="/search/cs?searchtype=author&amp;query=Henderson%2C+P">Peter Henderson</a>, <a href="/search/cs?searchtype=author&amp;query=Luccioni%2C+S">Sasha Luccioni</a>, <a href="/search/cs?searchtype=author&amp;query=Jernite%2C+Y">Yacine Jernite</a>, <a href="/search/cs?searchtype=author&amp;query=Soldaini%2C+L">Luca Soldaini</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.16746v3-abstract-short" style="display: inline;"> Foundation model development attracts a rapidly expanding body of contributors, scientists, and applications. To help shape responsible development practices, we introduce the Foundation Model Development Cheatsheet: a growing collection of 250+ tools and resources spanning text, vision, and speech modalities. We draw on a large body of prior work to survey resources (e.g. 
software, documentation,&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.16746v3-abstract-full').style.display = 'inline'; document.getElementById('2406.16746v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.16746v3-abstract-full" style="display: none;"> Foundation model development attracts a rapidly expanding body of contributors, scientists, and applications. To help shape responsible development practices, we introduce the Foundation Model Development Cheatsheet: a growing collection of 250+ tools and resources spanning text, vision, and speech modalities. We draw on a large body of prior work to survey resources (e.g. software, documentation, frameworks, guides, and practical tools) that support informed data selection, processing, and understanding, precise and limitation-aware artifact documentation, efficient model training, advance awareness of the environmental impact from training, careful model evaluation of capabilities, risks, and claims, as well as responsible model release, licensing and deployment practices. We hope this curated collection of resources helps guide more responsible development. The process of curating this list, enabled us to review the AI development ecosystem, revealing what tools are critically missing, misused, or over-used in existing practices. We find that (i) tools for data sourcing, model evaluation, and monitoring are critically under-serving ethical and real-world needs, (ii) evaluations for model safety, capabilities, and environmental impact all lack reproducibility and transparency, (iii) text and particularly English-centric analyses continue to dominate over multilingual and multi-modal analyses, and (iv) evaluation of systems, rather than just models, is needed so that capabilities and impact are assessed in context. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.16746v3-abstract-full').style.display = 'none'; document.getElementById('2406.16746v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.02246">arXiv:2405.02246</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.02246">pdf</a>, <a href="https://arxiv.org/format/2405.02246">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> What matters when building vision-language models? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lauren%C3%A7on%2C+H">Hugo Laurençon</a>, <a href="/search/cs?searchtype=author&amp;query=Tronchon%2C+L">Léo Tronchon</a>, <a href="/search/cs?searchtype=author&amp;query=Cord%2C+M">Matthieu Cord</a>, <a href="/search/cs?searchtype=author&amp;query=Sanh%2C+V">Victor Sanh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.02246v1-abstract-short" style="display: inline;"> The growing interest in vision-language models (VLMs) has been driven by improvements in large language models and vision transformers. 
Despite the abundance of literature on this subject, we observe that critical decisions regarding the design of VLMs are often not justified. We argue that these unsupported decisions impede progress in the field by making it difficult to identify which choices im&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.02246v1-abstract-full').style.display = 'inline'; document.getElementById('2405.02246v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.02246v1-abstract-full" style="display: none;"> The growing interest in vision-language models (VLMs) has been driven by improvements in large language models and vision transformers. Despite the abundance of literature on this subject, we observe that critical decisions regarding the design of VLMs are often not justified. We argue that these unsupported decisions impede progress in the field by making it difficult to identify which choices improve model performance. To address this issue, we conduct extensive experiments around pre-trained models, architecture choice, data, and training methods. Our consolidation of findings includes the development of Idefics2, an efficient foundational VLM of 8 billion parameters. Idefics2 achieves state-of-the-art performance within its size category across various multimodal benchmarks, and is often on par with models four times its size. We release the model (base, instructed, and chat) along with the datasets created for its training. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.02246v1-abstract-full').style.display = 'none'; document.getElementById('2405.02246v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.09029">arXiv:2403.09029</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.09029">pdf</a>, <a href="https://arxiv.org/format/2403.09029">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Unlocking the conversion of Web Screenshots into HTML Code with the WebSight Dataset </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lauren%C3%A7on%2C+H">Hugo Laurençon</a>, <a href="/search/cs?searchtype=author&amp;query=Tronchon%2C+L">Léo Tronchon</a>, <a href="/search/cs?searchtype=author&amp;query=Sanh%2C+V">Victor Sanh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.09029v1-abstract-short" style="display: inline;"> Using vision-language models (VLMs) in web development presents a promising strategy to increase efficiency and unblock no-code solutions: by providing a screenshot or a sketch of a UI, a 
VLM could generate the code to reproduce it, for instance in a language like HTML. Despite the advancements in VLMs for various tasks, the specific challenge of converting a screenshot into a corresponding HTML h&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.09029v1-abstract-full').style.display = 'inline'; document.getElementById('2403.09029v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.09029v1-abstract-full" style="display: none;"> Using vision-language models (VLMs) in web development presents a promising strategy to increase efficiency and unblock no-code solutions: by providing a screenshot or a sketch of a UI, a VLM could generate the code to reproduce it, for instance in a language like HTML. Despite the advancements in VLMs for various tasks, the specific challenge of converting a screenshot into a corresponding HTML has been minimally explored. We posit that this is mainly due to the absence of a suitable, high-quality dataset. This work introduces WebSight, a synthetic dataset consisting of 2 million pairs of HTML codes and their corresponding screenshots. We fine-tune a foundational VLM on our dataset and show proficiency in converting webpage screenshots to functional HTML code. To accelerate the research in this area, we open-source WebSight. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.09029v1-abstract-full').style.display = 'none'; document.getElementById('2403.09029v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.16527">arXiv:2306.16527</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2306.16527">pdf</a>, <a href="https://arxiv.org/format/2306.16527">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lauren%C3%A7on%2C+H">Hugo Laurençon</a>, <a href="/search/cs?searchtype=author&amp;query=Saulnier%2C+L">Lucile Saulnier</a>, <a href="/search/cs?searchtype=author&amp;query=Tronchon%2C+L">Léo Tronchon</a>, <a href="/search/cs?searchtype=author&amp;query=Bekman%2C+S">Stas Bekman</a>, <a href="/search/cs?searchtype=author&amp;query=Singh%2C+A">Amanpreet Singh</a>, <a href="/search/cs?searchtype=author&amp;query=Lozhkov%2C+A">Anton Lozhkov</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+T">Thomas Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Karamcheti%2C+S">Siddharth Karamcheti</a>, <a href="/search/cs?searchtype=author&amp;query=Rush%2C+A+M">Alexander M. 
Rush</a>, <a href="/search/cs?searchtype=author&amp;query=Kiela%2C+D">Douwe Kiela</a>, <a href="/search/cs?searchtype=author&amp;query=Cord%2C+M">Matthieu Cord</a>, <a href="/search/cs?searchtype=author&amp;query=Sanh%2C+V">Victor Sanh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.16527v2-abstract-short" style="display: inline;"> Large multimodal models trained on natural documents, which interleave images and text, outperform models trained on image-text pairs on various multimodal benchmarks. However, the datasets used to train these models have not been released, and the collection process has not been fully specified. We introduce the OBELICS dataset, an open web-scale filtered dataset of interleaved image-text documen&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.16527v2-abstract-full').style.display = 'inline'; document.getElementById('2306.16527v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.16527v2-abstract-full" style="display: none;"> Large multimodal models trained on natural documents, which interleave images and text, outperform models trained on image-text pairs on various multimodal benchmarks. However, the datasets used to train these models have not been released, and the collection process has not been fully specified. We introduce the OBELICS dataset, an open web-scale filtered dataset of interleaved image-text documents comprising 141 million web pages extracted from Common Crawl, 353 million associated images, and 115 billion text tokens. We describe the dataset creation process, present comprehensive filtering rules, and provide an analysis of the dataset&#39;s content. 
To show the viability of OBELICS, we train vision and language models of 9 and 80 billion parameters named IDEFICS, and obtain competitive performance on different multimodal benchmarks. We release our dataset, models and code. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.16527v2-abstract-full').style.display = 'none'; document.getElementById('2306.16527v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.05100">arXiv:2211.05100</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2211.05100">pdf</a>, <a href="https://arxiv.org/format/2211.05100">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> BLOOM: A 176B-Parameter Open-Access Multilingual Language Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Workshop%2C+B">BigScience Workshop</a>, <a href="/search/cs?searchtype=author&amp;query=%3A"> :</a>, <a href="/search/cs?searchtype=author&amp;query=Scao%2C+T+L">Teven Le Scao</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+A">Angela Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Akiki%2C+C">Christopher Akiki</a>, <a href="/search/cs?searchtype=author&amp;query=Pavlick%2C+E">Ellie Pavlick</a>, <a href="/search/cs?searchtype=author&amp;query=Ili%C4%87%2C+S">Suzana Ilić</a>, <a 
href="/search/cs?searchtype=author&amp;query=Hesslow%2C+D">Daniel Hesslow</a>, <a href="/search/cs?searchtype=author&amp;query=Castagn%C3%A9%2C+R">Roman Castagné</a>, <a href="/search/cs?searchtype=author&amp;query=Luccioni%2C+A+S">Alexandra Sasha Luccioni</a>, <a href="/search/cs?searchtype=author&amp;query=Yvon%2C+F">François Yvon</a>, <a href="/search/cs?searchtype=author&amp;query=Gall%C3%A9%2C+M">Matthias Gallé</a>, <a href="/search/cs?searchtype=author&amp;query=Tow%2C+J">Jonathan Tow</a>, <a href="/search/cs?searchtype=author&amp;query=Rush%2C+A+M">Alexander M. Rush</a>, <a href="/search/cs?searchtype=author&amp;query=Biderman%2C+S">Stella Biderman</a>, <a href="/search/cs?searchtype=author&amp;query=Webson%2C+A">Albert Webson</a>, <a href="/search/cs?searchtype=author&amp;query=Ammanamanchi%2C+P+S">Pawan Sasanka Ammanamanchi</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+T">Thomas Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Sagot%2C+B">Benoît Sagot</a>, <a href="/search/cs?searchtype=author&amp;query=Muennighoff%2C+N">Niklas Muennighoff</a>, <a href="/search/cs?searchtype=author&amp;query=del+Moral%2C+A+V">Albert Villanova del Moral</a>, <a href="/search/cs?searchtype=author&amp;query=Ruwase%2C+O">Olatunji Ruwase</a>, <a href="/search/cs?searchtype=author&amp;query=Bawden%2C+R">Rachel Bawden</a>, <a href="/search/cs?searchtype=author&amp;query=Bekman%2C+S">Stas Bekman</a>, <a href="/search/cs?searchtype=author&amp;query=McMillan-Major%2C+A">Angelina McMillan-Major</a> , et al. (369 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.05100v4-abstract-short" style="display: inline;"> Large language models (LLMs) have been shown to be able to perform new tasks based on a few demonstrations or natural language instructions. 
While these capabilities have led to widespread adoption, most LLMs are developed by resource-rich organizations and are frequently kept from the public. As a step towards democratizing this powerful technology, we present BLOOM, a 176B-parameter open-access&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.05100v4-abstract-full').style.display = 'inline'; document.getElementById('2211.05100v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.05100v4-abstract-full" style="display: none;"> Large language models (LLMs) have been shown to be able to perform new tasks based on a few demonstrations or natural language instructions. While these capabilities have led to widespread adoption, most LLMs are developed by resource-rich organizations and are frequently kept from the public. As a step towards democratizing this powerful technology, we present BLOOM, a 176B-parameter open-access language model designed and built thanks to a collaboration of hundreds of researchers. BLOOM is a decoder-only Transformer language model that was trained on the ROOTS corpus, a dataset comprising hundreds of sources in 46 natural and 13 programming languages (59 in total). We find that BLOOM achieves competitive performance on a wide variety of benchmarks, with stronger results after undergoing multitask prompted finetuning. To facilitate future research and applications using LLMs, we publicly release our models and code under the Responsible AI License. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.05100v4-abstract-full').style.display = 'none'; document.getElementById('2211.05100v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2210.15424">arXiv:2210.15424</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2210.15424">pdf</a>, <a href="https://arxiv.org/format/2210.15424">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> What Language Model to Train if You Have One Million GPU Hours? 
</p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Scao%2C+T+L">Teven Le Scao</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+T">Thomas Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Hesslow%2C+D">Daniel Hesslow</a>, <a href="/search/cs?searchtype=author&amp;query=Saulnier%2C+L">Lucile Saulnier</a>, <a href="/search/cs?searchtype=author&amp;query=Bekman%2C+S">Stas Bekman</a>, <a href="/search/cs?searchtype=author&amp;query=Bari%2C+M+S">M Saiful Bari</a>, <a href="/search/cs?searchtype=author&amp;query=Biderman%2C+S">Stella Biderman</a>, <a href="/search/cs?searchtype=author&amp;query=Elsahar%2C+H">Hady Elsahar</a>, <a href="/search/cs?searchtype=author&amp;query=Muennighoff%2C+N">Niklas Muennighoff</a>, <a href="/search/cs?searchtype=author&amp;query=Phang%2C+J">Jason Phang</a>, <a href="/search/cs?searchtype=author&amp;query=Press%2C+O">Ofir Press</a>, <a href="/search/cs?searchtype=author&amp;query=Raffel%2C+C">Colin Raffel</a>, <a href="/search/cs?searchtype=author&amp;query=Sanh%2C+V">Victor Sanh</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+S">Sheng Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Sutawika%2C+L">Lintang Sutawika</a>, <a href="/search/cs?searchtype=author&amp;query=Tae%2C+J">Jaesung Tae</a>, <a href="/search/cs?searchtype=author&amp;query=Yong%2C+Z+X">Zheng Xin Yong</a>, <a href="/search/cs?searchtype=author&amp;query=Launay%2C+J">Julien Launay</a>, <a href="/search/cs?searchtype=author&amp;query=Beltagy%2C+I">Iz Beltagy</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2210.15424v2-abstract-short" style="display: inline;"> The crystallization of modeling methods around the Transformer architecture has been a boon for practitioners. 
Simple, well-motivated architectural variations can transfer across tasks and scale, increasing the impact of modeling research. However, with the emergence of state-of-the-art 100B+ parameters models, large language models are increasingly expensive to accurately design and train. Notabl&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.15424v2-abstract-full').style.display = 'inline'; document.getElementById('2210.15424v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2210.15424v2-abstract-full" style="display: none;"> The crystallization of modeling methods around the Transformer architecture has been a boon for practitioners. Simple, well-motivated architectural variations can transfer across tasks and scale, increasing the impact of modeling research. However, with the emergence of state-of-the-art 100B+ parameters models, large language models are increasingly expensive to accurately design and train. Notably, it can be difficult to evaluate how modeling decisions may impact emergent capabilities, given that these capabilities arise mainly from sheer scale alone. In the process of building BLOOM--the Big Science Large Open-science Open-access Multilingual language model--our goal is to identify an architecture and training setup that makes the best use of our 1,000,000 A100-GPU-hours budget. Specifically, we perform an ablation study at the billion-parameter scale comparing different modeling practices and their impact on zero-shot generalization. In addition, we study the impact of various popular pre-training corpora on zero-shot generalization. We also study the performance of a multilingual model and how it compares to the English-only one. Finally, we consider the scaling behaviour of Transformers to choose the target model size, shape, and training setup. 
All our models and code are open-sourced at https://huggingface.co/bigscience . <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.15424v2-abstract-full').style.display = 'none'; document.getElementById('2210.15424v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Findings of EMNLP 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2208.07852">arXiv:2208.07852</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2208.07852">pdf</a>, <a href="https://arxiv.org/format/2208.07852">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Interactive and Visual Prompt Engineering for Ad-hoc Task Adaptation with Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Strobelt%2C+H">Hendrik Strobelt</a>, <a href="/search/cs?searchtype=author&amp;query=Webson%2C+A">Albert Webson</a>, <a href="/search/cs?searchtype=author&amp;query=Sanh%2C+V">Victor Sanh</a>, <a 
href="/search/cs?searchtype=author&amp;query=Hoover%2C+B">Benjamin Hoover</a>, <a href="/search/cs?searchtype=author&amp;query=Beyer%2C+J">Johanna Beyer</a>, <a href="/search/cs?searchtype=author&amp;query=Pfister%2C+H">Hanspeter Pfister</a>, <a href="/search/cs?searchtype=author&amp;query=Rush%2C+A+M">Alexander M. Rush</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2208.07852v1-abstract-short" style="display: inline;"> State-of-the-art neural language models can now be used to solve ad-hoc language tasks through zero-shot prompting without the need for supervised training. This approach has gained popularity in recent years, and researchers have demonstrated prompts that achieve strong accuracy on specific NLP tasks. However, finding a prompt for new tasks requires experimentation. Different prompt templates wit&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.07852v1-abstract-full').style.display = 'inline'; document.getElementById('2208.07852v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2208.07852v1-abstract-full" style="display: none;"> State-of-the-art neural language models can now be used to solve ad-hoc language tasks through zero-shot prompting without the need for supervised training. This approach has gained popularity in recent years, and researchers have demonstrated prompts that achieve strong accuracy on specific NLP tasks. However, finding a prompt for new tasks requires experimentation. Different prompt templates with different wording choices lead to significant accuracy differences. PromptIDE allows users to experiment with prompt variations, visualize prompt performance, and iteratively optimize prompts. 
We developed a workflow that allows users to first focus on model feedback using small data before moving on to a large data regime that allows empirical grounding of promising prompts using quantitative measures of the task. The tool then allows easy deployment of the newly created ad-hoc models. We demonstrate the utility of PromptIDE (demo at http://prompt.vizhub.ai) and our workflow using several real-world use cases. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.07852v1-abstract-full').style.display = 'none'; document.getElementById('2208.07852v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 August, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages content, 2 pages references</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2202.01279">arXiv:2202.01279</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2202.01279">pdf</a>, <a href="https://arxiv.org/format/2202.01279">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> PromptSource: An Integrated Development Environment and Repository for Natural Language Prompts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bach%2C+S+H">Stephen H. 
Bach</a>, <a href="/search/cs?searchtype=author&amp;query=Sanh%2C+V">Victor Sanh</a>, <a href="/search/cs?searchtype=author&amp;query=Yong%2C+Z">Zheng-Xin Yong</a>, <a href="/search/cs?searchtype=author&amp;query=Webson%2C+A">Albert Webson</a>, <a href="/search/cs?searchtype=author&amp;query=Raffel%2C+C">Colin Raffel</a>, <a href="/search/cs?searchtype=author&amp;query=Nayak%2C+N+V">Nihal V. Nayak</a>, <a href="/search/cs?searchtype=author&amp;query=Sharma%2C+A">Abheesht Sharma</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+T">Taewoon Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Bari%2C+M+S">M Saiful Bari</a>, <a href="/search/cs?searchtype=author&amp;query=Fevry%2C+T">Thibault Fevry</a>, <a href="/search/cs?searchtype=author&amp;query=Alyafeai%2C+Z">Zaid Alyafeai</a>, <a href="/search/cs?searchtype=author&amp;query=Dey%2C+M">Manan Dey</a>, <a href="/search/cs?searchtype=author&amp;query=Santilli%2C+A">Andrea Santilli</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Z">Zhiqing Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Ben-David%2C+S">Srulik Ben-David</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+C">Canwen Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Chhablani%2C+G">Gunjan Chhablani</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Han Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Fries%2C+J+A">Jason Alan Fries</a>, <a href="/search/cs?searchtype=author&amp;query=Al-shaibani%2C+M+S">Maged S. Al-shaibani</a>, <a href="/search/cs?searchtype=author&amp;query=Sharma%2C+S">Shanya Sharma</a>, <a href="/search/cs?searchtype=author&amp;query=Thakker%2C+U">Urmish Thakker</a>, <a href="/search/cs?searchtype=author&amp;query=Almubarak%2C+K">Khalid Almubarak</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+X">Xiangru Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Radev%2C+D">Dragomir Radev</a> , et al. 
(2 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2202.01279v3-abstract-short" style="display: inline;"> PromptSource is a system for creating, sharing, and using natural language prompts. Prompts are functions that map an example from a dataset to a natural language input and target output. Using prompts to train and query language models is an emerging area in NLP that requires new tools that let users develop and refine these prompts collaboratively. PromptSource addresses the emergent challenges&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.01279v3-abstract-full').style.display = 'inline'; document.getElementById('2202.01279v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2202.01279v3-abstract-full" style="display: none;"> PromptSource is a system for creating, sharing, and using natural language prompts. Prompts are functions that map an example from a dataset to a natural language input and target output. Using prompts to train and query language models is an emerging area in NLP that requires new tools that let users develop and refine these prompts collaboratively. PromptSource addresses the emergent challenges in this new setting with (1) a templating language for defining data-linked prompts, (2) an interface that lets users quickly iterate on prompt development by observing outputs of their prompts on many examples, and (3) a community-driven set of guidelines for contributing new prompts to a common pool. Over 2,000 prompts for roughly 170 datasets are already available in PromptSource. PromptSource is available at https://github.com/bigscience-workshop/promptsource. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.01279v3-abstract-full').style.display = 'none'; document.getElementById('2202.01279v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 February, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ACL 2022 Demo</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2110.08207">arXiv:2110.08207</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2110.08207">pdf</a>, <a href="https://arxiv.org/format/2110.08207">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Multitask Prompted Training Enables Zero-Shot Task Generalization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sanh%2C+V">Victor Sanh</a>, <a href="/search/cs?searchtype=author&amp;query=Webson%2C+A">Albert Webson</a>, <a href="/search/cs?searchtype=author&amp;query=Raffel%2C+C">Colin Raffel</a>, <a href="/search/cs?searchtype=author&amp;query=Bach%2C+S+H">Stephen H. 
Bach</a>, <a href="/search/cs?searchtype=author&amp;query=Sutawika%2C+L">Lintang Sutawika</a>, <a href="/search/cs?searchtype=author&amp;query=Alyafeai%2C+Z">Zaid Alyafeai</a>, <a href="/search/cs?searchtype=author&amp;query=Chaffin%2C+A">Antoine Chaffin</a>, <a href="/search/cs?searchtype=author&amp;query=Stiegler%2C+A">Arnaud Stiegler</a>, <a href="/search/cs?searchtype=author&amp;query=Scao%2C+T+L">Teven Le Scao</a>, <a href="/search/cs?searchtype=author&amp;query=Raja%2C+A">Arun Raja</a>, <a href="/search/cs?searchtype=author&amp;query=Dey%2C+M">Manan Dey</a>, <a href="/search/cs?searchtype=author&amp;query=Bari%2C+M+S">M Saiful Bari</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+C">Canwen Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Thakker%2C+U">Urmish Thakker</a>, <a href="/search/cs?searchtype=author&amp;query=Sharma%2C+S+S">Shanya Sharma Sharma</a>, <a href="/search/cs?searchtype=author&amp;query=Szczechla%2C+E">Eliza Szczechla</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+T">Taewoon Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Chhablani%2C+G">Gunjan Chhablani</a>, <a href="/search/cs?searchtype=author&amp;query=Nayak%2C+N">Nihal Nayak</a>, <a href="/search/cs?searchtype=author&amp;query=Datta%2C+D">Debajyoti Datta</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+J">Jonathan Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+M+T">Mike Tian-Jian Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Han Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Manica%2C+M">Matteo Manica</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+S">Sheng Shen</a> , et al. 
(16 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2110.08207v3-abstract-short" style="display: inline;"> Large language models have recently been shown to attain reasonable zero-shot generalization on a diverse set of tasks (Brown et al., 2020). It has been hypothesized that this is a consequence of implicit multitask learning in language models&#39; pretraining (Radford et al., 2019). Can zero-shot generalization instead be directly induced by explicit multitask learning? To test this question at scale,&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.08207v3-abstract-full').style.display = 'inline'; document.getElementById('2110.08207v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2110.08207v3-abstract-full" style="display: none;"> Large language models have recently been shown to attain reasonable zero-shot generalization on a diverse set of tasks (Brown et al., 2020). It has been hypothesized that this is a consequence of implicit multitask learning in language models&#39; pretraining (Radford et al., 2019). Can zero-shot generalization instead be directly induced by explicit multitask learning? To test this question at scale, we develop a system for easily mapping any natural language tasks into a human-readable prompted form. We convert a large set of supervised datasets, each with multiple prompts with diverse wording. These prompted datasets allow for benchmarking the ability of a model to perform completely held-out tasks. We fine-tune a pretrained encoder-decoder model (Raffel et al., 2020; Lester et al., 2021) on this multitask mixture covering a wide variety of tasks. 
The model attains strong zero-shot performance on several standard datasets, often outperforming models up to 16x its size. Further, our approach attains strong performance on a subset of tasks from the BIG-bench benchmark, outperforming models up to 6x its size. All trained models are available at https://github.com/bigscience-workshop/t-zero and all prompts are available at https://github.com/bigscience-workshop/promptsource. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.08207v3-abstract-full').style.display = 'none'; document.getElementById('2110.08207v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICLR 2022 Spotlight (with extended discussion)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2109.04838">arXiv:2109.04838</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2109.04838">pdf</a>, <a href="https://arxiv.org/format/2109.04838">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Block Pruning For Faster Transformers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lagunas%2C+F">François Lagunas</a>, <a 
href="/search/cs?searchtype=author&amp;query=Charlaix%2C+E">Ella Charlaix</a>, <a href="/search/cs?searchtype=author&amp;query=Sanh%2C+V">Victor Sanh</a>, <a href="/search/cs?searchtype=author&amp;query=Rush%2C+A+M">Alexander M. Rush</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2109.04838v1-abstract-short" style="display: inline;"> Pre-training has improved model accuracy for both classification and generation tasks at the cost of introducing much larger and slower models. Pruning methods have proven to be an effective way of reducing model size, whereas distillation methods are proven for speeding up inference. We introduce a block pruning approach targeting both small and fast models. Our approach extends structured method&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2109.04838v1-abstract-full').style.display = 'inline'; document.getElementById('2109.04838v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2109.04838v1-abstract-full" style="display: none;"> Pre-training has improved model accuracy for both classification and generation tasks at the cost of introducing much larger and slower models. Pruning methods have proven to be an effective way of reducing model size, whereas distillation methods are proven for speeding up inference. We introduce a block pruning approach targeting both small and fast models. Our approach extends structured methods by considering blocks of any size and integrates this structure into the movement pruning paradigm for fine-tuning. We find that this approach learns to prune out full components of the underlying model, such as attention heads. 
Experiments consider classification and generation tasks, yielding among other results a pruned model that is a 2.4x faster, 74% smaller BERT on SQuAD v1, with a 1% drop on F1, competitive both with distilled models in speed and pruned models in size. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2109.04838v1-abstract-full').style.display = 'none'; document.getElementById('2109.04838v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 September, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">EMNLP 2021. Code, hyper-parameters, evaluation results and checkpoints available at https://github.com/huggingface/nn_pruning</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.6; I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2109.04144">arXiv:2109.04144</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2109.04144">pdf</a>, <a href="https://arxiv.org/format/2109.04144">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Avoiding Inference Heuristics in Few-shot Prompt-based Finetuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Utama%2C+P+A">Prasetya Ajie Utama</a>, <a 
href="/search/cs?searchtype=author&amp;query=Moosavi%2C+N+S">Nafise Sadat Moosavi</a>, <a href="/search/cs?searchtype=author&amp;query=Sanh%2C+V">Victor Sanh</a>, <a href="/search/cs?searchtype=author&amp;query=Gurevych%2C+I">Iryna Gurevych</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2109.04144v1-abstract-short" style="display: inline;"> Recent prompt-based approaches allow pretrained language models to achieve strong performances on few-shot finetuning by reformulating downstream tasks as a language modeling problem. In this work, we demonstrate that, despite its advantages on low data regimes, finetuned prompt-based models for sentence pair classification tasks still suffer from a common pitfall of adopting inference heuristics&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2109.04144v1-abstract-full').style.display = 'inline'; document.getElementById('2109.04144v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2109.04144v1-abstract-full" style="display: none;"> Recent prompt-based approaches allow pretrained language models to achieve strong performances on few-shot finetuning by reformulating downstream tasks as a language modeling problem. In this work, we demonstrate that, despite its advantages on low data regimes, finetuned prompt-based models for sentence pair classification tasks still suffer from a common pitfall of adopting inference heuristics based on lexical overlap, e.g., models incorrectly assuming a sentence pair is of the same meaning because they consist of the same set of words. 
Interestingly, we find that this particular inference heuristic is significantly less present in the zero-shot evaluation of the prompt-based model, indicating how finetuning can be destructive to useful knowledge learned during the pretraining. We then show that adding a regularization that preserves pretraining weights is effective in mitigating this destructive tendency of few-shot finetuning. Our evaluation on three datasets demonstrates promising improvements on the three corresponding challenge datasets used to diagnose the inference heuristics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2109.04144v1-abstract-full').style.display = 'none'; document.getElementById('2109.04144v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 September, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2021. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at EMNLP 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2109.02846">arXiv:2109.02846</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2109.02846">pdf</a>, <a href="https://arxiv.org/format/2109.02846">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Datasets: A Community Library for Natural Language Processing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lhoest%2C+Q">Quentin Lhoest</a>, <a href="/search/cs?searchtype=author&amp;query=del+Moral%2C+A+V">Albert Villanova del Moral</a>, <a href="/search/cs?searchtype=author&amp;query=Jernite%2C+Y">Yacine Jernite</a>, <a href="/search/cs?searchtype=author&amp;query=Thakur%2C+A">Abhishek Thakur</a>, <a href="/search/cs?searchtype=author&amp;query=von+Platen%2C+P">Patrick von Platen</a>, <a href="/search/cs?searchtype=author&amp;query=Patil%2C+S">Suraj Patil</a>, <a href="/search/cs?searchtype=author&amp;query=Chaumond%2C+J">Julien Chaumond</a>, <a href="/search/cs?searchtype=author&amp;query=Drame%2C+M">Mariama Drame</a>, <a href="/search/cs?searchtype=author&amp;query=Plu%2C+J">Julien Plu</a>, <a href="/search/cs?searchtype=author&amp;query=Tunstall%2C+L">Lewis Tunstall</a>, <a href="/search/cs?searchtype=author&amp;query=Davison%2C+J">Joe Davison</a>, <a href="/search/cs?searchtype=author&amp;query=%C5%A0a%C5%A1ko%2C+M">Mario Šaško</a>, <a href="/search/cs?searchtype=author&amp;query=Chhablani%2C+G">Gunjan Chhablani</a>, <a href="/search/cs?searchtype=author&amp;query=Malik%2C+B">Bhavitvya Malik</a>, <a 
href="/search/cs?searchtype=author&amp;query=Brandeis%2C+S">Simon Brandeis</a>, <a href="/search/cs?searchtype=author&amp;query=Scao%2C+T+L">Teven Le Scao</a>, <a href="/search/cs?searchtype=author&amp;query=Sanh%2C+V">Victor Sanh</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+C">Canwen Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Patry%2C+N">Nicolas Patry</a>, <a href="/search/cs?searchtype=author&amp;query=McMillan-Major%2C+A">Angelina McMillan-Major</a>, <a href="/search/cs?searchtype=author&amp;query=Schmid%2C+P">Philipp Schmid</a>, <a href="/search/cs?searchtype=author&amp;query=Gugger%2C+S">Sylvain Gugger</a>, <a href="/search/cs?searchtype=author&amp;query=Delangue%2C+C">Clément Delangue</a>, <a href="/search/cs?searchtype=author&amp;query=Matussi%C3%A8re%2C+T">Théo Matussière</a>, <a href="/search/cs?searchtype=author&amp;query=Debut%2C+L">Lysandre Debut</a> , et al. (7 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2109.02846v1-abstract-short" style="display: inline;"> The scale, variety, and quantity of publicly-available NLP datasets has grown rapidly as researchers propose new tasks, larger models, and novel benchmarks. Datasets is a community library for contemporary NLP designed to support this ecosystem. 
Datasets aims to standardize end-user interfaces, versioning, and documentation, while providing a lightweight front-end that behaves similarly for small&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2109.02846v1-abstract-full').style.display = 'inline'; document.getElementById('2109.02846v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2109.02846v1-abstract-full" style="display: none;"> The scale, variety, and quantity of publicly-available NLP datasets has grown rapidly as researchers propose new tasks, larger models, and novel benchmarks. Datasets is a community library for contemporary NLP designed to support this ecosystem. Datasets aims to standardize end-user interfaces, versioning, and documentation, while providing a lightweight front-end that behaves similarly for small datasets as for internet-scale corpora. The design of the library incorporates a distributed, community-driven approach to adding datasets and documenting usage. After a year of development, the library now includes more than 650 unique datasets, has more than 250 contributors, and has helped support a variety of novel cross-dataset research projects and shared tasks. The library is available at https://github.com/huggingface/datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2109.02846v1-abstract-full').style.display = 'none'; document.getElementById('2109.02846v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 September, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2021. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">EMNLP Demo 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2104.03514">arXiv:2104.03514</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2104.03514">pdf</a>, <a href="https://arxiv.org/format/2104.03514">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Low-Complexity Probing via Finding Subnetworks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Cao%2C+S">Steven Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Sanh%2C+V">Victor Sanh</a>, <a href="/search/cs?searchtype=author&amp;query=Rush%2C+A+M">Alexander M. Rush</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2104.03514v1-abstract-short" style="display: inline;"> The dominant approach in probing neural networks for linguistic properties is to train a new shallow multi-layer perceptron (MLP) on top of the model&#39;s internal representations. This approach can detect properties encoded in the model, but at the cost of adding new parameters that may learn the task directly. 
We instead propose a subtractive pruning-based probe, where we find an existing subnetwor&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.03514v1-abstract-full').style.display = 'inline'; document.getElementById('2104.03514v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2104.03514v1-abstract-full" style="display: none;"> The dominant approach in probing neural networks for linguistic properties is to train a new shallow multi-layer perceptron (MLP) on top of the model&#39;s internal representations. This approach can detect properties encoded in the model, but at the cost of adding new parameters that may learn the task directly. We instead propose a subtractive pruning-based probe, where we find an existing subnetwork that performs the linguistic task of interest. Compared to an MLP, the subnetwork probe achieves both higher accuracy on pre-trained models and lower accuracy on random models, so it is both better at finding properties of interest and worse at learning on its own. Next, by varying the complexity of each probe, we show that subnetwork probing Pareto-dominates MLP probing in that it achieves higher accuracy given any budget of probe complexity. Finally, we analyze the resulting subnetworks across various tasks to locate where each task is encoded, and we find that lower-level tasks are captured in lower layers, reproducing similar findings in past work. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.03514v1-abstract-full').style.display = 'none'; document.getElementById('2104.03514v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 April, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2021. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NAACL-HLT 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2012.01300">arXiv:2012.01300</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2012.01300">pdf</a>, <a href="https://arxiv.org/format/2012.01300">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Learning from others&#39; mistakes: Avoiding dataset biases without modeling them </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sanh%2C+V">Victor Sanh</a>, <a href="/search/cs?searchtype=author&amp;query=Wolf%2C+T">Thomas Wolf</a>, <a href="/search/cs?searchtype=author&amp;query=Belinkov%2C+Y">Yonatan Belinkov</a>, <a href="/search/cs?searchtype=author&amp;query=Rush%2C+A+M">Alexander M. Rush</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2012.01300v1-abstract-short" style="display: inline;"> State-of-the-art natural language processing (NLP) models often learn to model dataset biases and surface form correlations instead of features that target the intended underlying task. Previous work has demonstrated effective methods to circumvent these issues when knowledge of the bias is available. 
We consider cases where the bias issues may not be explicitly identified, and show a method for t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2012.01300v1-abstract-full').style.display = 'inline'; document.getElementById('2012.01300v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2012.01300v1-abstract-full" style="display: none;"> State-of-the-art natural language processing (NLP) models often learn to model dataset biases and surface form correlations instead of features that target the intended underlying task. Previous work has demonstrated effective methods to circumvent these issues when knowledge of the bias is available. We consider cases where the bias issues may not be explicitly identified, and show a method for training models that learn to ignore these problematic correlations. Our approach relies on the observation that models with limited capacity primarily learn to exploit biases in the dataset. We can leverage the errors of such limited capacity models to train a more robust model in a product of experts, thus bypassing the need to hand-craft a biased model. We show the effectiveness of this method to retain improvements in out-of-distribution settings even if no particular bias is targeted by the biased model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2012.01300v1-abstract-full').style.display = 'none'; document.getElementById('2012.01300v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 December, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2020. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 6 figures, 6 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2011.14203">arXiv:2011.14203</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2011.14203">pdf</a>, <a href="https://arxiv.org/format/2011.14203">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> EdgeBERT: Sentence-Level Energy Optimizations for Latency-Aware Multi-Task NLP Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tambe%2C+T">Thierry Tambe</a>, <a href="/search/cs?searchtype=author&amp;query=Hooper%2C+C">Coleman Hooper</a>, <a href="/search/cs?searchtype=author&amp;query=Pentecost%2C+L">Lillian Pentecost</a>, <a href="/search/cs?searchtype=author&amp;query=Jia%2C+T">Tianyu Jia</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+E">En-Yu Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Donato%2C+M">Marco Donato</a>, <a href="/search/cs?searchtype=author&amp;query=Sanh%2C+V">Victor Sanh</a>, <a href="/search/cs?searchtype=author&amp;query=Whatmough%2C+P+N">Paul N. Whatmough</a>, <a href="/search/cs?searchtype=author&amp;query=Rush%2C+A+M">Alexander M. 
Rush</a>, <a href="/search/cs?searchtype=author&amp;query=Brooks%2C+D">David Brooks</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+G">Gu-Yeon Wei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2011.14203v5-abstract-short" style="display: inline;"> Transformer-based language models such as BERT provide significant accuracy improvement for a multitude of natural language processing (NLP) tasks. However, their hefty computational and memory demands make them challenging to deploy to resource-constrained edge platforms with strict latency requirements. We present EdgeBERT, an in-depth algorithm-hardware co-design for latency-aware energy optimi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2011.14203v5-abstract-full').style.display = 'inline'; document.getElementById('2011.14203v5-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2011.14203v5-abstract-full" style="display: none;"> Transformer-based language models such as BERT provide significant accuracy improvement for a multitude of natural language processing (NLP) tasks. However, their hefty computational and memory demands make them challenging to deploy to resource-constrained edge platforms with strict latency requirements. We present EdgeBERT, an in-depth algorithm-hardware co-design for latency-aware energy optimization for multi-task NLP. EdgeBERT employs entropy-based early exit predication in order to perform dynamic voltage-frequency scaling (DVFS), at a sentence granularity, for minimal energy consumption while adhering to a prescribed target latency. Computation and memory footprint overheads are further alleviated by employing a calibrated combination of adaptive attention span, selective network pruning, and floating-point quantization. 
Furthermore, in order to maximize the synergistic benefits of these algorithms in always-on and intermediate edge computing settings, we specialize a 12nm scalable hardware accelerator system, integrating a fast-switching low-dropout voltage regulator (LDO), an all-digital phase-locked loop (ADPLL), as well as, high-density embedded non-volatile memories (eNVMs) wherein the sparse floating-point bit encodings of the shared multi-task parameters are carefully stored. Altogether, latency-aware multi-task NLP inference acceleration on the EdgeBERT hardware system generates up to 7x, 2.5x, and 53x lower energy compared to the conventional inference without early stopping, the latency-unbounded early exit approach, and CUDA adaptations on an Nvidia Jetson Tegra X2 mobile GPU, respectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2011.14203v5-abstract-full').style.display = 'none'; document.getElementById('2011.14203v5-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 September, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 November, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages plus references. 
Paper to appear at the 54th IEEE/ACM International Symposium on Microarchitecture (MICRO 2021)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2005.07683">arXiv:2005.07683</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2005.07683">pdf</a>, <a href="https://arxiv.org/format/2005.07683">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Movement Pruning: Adaptive Sparsity by Fine-Tuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sanh%2C+V">Victor Sanh</a>, <a href="/search/cs?searchtype=author&amp;query=Wolf%2C+T">Thomas Wolf</a>, <a href="/search/cs?searchtype=author&amp;query=Rush%2C+A+M">Alexander M. Rush</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2005.07683v2-abstract-short" style="display: inline;"> Magnitude pruning is a widely used strategy for reducing model size in pure supervised learning; however, it is less effective in the transfer learning regime that has become standard for state-of-the-art natural language processing applications. 
We propose the use of movement pruning, a simple, deterministic first-order weight pruning method that is more adaptive to pretrained model fine-tuning.&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2005.07683v2-abstract-full').style.display = 'inline'; document.getElementById('2005.07683v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2005.07683v2-abstract-full" style="display: none;"> Magnitude pruning is a widely used strategy for reducing model size in pure supervised learning; however, it is less effective in the transfer learning regime that has become standard for state-of-the-art natural language processing applications. We propose the use of movement pruning, a simple, deterministic first-order weight pruning method that is more adaptive to pretrained model fine-tuning. We give mathematical foundations to the method and compare it to existing zeroth- and first-order pruning methods. Experiments show that when pruning large pretrained language models, movement pruning shows significant improvements in high-sparsity regimes. When combined with distillation, the approach achieves minimal accuracy loss with down to only 3% of the model parameters. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2005.07683v2-abstract-full').style.display = 'none'; document.getElementById('2005.07683v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 October, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 May, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2020. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 6 figures, 3 tables. Published at NeurIPS2020. Code: \url{huggingface.co/mvp}</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1910.03771">arXiv:1910.03771</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1910.03771">pdf</a>, <a href="https://arxiv.org/format/1910.03771">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> HuggingFace&#39;s Transformers: State-of-the-art Natural Language Processing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wolf%2C+T">Thomas Wolf</a>, <a href="/search/cs?searchtype=author&amp;query=Debut%2C+L">Lysandre Debut</a>, <a href="/search/cs?searchtype=author&amp;query=Sanh%2C+V">Victor Sanh</a>, <a href="/search/cs?searchtype=author&amp;query=Chaumond%2C+J">Julien Chaumond</a>, <a href="/search/cs?searchtype=author&amp;query=Delangue%2C+C">Clement Delangue</a>, <a href="/search/cs?searchtype=author&amp;query=Moi%2C+A">Anthony Moi</a>, <a href="/search/cs?searchtype=author&amp;query=Cistac%2C+P">Pierric Cistac</a>, <a href="/search/cs?searchtype=author&amp;query=Rault%2C+T">Tim Rault</a>, <a href="/search/cs?searchtype=author&amp;query=Louf%2C+R">Rémi Louf</a>, <a href="/search/cs?searchtype=author&amp;query=Funtowicz%2C+M">Morgan Funtowicz</a>, <a href="/search/cs?searchtype=author&amp;query=Davison%2C+J">Joe Davison</a>, <a href="/search/cs?searchtype=author&amp;query=Shleifer%2C+S">Sam Shleifer</a>, <a href="/search/cs?searchtype=author&amp;query=von+Platen%2C+P">Patrick von Platen</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+C">Clara Ma</a>, 
<a href="/search/cs?searchtype=author&amp;query=Jernite%2C+Y">Yacine Jernite</a>, <a href="/search/cs?searchtype=author&amp;query=Plu%2C+J">Julien Plu</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+C">Canwen Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Scao%2C+T+L">Teven Le Scao</a>, <a href="/search/cs?searchtype=author&amp;query=Gugger%2C+S">Sylvain Gugger</a>, <a href="/search/cs?searchtype=author&amp;query=Drame%2C+M">Mariama Drame</a>, <a href="/search/cs?searchtype=author&amp;query=Lhoest%2C+Q">Quentin Lhoest</a>, <a href="/search/cs?searchtype=author&amp;query=Rush%2C+A+M">Alexander M. Rush</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1910.03771v5-abstract-short" style="display: inline;"> Recent progress in natural language processing has been driven by advances in both model architecture and model pretraining. Transformer architectures have facilitated building higher-capacity models and pretraining has made it possible to effectively utilize this capacity for a wide variety of tasks. \textit{Transformers} is an open-source library with the goal of opening up these advances to the&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1910.03771v5-abstract-full').style.display = 'inline'; document.getElementById('1910.03771v5-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1910.03771v5-abstract-full" style="display: none;"> Recent progress in natural language processing has been driven by advances in both model architecture and model pretraining. Transformer architectures have facilitated building higher-capacity models and pretraining has made it possible to effectively utilize this capacity for a wide variety of tasks. 
\textit{Transformers} is an open-source library with the goal of opening up these advances to the wider machine learning community. The library consists of carefully engineered state-of-the art Transformer architectures under a unified API. Backing this library is a curated collection of pretrained models made by and available for the community. \textit{Transformers} is designed to be extensible by researchers, simple for practitioners, and fast and robust in industrial deployments. The library is available at \url{https://github.com/huggingface/transformers}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1910.03771v5-abstract-full').style.display = 'none'; document.getElementById('1910.03771v5-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 July, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 October, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2019. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 4 figures, more details at https://github.com/huggingface/transformers</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1910.01108">arXiv:1910.01108</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1910.01108">pdf</a>, <a href="https://arxiv.org/format/1910.01108">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sanh%2C+V">Victor Sanh</a>, <a href="/search/cs?searchtype=author&amp;query=Debut%2C+L">Lysandre Debut</a>, <a href="/search/cs?searchtype=author&amp;query=Chaumond%2C+J">Julien Chaumond</a>, <a href="/search/cs?searchtype=author&amp;query=Wolf%2C+T">Thomas Wolf</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1910.01108v4-abstract-short" style="display: inline;"> As Transfer Learning from large-scale pre-trained models becomes more prevalent in Natural Language Processing (NLP), operating these large models in on-the-edge and/or under constrained computational training or inference budgets remains challenging. 
In this work, we propose a method to pre-train a smaller general-purpose language representation model, called DistilBERT, which can then be fine-tu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1910.01108v4-abstract-full').style.display = 'inline'; document.getElementById('1910.01108v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1910.01108v4-abstract-full" style="display: none;"> As Transfer Learning from large-scale pre-trained models becomes more prevalent in Natural Language Processing (NLP), operating these large models in on-the-edge and/or under constrained computational training or inference budgets remains challenging. In this work, we propose a method to pre-train a smaller general-purpose language representation model, called DistilBERT, which can then be fine-tuned with good performances on a wide range of tasks like its larger counterparts. While most prior work investigated the use of distillation for building task-specific models, we leverage knowledge distillation during the pre-training phase and show that it is possible to reduce the size of a BERT model by 40%, while retaining 97% of its language understanding capabilities and being 60% faster. To leverage the inductive biases learned by larger models during pre-training, we introduce a triple loss combining language modeling, distillation and cosine-distance losses. Our smaller, faster and lighter model is cheaper to pre-train and we demonstrate its capabilities for on-device computations in a proof-of-concept experiment and a comparative on-device study. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1910.01108v4-abstract-full').style.display = 'none'; document.getElementById('1910.01108v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 February, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 October, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">February 2020 - Revision: fix bug in evaluation metrics, updated metrics, argumentation unchanged. 5 pages, 1 figure, 4 tables. Accepted at the 5th Workshop on Energy Efficient Machine Learning and Cognitive Computing - NeurIPS 2019</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1901.08149">arXiv:1901.08149</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1901.08149">pdf</a>, <a href="https://arxiv.org/format/1901.08149">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> TransferTransfo: A Transfer Learning Approach for Neural Network Based Conversational Agents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wolf%2C+T">Thomas Wolf</a>, <a href="/search/cs?searchtype=author&amp;query=Sanh%2C+V">Victor Sanh</a>, <a href="/search/cs?searchtype=author&amp;query=Chaumond%2C+J">Julien Chaumond</a>, <a href="/search/cs?searchtype=author&amp;query=Delangue%2C+C">Clement Delangue</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis 
has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1901.08149v2-abstract-short" style="display: inline;"> We introduce a new approach to generative data-driven dialogue systems (e.g. chatbots) called TransferTransfo which is a combination of a Transfer learning based training scheme and a high-capacity Transformer model. Fine-tuning is performed by using a multi-task objective which combines several unsupervised prediction tasks. The resulting fine-tuned model shows strong improvements over the curren&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1901.08149v2-abstract-full').style.display = 'inline'; document.getElementById('1901.08149v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1901.08149v2-abstract-full" style="display: none;"> We introduce a new approach to generative data-driven dialogue systems (e.g. chatbots) called TransferTransfo which is a combination of a Transfer learning based training scheme and a high-capacity Transformer model. Fine-tuning is performed by using a multi-task objective which combines several unsupervised prediction tasks. The resulting fine-tuned model shows strong improvements over the current state-of-the-art end-to-end conversational models like memory augmented seq2seq and information-retrieval models. On the privately held PERSONA-CHAT dataset of the Conversational Intelligence Challenge 2, this approach obtains a new state-of-the-art, with respective perplexity, Hits@1 and F1 metrics of 16.28 (45 % absolute improvement), 80.7 (46 % absolute improvement) and 19.5 (20 % absolute improvement). 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1901.08149v2-abstract-full').style.display = 'none'; document.getElementById('1901.08149v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 February, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 January, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, 2 figures, 2 tables, NeurIPS 2018 CAI Workshop</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1811.06031">arXiv:1811.06031</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1811.06031">pdf</a>, <a href="https://arxiv.org/format/1811.06031">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> A Hierarchical Multi-task Approach for Learning Embeddings from Semantic Tasks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sanh%2C+V">Victor Sanh</a>, <a href="/search/cs?searchtype=author&amp;query=Wolf%2C+T">Thomas Wolf</a>, <a href="/search/cs?searchtype=author&amp;query=Ruder%2C+S">Sebastian Ruder</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1811.06031v2-abstract-short" style="display: inline;"> Much effort has been devoted to evaluate whether multi-task learning can be leveraged to learn rich representations that can 
be used in various Natural Language Processing (NLP) down-stream applications. However, there is still a lack of understanding of the settings in which multi-task learning has a significant effect. In this work, we introduce a hierarchical model trained in a multi-task learn&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1811.06031v2-abstract-full').style.display = 'inline'; document.getElementById('1811.06031v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1811.06031v2-abstract-full" style="display: none;"> Much effort has been devoted to evaluate whether multi-task learning can be leveraged to learn rich representations that can be used in various Natural Language Processing (NLP) down-stream applications. However, there is still a lack of understanding of the settings in which multi-task learning has a significant effect. In this work, we introduce a hierarchical model trained in a multi-task learning setup on a set of carefully selected semantic tasks. The model is trained in a hierarchical fashion to introduce an inductive bias by supervising a set of low level tasks at the bottom layers of the model and more complex tasks at the top layers of the model. This model achieves state-of-the-art results on a number of tasks, namely Named Entity Recognition, Entity Mention Detection and Relation Extraction without hand-engineered features or external NLP tools like syntactic parsers. The hierarchical training supervision induces a set of shared semantic representations at lower layers of the model. We show that as we move from the bottom to the top layers of the model, the hidden states of the layers tend to represent more complex semantic information. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1811.06031v2-abstract-full').style.display = 'none'; document.getElementById('1811.06031v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2018; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 November, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2018. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 1 figure, To appear in Proceedings of AAAI 2019</span> </p> </li> </ol> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 
18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 
47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

<!-- extraction residue (pagination links rendered as text, outside </html>): Pages: 1 2 3 4 5 6 7 8 9 10 -->