Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 52 results for author: <span class="mathjax">Strubell, E</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Strubell%2C+E">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Strubell, E"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Strubell%2C+E&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Strubell, E"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Strubell%2C+E&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Strubell%2C+E&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Strubell%2C+E&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.17805">arXiv:2501.17805</a> <span> [<a href="https://arxiv.org/pdf/2501.17805">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> International AI Safety Report </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bengio%2C+Y">Yoshua Bengio</a>, <a href="/search/cs?searchtype=author&query=Mindermann%2C+S">S枚ren Mindermann</a>, <a href="/search/cs?searchtype=author&query=Privitera%2C+D">Daniel Privitera</a>, <a href="/search/cs?searchtype=author&query=Besiroglu%2C+T">Tamay Besiroglu</a>, <a href="/search/cs?searchtype=author&query=Bommasani%2C+R">Rishi Bommasani</a>, <a href="/search/cs?searchtype=author&query=Casper%2C+S">Stephen Casper</a>, <a href="/search/cs?searchtype=author&query=Choi%2C+Y">Yejin Choi</a>, <a href="/search/cs?searchtype=author&query=Fox%2C+P">Philip Fox</a>, <a href="/search/cs?searchtype=author&query=Garfinkel%2C+B">Ben Garfinkel</a>, <a href="/search/cs?searchtype=author&query=Goldfarb%2C+D">Danielle Goldfarb</a>, <a href="/search/cs?searchtype=author&query=Heidari%2C+H">Hoda Heidari</a>, <a href="/search/cs?searchtype=author&query=Ho%2C+A">Anson Ho</a>, <a href="/search/cs?searchtype=author&query=Kapoor%2C+S">Sayash Kapoor</a>, <a href="/search/cs?searchtype=author&query=Khalatbari%2C+L">Leila Khalatbari</a>, <a href="/search/cs?searchtype=author&query=Longpre%2C+S">Shayne Longpre</a>, <a href="/search/cs?searchtype=author&query=Manning%2C+S">Sam Manning</a>, <a href="/search/cs?searchtype=author&query=Mavroudis%2C+V">Vasilios Mavroudis</a>, <a href="/search/cs?searchtype=author&query=Mazeika%2C+M">Mantas Mazeika</a>, <a href="/search/cs?searchtype=author&query=Michael%2C+J">Julian Michael</a>, <a href="/search/cs?searchtype=author&query=Newman%2C+J">Jessica Newman</a>, <a href="/search/cs?searchtype=author&query=Ng%2C+K+Y">Kwan Yee Ng</a>, <a href="/search/cs?searchtype=author&query=Okolo%2C+C+T">Chinasa T. Okolo</a>, <a href="/search/cs?searchtype=author&query=Raji%2C+D">Deborah Raji</a>, <a href="/search/cs?searchtype=author&query=Sastry%2C+G">Girish Sastry</a>, <a href="/search/cs?searchtype=author&query=Seger%2C+E">Elizabeth Seger</a> , et al. (71 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.17805v1-abstract-short" style="display: inline;"> The first International AI Safety Report comprehensively synthesizes the current evidence on the capabilities, risks, and safety of advanced AI systems. The report was mandated by the nations attending the AI Safety Summit in Bletchley, UK. Thirty nations, the UN, the OECD, and the EU each nominated a representative to the report's Expert Advisory Panel. A total of 100 AI experts contributed, repr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.17805v1-abstract-full').style.display = 'inline'; document.getElementById('2501.17805v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.17805v1-abstract-full" style="display: none;"> The first International AI Safety Report comprehensively synthesizes the current evidence on the capabilities, risks, and safety of advanced AI systems. The report was mandated by the nations attending the AI Safety Summit in Bletchley, UK. Thirty nations, the UN, the OECD, and the EU each nominated a representative to the report's Expert Advisory Panel. A total of 100 AI experts contributed, representing diverse perspectives and disciplines. Led by the report's Chair, these independent experts collectively had full discretion over the report's content. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.17805v1-abstract-full').style.display = 'none'; document.getElementById('2501.17805v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.16548">arXiv:2501.16548</a> <span> [<a href="https://arxiv.org/pdf/2501.16548">pdf</a>, <a href="https://arxiv.org/ps/2501.16548">ps</a>, <a href="https://arxiv.org/format/2501.16548">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> From Efficiency Gains to Rebound Effects: The Problem of Jevons' Paradox in AI's Polarized Environmental Debate </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luccioni%2C+A+S">Alexandra Sasha Luccioni</a>, <a href="/search/cs?searchtype=author&query=Strubell%2C+E">Emma Strubell</a>, <a href="/search/cs?searchtype=author&query=Crawford%2C+K">Kate Crawford</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.16548v1-abstract-short" style="display: inline;"> As the climate crisis deepens, artificial intelligence (AI) has emerged as a contested force: some champion its potential to advance renewable energy, materials discovery, and large-scale emissions monitoring, while others underscore its growing carbon footprint, water consumption, and material resource demands. Much of this debate has concentrated on direct impact -- energy and water usage in dat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.16548v1-abstract-full').style.display = 'inline'; document.getElementById('2501.16548v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.16548v1-abstract-full" style="display: none;"> As the climate crisis deepens, artificial intelligence (AI) has emerged as a contested force: some champion its potential to advance renewable energy, materials discovery, and large-scale emissions monitoring, while others underscore its growing carbon footprint, water consumption, and material resource demands. Much of this debate has concentrated on direct impact -- energy and water usage in data centers, e-waste from frequent hardware upgrades -- without addressing the significant indirect effects. This paper examines how the problem of Jevons' Paradox applies to AI, whereby efficiency gains may paradoxically spur increased consumption. We argue that understanding these second-order impacts requires an interdisciplinary approach, combining lifecycle assessments with socio-economic analyses. Rebound effects undermine the assumption that improved technical efficiency alone will ensure net reductions in environmental harm. Instead, the trajectory of AI's impact also hinges on business incentives and market logics, governance and policymaking, and broader social and cultural norms. We contend that a narrow focus on direct emissions misrepresents AI's true climate footprint, limiting the scope for meaningful interventions. We conclude with recommendations that address rebound effects and challenge the market-driven imperatives fueling uncontrolled AI growth. By broadening the analysis to include both direct and indirect consequences, we aim to inform a more comprehensive, evidence-based dialogue on AI's role in the climate crisis. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.16548v1-abstract-full').style.display = 'none'; document.getElementById('2501.16548v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.17009">arXiv:2412.17009</a> <span> [<a href="https://arxiv.org/pdf/2412.17009">pdf</a>, <a href="https://arxiv.org/format/2412.17009">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Generate to Discriminate: Expert Routing for Continual Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Byun%2C+Y">Yewon Byun</a>, <a href="/search/cs?searchtype=author&query=Mehta%2C+S+V">Sanket Vaibhav Mehta</a>, <a href="/search/cs?searchtype=author&query=Garg%2C+S">Saurabh Garg</a>, <a href="/search/cs?searchtype=author&query=Strubell%2C+E">Emma Strubell</a>, <a href="/search/cs?searchtype=author&query=Oberst%2C+M">Michael Oberst</a>, <a href="/search/cs?searchtype=author&query=Wilder%2C+B">Bryan Wilder</a>, <a href="/search/cs?searchtype=author&query=Lipton%2C+Z+C">Zachary C. Lipton</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.17009v2-abstract-short" style="display: inline;"> In many real-world settings, regulations and economic incentives permit the sharing of models but not data across institutional boundaries. In such scenarios, practitioners might hope to adapt models to new domains, without losing performance on previous domains (so-called catastrophic forgetting). While any single model may struggle to achieve this goal, learning an ensemble of domain-specific ex… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.17009v2-abstract-full').style.display = 'inline'; document.getElementById('2412.17009v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.17009v2-abstract-full" style="display: none;"> In many real-world settings, regulations and economic incentives permit the sharing of models but not data across institutional boundaries. In such scenarios, practitioners might hope to adapt models to new domains, without losing performance on previous domains (so-called catastrophic forgetting). While any single model may struggle to achieve this goal, learning an ensemble of domain-specific experts offers the potential to adapt more closely to each individual institution. However, a core challenge in this context is determining which expert to deploy at test time. In this paper, we propose Generate to Discriminate (G2D), a domain-incremental continual learning method that leverages synthetic data to train a domain-discriminator that routes samples at inference time to the appropriate expert. Surprisingly, we find that leveraging synthetic data in this capacity is more effective than using the samples to \textit{directly} train the downstream classifier (the more common approach to leveraging synthetic data in the lifelong learning literature). We observe that G2D outperforms competitive domain-incremental learning methods on tasks in both vision and language modalities, providing a new perspective on the use of synthetic data in the lifelong learning literature. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.17009v2-abstract-full').style.display = 'none'; document.getElementById('2412.17009v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13055">arXiv:2411.13055</a> <span> [<a href="https://arxiv.org/pdf/2411.13055">pdf</a>, <a href="https://arxiv.org/format/2411.13055">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> Hardware Scaling Trends and Diminishing Returns in Large-Scale Distributed Training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fernandez%2C+J">Jared Fernandez</a>, <a href="/search/cs?searchtype=author&query=Wehrstedt%2C+L">Luca Wehrstedt</a>, <a href="/search/cs?searchtype=author&query=Shamis%2C+L">Leonid Shamis</a>, <a href="/search/cs?searchtype=author&query=Elhoushi%2C+M">Mostafa Elhoushi</a>, <a href="/search/cs?searchtype=author&query=Saladi%2C+K">Kalyan Saladi</a>, <a href="/search/cs?searchtype=author&query=Bisk%2C+Y">Yonatan Bisk</a>, <a href="/search/cs?searchtype=author&query=Strubell%2C+E">Emma Strubell</a>, <a href="/search/cs?searchtype=author&query=Kahn%2C+J">Jacob Kahn</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13055v1-abstract-short" style="display: inline;"> Dramatic increases in the capabilities of neural network models in recent years are driven by scaling model size, training data, and corresponding computational resources. To develop the exceedingly large networks required in modern applications, such as large language models (LLMs), model training is distributed across tens of thousands of hardware accelerators (e.g. GPUs), requiring orchestratio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13055v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13055v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13055v1-abstract-full" style="display: none;"> Dramatic increases in the capabilities of neural network models in recent years are driven by scaling model size, training data, and corresponding computational resources. To develop the exceedingly large networks required in modern applications, such as large language models (LLMs), model training is distributed across tens of thousands of hardware accelerators (e.g. GPUs), requiring orchestration of computation and communication across large computing clusters. In this work, we demonstrate that careful consideration of hardware configuration and parallelization strategy is critical for effective (i.e. compute- and cost-efficient) scaling of model size, training data, and total computation. We conduct an extensive empirical study of the performance of large-scale LLM training workloads across model size, hardware configurations, and distributed parallelization strategies. We demonstrate that: (1) beyond certain scales, overhead incurred from certain distributed communication strategies leads parallelization strategies previously thought to be sub-optimal in fact become preferable; and (2) scaling the total number of accelerators for large model training quickly yields diminishing returns even when hardware and parallelization strategies are properly optimized, implying poor marginal performance per additional unit of power or GPU-hour. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13055v1-abstract-full').style.display = 'none'; document.getElementById('2411.13055v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04448">arXiv:2411.04448</a> <span> [<a href="https://arxiv.org/pdf/2411.04448">pdf</a>, <a href="https://arxiv.org/format/2411.04448">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Gradient Localization Improves Lifelong Pretraining of Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fernandez%2C+J">Jared Fernandez</a>, <a href="/search/cs?searchtype=author&query=Bisk%2C+Y">Yonatan Bisk</a>, <a href="/search/cs?searchtype=author&query=Strubell%2C+E">Emma Strubell</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04448v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) trained on web-scale text corpora have been shown to capture world knowledge in their parameters. However, the mechanism by which language models store different types of knowledge is poorly understood. In this work, we examine two types of knowledge relating to temporally sensitive entities and demonstrate that each type is localized to different sets of parameters wi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04448v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04448v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04448v1-abstract-full" style="display: none;"> Large Language Models (LLMs) trained on web-scale text corpora have been shown to capture world knowledge in their parameters. However, the mechanism by which language models store different types of knowledge is poorly understood. In this work, we examine two types of knowledge relating to temporally sensitive entities and demonstrate that each type is localized to different sets of parameters within the LLMs. We hypothesize that the lack of consideration of the locality of knowledge in existing continual learning methods contributes to both: the failed uptake of new information, and catastrophic forgetting of previously learned information. We observe that sequences containing references to updated and newly mentioned entities exhibit larger gradient norms in a subset of layers. We demonstrate that targeting parameter updates to these relevant layers can improve the performance of continually pretraining on language containing temporal drift. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04448v1-abstract-full').style.display = 'none'; document.getElementById('2411.04448v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">EMNLP Findings 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23478">arXiv:2410.23478</a> <span> [<a href="https://arxiv.org/pdf/2410.23478">pdf</a>, <a href="https://arxiv.org/format/2410.23478">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Collage: Decomposable Rapid Prototyping for Information Extraction on Scientific PDFs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gururaja%2C+S">Sireesh Gururaja</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yueheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+G">Guannan Tang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Tianhao Zhang</a>, <a href="/search/cs?searchtype=author&query=Murphy%2C+K">Kevin Murphy</a>, <a href="/search/cs?searchtype=author&query=Yi%2C+Y">Yu-Tsen Yi</a>, <a href="/search/cs?searchtype=author&query=Seo%2C+J">Junwon Seo</a>, <a href="/search/cs?searchtype=author&query=Rollett%2C+A">Anthony Rollett</a>, <a href="/search/cs?searchtype=author&query=Strubell%2C+E">Emma Strubell</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23478v1-abstract-short" style="display: inline;"> Recent years in NLP have seen the continued development of domain-specific information extraction tools for scientific documents, alongside the release of increasingly multimodal pretrained transformer models. While the opportunity for scientists outside of NLP to evaluate and apply such systems to their own domains has never been clearer, these models are difficult to compare: they accept differe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23478v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23478v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23478v1-abstract-full" style="display: none;"> Recent years in NLP have seen the continued development of domain-specific information extraction tools for scientific documents, alongside the release of increasingly multimodal pretrained transformer models. While the opportunity for scientists outside of NLP to evaluate and apply such systems to their own domains has never been clearer, these models are difficult to compare: they accept different input formats, are often black-box and give little insight into processing failures, and rarely handle PDF documents, the most common format of scientific publication. In this work, we present Collage, a tool designed for rapid prototyping, visualization, and evaluation of different information extraction models on scientific PDFs. Collage allows the use and evaluation of any HuggingFace token classifier, several LLMs, and multiple other task-specific models out of the box, and provides extensible software interfaces to accelerate experimentation with new models. Further, we enable both developers and users of NLP-based tools to inspect, debug, and better understand modeling pipelines by providing granular views of intermediate states of processing. We demonstrate our system in the context of information extraction to assist with literature review in materials science. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23478v1-abstract-full').style.display = 'none'; document.getElementById('2410.23478v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15661">arXiv:2410.15661</a> <span> [<a href="https://arxiv.org/pdf/2410.15661">pdf</a>, <a href="https://arxiv.org/format/2410.15661">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.18653/v1/2024.emnlp-main.1176">10.18653/v1/2024.emnlp-main.1176 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Scalable Data Ablation Approximations for Language Models through Modular Training and Merging </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Na%2C+C">Clara Na</a>, <a href="/search/cs?searchtype=author&query=Magnusson%2C+I">Ian Magnusson</a>, <a href="/search/cs?searchtype=author&query=Jha%2C+A+H">Ananya Harsh Jha</a>, <a href="/search/cs?searchtype=author&query=Sherborne%2C+T">Tom Sherborne</a>, <a href="/search/cs?searchtype=author&query=Strubell%2C+E">Emma Strubell</a>, <a href="/search/cs?searchtype=author&query=Dodge%2C+J">Jesse Dodge</a>, <a href="/search/cs?searchtype=author&query=Dasigi%2C+P">Pradeep Dasigi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15661v1-abstract-short" style="display: inline;"> Training data compositions for Large Language Models (LLMs) can significantly affect their downstream performance. However, a thorough data ablation study exploring large sets of candidate data mixtures is typically prohibitively expensive since the full effect is seen only after training the models; this can lead practitioners to settle for sub-optimal data mixtures. We propose an efficient metho… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15661v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15661v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15661v1-abstract-full" style="display: none;"> Training data compositions for Large Language Models (LLMs) can significantly affect their downstream performance. However, a thorough data ablation study exploring large sets of candidate data mixtures is typically prohibitively expensive since the full effect is seen only after training the models; this can lead practitioners to settle for sub-optimal data mixtures. We propose an efficient method for approximating data ablations which trains individual models on subsets of a training corpus and reuses them across evaluations of combinations of subsets. In continued pre-training experiments, we find that, given an arbitrary evaluation set, the perplexity score of a single model trained on a candidate set of data is strongly correlated with perplexity scores of parameter averages of models trained on distinct partitions of that data. From this finding, we posit that researchers and practitioners can conduct inexpensive simulations of data ablations by maintaining a pool of models that were each trained on partitions of a large training corpus, and assessing candidate data mixtures by evaluating parameter averages of combinations of these models. This approach allows for substantial improvements in amortized training efficiency -- scaling only linearly with respect to new data -- by enabling reuse of previous training computation, opening new avenues for improving model performance through rigorous, incremental data assessment and mixing. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15661v1-abstract-full').style.display = 'none'; document.getElementById('2410.15661v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">EMNLP 2024. 17 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.05613">arXiv:2410.05613</a> <span> [<a href="https://arxiv.org/pdf/2410.05613">pdf</a>, <a href="https://arxiv.org/format/2410.05613">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Stereotype or Personalization? User Identity Biases Chatbot Recommendations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kantharuban%2C+A">Anjali Kantharuban</a>, <a href="/search/cs?searchtype=author&query=Milbauer%2C+J">Jeremiah Milbauer</a>, <a href="/search/cs?searchtype=author&query=Strubell%2C+E">Emma Strubell</a>, <a href="/search/cs?searchtype=author&query=Neubig%2C+G">Graham Neubig</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.05613v1-abstract-short" style="display: inline;"> We demonstrate that when people use large language models (LLMs) to generate recommendations, the LLMs produce responses that reflect both what the user wants and who the user is. While personalized recommendations are often desired by users, it can be difficult in practice to distinguish cases of bias from cases of personalization: we find that models generate racially stereotypical recommendatio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05613v1-abstract-full').style.display = 'inline'; document.getElementById('2410.05613v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.05613v1-abstract-full" style="display: none;"> We demonstrate that when people use large language models (LLMs) to generate recommendations, the LLMs produce responses that reflect both what the user wants and who the user is. While personalized recommendations are often desired by users, it can be difficult in practice to distinguish cases of bias from cases of personalization: we find that models generate racially stereotypical recommendations regardless of whether the user revealed their identity intentionally through explicit indications or unintentionally through implicit cues. We argue that chatbots ought to transparently indicate when recommendations are influenced by a user's revealed identity characteristics, but observe that they currently fail to do so. Our experiments show that even though a user's revealed identity significantly influences model recommendations (p < 0.001), model responses obfuscate this fact in response to user queries. This bias and lack of transparency occurs consistently across multiple popular consumer LLMs (gpt-4o-mini, gpt-4-turbo, llama-3-70B, and claude-3.5) and for four American racial groups. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05613v1-abstract-full').style.display = 'none'; document.getElementById('2410.05613v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.13954">arXiv:2405.13954</a> <span> [<a href="https://arxiv.org/pdf/2405.13954">pdf</a>, <a href="https://arxiv.org/format/2405.13954">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> What is Your Data Worth to GPT? LLM-Scale Data Valuation with Influence Functions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Choe%2C+S+K">Sang Keun Choe</a>, <a href="/search/cs?searchtype=author&query=Ahn%2C+H">Hwijeen Ahn</a>, <a href="/search/cs?searchtype=author&query=Bae%2C+J">Juhan Bae</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+K">Kewen Zhao</a>, <a href="/search/cs?searchtype=author&query=Kang%2C+M">Minsoo Kang</a>, <a href="/search/cs?searchtype=author&query=Chung%2C+Y">Youngseog Chung</a>, <a href="/search/cs?searchtype=author&query=Pratapa%2C+A">Adithya Pratapa</a>, <a href="/search/cs?searchtype=author&query=Neiswanger%2C+W">Willie Neiswanger</a>, <a href="/search/cs?searchtype=author&query=Strubell%2C+E">Emma Strubell</a>, <a href="/search/cs?searchtype=author&query=Mitamura%2C+T">Teruko Mitamura</a>, <a href="/search/cs?searchtype=author&query=Schneider%2C+J">Jeff Schneider</a>, <a href="/search/cs?searchtype=author&query=Hovy%2C+E">Eduard Hovy</a>, <a href="/search/cs?searchtype=author&query=Grosse%2C+R">Roger Grosse</a>, <a href="/search/cs?searchtype=author&query=Xing%2C+E">Eric Xing</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.13954v1-abstract-short" style="display: inline;"> Large language models (LLMs) are trained on a vast amount of human-written data, but data providers often remain uncredited. In response to this issue, data valuation (or data attribution), which quantifies the contribution or value of each data to the model output, has been discussed as a potential solution. Nevertheless, applying existing data valuation methods to recent LLMs and their vast trai… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.13954v1-abstract-full').style.display = 'inline'; document.getElementById('2405.13954v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.13954v1-abstract-full" style="display: none;"> Large language models (LLMs) are trained on a vast amount of human-written data, but data providers often remain uncredited. In response to this issue, data valuation (or data attribution), which quantifies the contribution or value of each data to the model output, has been discussed as a potential solution. Nevertheless, applying existing data valuation methods to recent LLMs and their vast training datasets has been largely limited by prohibitive compute and memory costs. In this work, we focus on influence functions, a popular gradient-based data valuation method, and significantly improve its scalability with an efficient gradient projection strategy called LoGra that leverages the gradient structure in backpropagation. We then provide a theoretical motivation of gradient projection approaches to influence functions to promote trust in the data valuation process. Lastly, we lower the barrier to implementing data valuation systems by introducing LogIX, a software package that can transform existing training code into data valuation code with minimal effort. In our data valuation experiments, LoGra achieves competitive accuracy against more expensive baselines while showing up to 6,500x improvement in throughput and 5x reduction in GPU memory usage when applied to Llama3-8B-Instruct and the 1B-token dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.13954v1-abstract-full').style.display = 'none'; document.getElementById('2405.13954v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.13858">arXiv:2405.13858</a> <span> [<a href="https://arxiv.org/pdf/2405.13858">pdf</a>, <a href="https://arxiv.org/format/2405.13858">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Carbon Connect: An Ecosystem for Sustainable Computing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lee%2C+B+C">Benjamin C. Lee</a>, <a href="/search/cs?searchtype=author&query=Brooks%2C+D">David Brooks</a>, <a href="/search/cs?searchtype=author&query=van+Benthem%2C+A">Arthur van Benthem</a>, <a href="/search/cs?searchtype=author&query=Gupta%2C+U">Udit Gupta</a>, <a href="/search/cs?searchtype=author&query=Hills%2C+G">Gage Hills</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+V">Vincent Liu</a>, <a href="/search/cs?searchtype=author&query=Pierce%2C+B">Benjamin Pierce</a>, <a href="/search/cs?searchtype=author&query=Stewart%2C+C">Christopher Stewart</a>, <a href="/search/cs?searchtype=author&query=Strubell%2C+E">Emma Strubell</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+G">Gu-Yeon Wei</a>, <a href="/search/cs?searchtype=author&query=Wierman%2C+A">Adam Wierman</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yuan Yao</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+M">Minlan Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.13858v2-abstract-short" style="display: inline;"> Computing is at a moment of profound opportunity. Emerging applications -- such as capable artificial intelligence, immersive virtual realities, and pervasive sensor systems -- drive unprecedented demand for computer. Despite recent advances toward net zero carbon emissions, the computing industry's gross energy usage continues to rise at an alarming rate, outpacing the growth of new energy instal… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.13858v2-abstract-full').style.display = 'inline'; document.getElementById('2405.13858v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.13858v2-abstract-full" style="display: none;"> Computing is at a moment of profound opportunity. Emerging applications -- such as capable artificial intelligence, immersive virtual realities, and pervasive sensor systems -- drive unprecedented demand for computer. Despite recent advances toward net zero carbon emissions, the computing industry's gross energy usage continues to rise at an alarming rate, outpacing the growth of new energy installations and renewable energy deployments. A shift towards sustainability is needed to spark a transformation in how computer systems are manufactured, allocated, and consumed. Carbon Connect envisions coordinated research thrusts that produce design and management strategies for sustainable, next-generation computer systems. These strategies must flatten and then reverse growth trajectories for computing power and carbon for society's most rapidly growing applications such as artificial intelligence and virtual spaces. We will require accurate models for carbon accounting in computing technology. For embodied carbon, we must re-think conventional design strategies -- over-provisioned monolithic servers, frequent hardware refresh cycles, custom silicon -- and adopt life-cycle design strategies that more effectively reduce, reuse and recycle hardware at scale. For operational carbon, we must not only embrace renewable energy but also design systems to use that energy more efficiently. Finally, new hardware design and management strategies must be cognizant of economic policy and regulatory landscape, aligning private initiatives with societal goals. Many of these broader goals will require computer scientists to develop deep, enduring collaborations with researchers in economics, law, and industrial ecology to spark change in broader practice. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.13858v2-abstract-full').style.display = 'none'; document.getElementById('2405.13858v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.01019">arXiv:2404.01019</a> <span> [<a href="https://arxiv.org/pdf/2404.01019">pdf</a>, <a href="https://arxiv.org/format/2404.01019">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Source-Aware Training Enables Knowledge Attribution in Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Khalifa%2C+M">Muhammad Khalifa</a>, <a href="/search/cs?searchtype=author&query=Wadden%2C+D">David Wadden</a>, <a href="/search/cs?searchtype=author&query=Strubell%2C+E">Emma Strubell</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+H">Honglak Lee</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Lu Wang</a>, <a href="/search/cs?searchtype=author&query=Beltagy%2C+I">Iz Beltagy</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+H">Hao Peng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.01019v3-abstract-short" style="display: inline;"> Large language models (LLMs) learn a vast amount of knowledge during pretraining, but they are often oblivious to the source(s) of such knowledge. We investigate the problem of intrinsic source citation, where LLMs are required to cite the pretraining source supporting a generated response. Intrinsic source citation can enhance LLM transparency, interpretability, and verifiability. To give LLMs su… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.01019v3-abstract-full').style.display = 'inline'; document.getElementById('2404.01019v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.01019v3-abstract-full" style="display: none;"> Large language models (LLMs) learn a vast amount of knowledge during pretraining, but they are often oblivious to the source(s) of such knowledge. We investigate the problem of intrinsic source citation, where LLMs are required to cite the pretraining source supporting a generated response. Intrinsic source citation can enhance LLM transparency, interpretability, and verifiability. To give LLMs such ability, we explore source-aware training -- a recipe that involves (i) training the LLM to associate unique source document identifiers with the knowledge in each document, followed by (ii) an instruction-tuning stage to teach the LLM to cite a supporting pretraining source when prompted. Source-aware training borrows from existing pretraining/fine-tuning frameworks and requires minimal changes to the model architecture or implementation. Through experiments on synthetic data, we demonstrate that our training recipe can enable faithful attribution to the pretraining data without a substantial impact on the model's perplexity compared to standard pretraining. Our findings also highlight the importance of pretraining data augmentation in achieving attribution. Code and data available here: \url{https://github.com/mukhal/intrinsic-source-citation} <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.01019v3-abstract-full').style.display = 'none'; document.getElementById('2404.01019v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">COLM '24</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.00838">arXiv:2402.00838</a> <span> [<a href="https://arxiv.org/pdf/2402.00838">pdf</a>, <a href="https://arxiv.org/format/2402.00838">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> OLMo: Accelerating the Science of Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Groeneveld%2C+D">Dirk Groeneveld</a>, <a href="/search/cs?searchtype=author&query=Beltagy%2C+I">Iz Beltagy</a>, <a href="/search/cs?searchtype=author&query=Walsh%2C+P">Pete Walsh</a>, <a href="/search/cs?searchtype=author&query=Bhagia%2C+A">Akshita Bhagia</a>, <a href="/search/cs?searchtype=author&query=Kinney%2C+R">Rodney Kinney</a>, <a href="/search/cs?searchtype=author&query=Tafjord%2C+O">Oyvind Tafjord</a>, <a href="/search/cs?searchtype=author&query=Jha%2C+A+H">Ananya Harsh Jha</a>, <a href="/search/cs?searchtype=author&query=Ivison%2C+H">Hamish Ivison</a>, <a href="/search/cs?searchtype=author&query=Magnusson%2C+I">Ian Magnusson</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yizhong Wang</a>, <a href="/search/cs?searchtype=author&query=Arora%2C+S">Shane Arora</a>, <a href="/search/cs?searchtype=author&query=Atkinson%2C+D">David Atkinson</a>, <a href="/search/cs?searchtype=author&query=Authur%2C+R">Russell Authur</a>, <a href="/search/cs?searchtype=author&query=Chandu%2C+K+R">Khyathi Raghavi Chandu</a>, <a href="/search/cs?searchtype=author&query=Cohan%2C+A">Arman Cohan</a>, <a href="/search/cs?searchtype=author&query=Dumas%2C+J">Jennifer Dumas</a>, <a href="/search/cs?searchtype=author&query=Elazar%2C+Y">Yanai Elazar</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+Y">Yuling Gu</a>, <a href="/search/cs?searchtype=author&query=Hessel%2C+J">Jack Hessel</a>, <a href="/search/cs?searchtype=author&query=Khot%2C+T">Tushar Khot</a>, <a href="/search/cs?searchtype=author&query=Merrill%2C+W">William Merrill</a>, <a href="/search/cs?searchtype=author&query=Morrison%2C+J">Jacob Morrison</a>, <a href="/search/cs?searchtype=author&query=Muennighoff%2C+N">Niklas Muennighoff</a>, <a href="/search/cs?searchtype=author&query=Naik%2C+A">Aakanksha Naik</a>, <a href="/search/cs?searchtype=author&query=Nam%2C+C">Crystal Nam</a> , et al. (18 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.00838v4-abstract-short" style="display: inline;"> Language models (LMs) have become ubiquitous in both NLP research and in commercial product offerings. As their commercial importance has surged, the most powerful models have become closed off, gated behind proprietary interfaces, with important details of their training data, architectures, and development undisclosed. Given the importance of these details in scientifically studying these models… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.00838v4-abstract-full').style.display = 'inline'; document.getElementById('2402.00838v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.00838v4-abstract-full" style="display: none;"> Language models (LMs) have become ubiquitous in both NLP research and in commercial product offerings. As their commercial importance has surged, the most powerful models have become closed off, gated behind proprietary interfaces, with important details of their training data, architectures, and development undisclosed. Given the importance of these details in scientifically studying these models, including their biases and potential risks, we believe it is essential for the research community to have access to powerful, truly open LMs. To this end, we have built OLMo, a competitive, truly Open Language Model, to enable the scientific study of language models. Unlike most prior efforts that have only released model weights and inference code, we release OLMo alongside open training data and training and evaluation code. We hope this release will empower the open research community and inspire a new wave of innovation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.00838v4-abstract-full').style.display = 'none'; document.getElementById('2402.00838v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.00159">arXiv:2402.00159</a> <span> [<a href="https://arxiv.org/pdf/2402.00159">pdf</a>, <a href="https://arxiv.org/format/2402.00159">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Dolma: an Open Corpus of Three Trillion Tokens for Language Model Pretraining Research </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Soldaini%2C+L">Luca Soldaini</a>, <a href="/search/cs?searchtype=author&query=Kinney%2C+R">Rodney Kinney</a>, <a href="/search/cs?searchtype=author&query=Bhagia%2C+A">Akshita Bhagia</a>, <a href="/search/cs?searchtype=author&query=Schwenk%2C+D">Dustin Schwenk</a>, <a href="/search/cs?searchtype=author&query=Atkinson%2C+D">David Atkinson</a>, <a href="/search/cs?searchtype=author&query=Authur%2C+R">Russell Authur</a>, <a href="/search/cs?searchtype=author&query=Bogin%2C+B">Ben Bogin</a>, <a href="/search/cs?searchtype=author&query=Chandu%2C+K">Khyathi Chandu</a>, <a href="/search/cs?searchtype=author&query=Dumas%2C+J">Jennifer Dumas</a>, <a href="/search/cs?searchtype=author&query=Elazar%2C+Y">Yanai Elazar</a>, <a href="/search/cs?searchtype=author&query=Hofmann%2C+V">Valentin Hofmann</a>, <a href="/search/cs?searchtype=author&query=Jha%2C+A+H">Ananya Harsh Jha</a>, <a href="/search/cs?searchtype=author&query=Kumar%2C+S">Sachin Kumar</a>, <a href="/search/cs?searchtype=author&query=Lucy%2C+L">Li Lucy</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+X">Xinxi Lyu</a>, <a href="/search/cs?searchtype=author&query=Lambert%2C+N">Nathan Lambert</a>, <a href="/search/cs?searchtype=author&query=Magnusson%2C+I">Ian Magnusson</a>, <a href="/search/cs?searchtype=author&query=Morrison%2C+J">Jacob Morrison</a>, <a href="/search/cs?searchtype=author&query=Muennighoff%2C+N">Niklas Muennighoff</a>, <a href="/search/cs?searchtype=author&query=Naik%2C+A">Aakanksha Naik</a>, <a href="/search/cs?searchtype=author&query=Nam%2C+C">Crystal Nam</a>, <a href="/search/cs?searchtype=author&query=Peters%2C+M+E">Matthew E. Peters</a>, <a href="/search/cs?searchtype=author&query=Ravichander%2C+A">Abhilasha Ravichander</a>, <a href="/search/cs?searchtype=author&query=Richardson%2C+K">Kyle Richardson</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+Z">Zejiang Shen</a> , et al. (11 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.00159v2-abstract-short" style="display: inline;"> Information about pretraining corpora used to train the current best-performing language models is seldom discussed: commercial models rarely detail their data, and even open models are often released without accompanying training data or recipes to reproduce them. As a result, it is challenging to conduct and advance scientific research on language modeling, such as understanding how training dat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.00159v2-abstract-full').style.display = 'inline'; document.getElementById('2402.00159v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.00159v2-abstract-full" style="display: none;"> Information about pretraining corpora used to train the current best-performing language models is seldom discussed: commercial models rarely detail their data, and even open models are often released without accompanying training data or recipes to reproduce them. As a result, it is challenging to conduct and advance scientific research on language modeling, such as understanding how training data impacts model capabilities and limitations. To facilitate scientific research on language model pretraining, we curate and release Dolma, a three-trillion-token English corpus, built from a diverse mixture of web content, scientific papers, code, public-domain books, social media, and encyclopedic materials. We extensively document Dolma, including its design principles, details about its construction, and a summary of its contents. We present analyses and experimental results on intermediate states of Dolma to share what we have learned about important data curation practices. Finally, we open-source our data curation toolkit to enable reproduction of our work as well as support further research in large-scale data curation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.00159v2-abstract-full').style.display = 'none'; document.getElementById('2402.00159v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at ACL 2024; Dataset: https://hf.co/datasets/allenai/dolma; Code: https://github.com/allenai/dolma</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.06408">arXiv:2401.06408</a> <span> [<a href="https://arxiv.org/pdf/2401.06408">pdf</a>, <a href="https://arxiv.org/format/2401.06408">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> AboutMe: Using Self-Descriptions in Webpages to Document the Effects of English Pretraining Data Filters </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lucy%2C+L">Li Lucy</a>, <a href="/search/cs?searchtype=author&query=Gururangan%2C+S">Suchin Gururangan</a>, <a href="/search/cs?searchtype=author&query=Soldaini%2C+L">Luca Soldaini</a>, <a href="/search/cs?searchtype=author&query=Strubell%2C+E">Emma Strubell</a>, <a href="/search/cs?searchtype=author&query=Bamman%2C+D">David Bamman</a>, <a href="/search/cs?searchtype=author&query=Klein%2C+L+F">Lauren F. Klein</a>, <a href="/search/cs?searchtype=author&query=Dodge%2C+J">Jesse Dodge</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.06408v3-abstract-short" style="display: inline;"> Large language models' (LLMs) abilities are drawn from their pretraining data, and model development begins with data curation. However, decisions around what data is retained or removed during this initial stage are under-scrutinized. In our work, we ground web text, which is a popular pretraining data source, to its social and geographic contexts. We create a new dataset of 10.3 million self-des… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.06408v3-abstract-full').style.display = 'inline'; document.getElementById('2401.06408v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.06408v3-abstract-full" style="display: none;"> Large language models' (LLMs) abilities are drawn from their pretraining data, and model development begins with data curation. However, decisions around what data is retained or removed during this initial stage are under-scrutinized. In our work, we ground web text, which is a popular pretraining data source, to its social and geographic contexts. We create a new dataset of 10.3 million self-descriptions of website creators, and extract information about who they are and where they are from: their topical interests, social roles, and geographic affiliations. Then, we conduct the first study investigating how ten "quality" and English language identification (langID) filters affect webpages that vary along these social dimensions. Our experiments illuminate a range of implicit preferences in data curation: we show that some quality classifiers act like topical domain filters, and langID can overlook English content from some regions of the world. Overall, we hope that our work will encourage a new line of research on pretraining data curation practices and its social implications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.06408v3-abstract-full').style.display = 'none'; document.getElementById('2401.06408v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">28 pages, 13 figures. Association for Computational Linguistics (ACL) 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.05662">arXiv:2312.05662</a> <span> [<a href="https://arxiv.org/pdf/2312.05662">pdf</a>, <a href="https://arxiv.org/format/2312.05662">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Understanding the Effect of Model Compression on Social Bias in Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gon%C3%A7alves%2C+G">Gustavo Gon莽alves</a>, <a href="/search/cs?searchtype=author&query=Strubell%2C+E">Emma Strubell</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.05662v2-abstract-short" style="display: inline;"> Large Language Models (LLMs) trained with self-supervision on vast corpora of web text fit to the social biases of that text. Without intervention, these social biases persist in the model's predictions in downstream tasks, leading to representational harm. Many strategies have been proposed to mitigate the effects of inappropriate social biases learned during pretraining. Simultaneously, methods… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.05662v2-abstract-full').style.display = 'inline'; document.getElementById('2312.05662v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.05662v2-abstract-full" style="display: none;"> Large Language Models (LLMs) trained with self-supervision on vast corpora of web text fit to the social biases of that text. Without intervention, these social biases persist in the model's predictions in downstream tasks, leading to representational harm. Many strategies have been proposed to mitigate the effects of inappropriate social biases learned during pretraining. Simultaneously, methods for model compression have become increasingly popular to reduce the computational burden of LLMs. Despite the popularity and need for both approaches, little work has been done to explore the interplay between these two. We perform a carefully controlled study of the impact of model compression via quantization and knowledge distillation on measures of social bias in LLMs. Longer pretraining and larger models led to higher social bias, and quantization showed a regularizer effect with its best trade-off around 20% of the original pretraining time. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.05662v2-abstract-full').style.display = 'none'; document.getElementById('2312.05662v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">EMNLP 2023 Main</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.16863">arXiv:2311.16863</a> <span> [<a href="https://arxiv.org/pdf/2311.16863">pdf</a>, <a href="https://arxiv.org/format/2311.16863">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3630106.3658542">10.1145/3630106.3658542 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Power Hungry Processing: Watts Driving the Cost of AI Deployment? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luccioni%2C+A+S">Alexandra Sasha Luccioni</a>, <a href="/search/cs?searchtype=author&query=Jernite%2C+Y">Yacine Jernite</a>, <a href="/search/cs?searchtype=author&query=Strubell%2C+E">Emma Strubell</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.16863v3-abstract-short" style="display: inline;"> Recent years have seen a surge in the popularity of commercial AI products based on generative, multi-purpose AI systems promising a unified approach to building machine learning (ML) models into technology. However, this ambition of ``generality'' comes at a steep cost to the environment, given the amount of energy these systems require and the amount of carbon that they emit. In this work, we pr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.16863v3-abstract-full').style.display = 'inline'; document.getElementById('2311.16863v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.16863v3-abstract-full" style="display: none;"> Recent years have seen a surge in the popularity of commercial AI products based on generative, multi-purpose AI systems promising a unified approach to building machine learning (ML) models into technology. However, this ambition of ``generality'' comes at a steep cost to the environment, given the amount of energy these systems require and the amount of carbon that they emit. In this work, we propose the first systematic comparison of the ongoing inference cost of various categories of ML systems, covering both task-specific (i.e. finetuned models that carry out a single task) and `general-purpose' models, (i.e. those trained for multiple tasks). We measure deployment cost as the amount of energy and carbon required to perform 1,000 inferences on representative benchmark dataset using these models. We find that multi-purpose, generative architectures are orders of magnitude more expensive than task-specific systems for a variety of tasks, even when controlling for the number of model parameters. We conclude with a discussion around the current trend of deploying multi-purpose generative ML systems, and caution that their utility should be more intentionally weighed against increased costs in terms of energy and emissions. All the data from our study can be accessed via an interactive demo to carry out further exploration and analysis. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.16863v3-abstract-full').style.display = 'none'; document.getElementById('2311.16863v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> ACM Conference on Fairness, Accountability, and Transparency (ACM FAccT '24), June 3--6, 2024, Rio de Janeiro, Brazil </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.10267">arXiv:2311.10267</a> <span> [<a href="https://arxiv.org/pdf/2311.10267">pdf</a>, <a href="https://arxiv.org/format/2311.10267">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.18653/v1/2023.findings-emnlp.607">10.18653/v1/2023.findings-emnlp.607 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Energy and Carbon Considerations of Fine-Tuning BERT </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiaorong Wang</a>, <a href="/search/cs?searchtype=author&query=Na%2C+C">Clara Na</a>, <a href="/search/cs?searchtype=author&query=Strubell%2C+E">Emma Strubell</a>, <a href="/search/cs?searchtype=author&query=Friedler%2C+S">Sorelle Friedler</a>, <a href="/search/cs?searchtype=author&query=Luccioni%2C+S">Sasha Luccioni</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.10267v2-abstract-short" style="display: inline;"> Despite the popularity of the `pre-train then fine-tune' paradigm in the NLP community, existing work quantifying energy costs and associated carbon emissions has largely focused on language model pre-training. Although a single pre-training run draws substantially more energy than fine-tuning, fine-tuning is performed more frequently by many more individual actors, and thus must be accounted for… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.10267v2-abstract-full').style.display = 'inline'; document.getElementById('2311.10267v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.10267v2-abstract-full" style="display: none;"> Despite the popularity of the `pre-train then fine-tune' paradigm in the NLP community, existing work quantifying energy costs and associated carbon emissions has largely focused on language model pre-training. Although a single pre-training run draws substantially more energy than fine-tuning, fine-tuning is performed more frequently by many more individual actors, and thus must be accounted for when considering the energy and carbon footprint of NLP. In order to better characterize the role of fine-tuning in the landscape of energy and carbon emissions in NLP, we perform a careful empirical study of the computational costs of fine-tuning across tasks, datasets, hardware infrastructure and measurement modalities. Our experimental results allow us to place fine-tuning energy and carbon costs into perspective with respect to pre-training and inference, and outline recommendations to NLP researchers and practitioners who wish to improve their fine-tuning energy efficiency. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.10267v2-abstract-full').style.display = 'none'; document.getElementById('2311.10267v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">EMNLP 2023 Findings; First two authors contributed equally; 12 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.07715">arXiv:2310.07715</a> <span> [<a href="https://arxiv.org/pdf/2310.07715">pdf</a>, <a href="https://arxiv.org/format/2310.07715">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.18653/v1/2023.emnlp-main.822">10.18653/v1/2023.emnlp-main.822 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> To Build Our Future, We Must Know Our Past: Contextualizing Paradigm Shifts in Natural Language Processing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gururaja%2C+S">Sireesh Gururaja</a>, <a href="/search/cs?searchtype=author&query=Bertsch%2C+A">Amanda Bertsch</a>, <a href="/search/cs?searchtype=author&query=Na%2C+C">Clara Na</a>, <a href="/search/cs?searchtype=author&query=Widder%2C+D+G">David Gray Widder</a>, <a href="/search/cs?searchtype=author&query=Strubell%2C+E">Emma Strubell</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.07715v1-abstract-short" style="display: inline;"> NLP is in a period of disruptive change that is impacting our methodologies, funding sources, and public perception. In this work, we seek to understand how to shape our future by better understanding our past. We study factors that shape NLP as a field, including culture, incentives, and infrastructure by conducting long-form interviews with 26 NLP researchers of varying seniority, research area,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.07715v1-abstract-full').style.display = 'inline'; document.getElementById('2310.07715v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.07715v1-abstract-full" style="display: none;"> NLP is in a period of disruptive change that is impacting our methodologies, funding sources, and public perception. In this work, we seek to understand how to shape our future by better understanding our past. We study factors that shape NLP as a field, including culture, incentives, and infrastructure by conducting long-form interviews with 26 NLP researchers of varying seniority, research area, institution, and social identity. Our interviewees identify cyclical patterns in the field, as well as new shifts without historical parallel, including changes in benchmark culture and software infrastructure. We complement this discussion with quantitative analysis of citation, authorship, and language use in the ACL Anthology over time. We conclude by discussing shared visions, concerns, and hopes for the future of NLP. We hope that this study of our field's past and present can prompt informed discussion of our community's implicit norms and more deliberate action to consciously shape the future. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.07715v1-abstract-full').style.display = 'none'; document.getElementById('2310.07715v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to EMNLP 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.05674">arXiv:2310.05674</a> <span> [<a href="https://arxiv.org/pdf/2310.05674">pdf</a>, <a href="https://arxiv.org/format/2310.05674">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Making Scalable Meta Learning Practical </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Choe%2C+S+K">Sang Keun Choe</a>, <a href="/search/cs?searchtype=author&query=Mehta%2C+S+V">Sanket Vaibhav Mehta</a>, <a href="/search/cs?searchtype=author&query=Ahn%2C+H">Hwijeen Ahn</a>, <a href="/search/cs?searchtype=author&query=Neiswanger%2C+W">Willie Neiswanger</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+P">Pengtao Xie</a>, <a href="/search/cs?searchtype=author&query=Strubell%2C+E">Emma Strubell</a>, <a href="/search/cs?searchtype=author&query=Xing%2C+E">Eric Xing</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.05674v2-abstract-short" style="display: inline;"> Despite its flexibility to learn diverse inductive biases in machine learning programs, meta learning (i.e., learning to learn) has long been recognized to suffer from poor scalability due to its tremendous compute/memory costs, training instability, and a lack of efficient distributed training support. In this work, we focus on making scalable meta learning practical by introducing SAMA, which co… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.05674v2-abstract-full').style.display = 'inline'; document.getElementById('2310.05674v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.05674v2-abstract-full" style="display: none;"> Despite its flexibility to learn diverse inductive biases in machine learning programs, meta learning (i.e., learning to learn) has long been recognized to suffer from poor scalability due to its tremendous compute/memory costs, training instability, and a lack of efficient distributed training support. In this work, we focus on making scalable meta learning practical by introducing SAMA, which combines advances in both implicit differentiation algorithms and systems. Specifically, SAMA is designed to flexibly support a broad range of adaptive optimizers in the base level of meta learning programs, while reducing computational burden by avoiding explicit computation of second-order gradient information, and exploiting efficient distributed training techniques implemented for first-order gradients. Evaluated on multiple large-scale meta learning benchmarks, SAMA showcases up to 1.7/4.8x increase in throughput and 2.0/3.8x decrease in memory consumption respectively on single-/multi-GPU setups compared to other baseline meta learning algorithms. Furthermore, we show that SAMA-based data optimization leads to consistent improvements in text classification accuracy with BERT and RoBERTa large language models, and achieves state-of-the-art results in both small- and large-scale data pruning on image classification tasks, demonstrating the practical applicability of scalable meta learning across language and vision domains. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.05674v2-abstract-full').style.display = 'none'; document.getElementById('2310.05674v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.09701">arXiv:2307.09701</a> <span> [<a href="https://arxiv.org/pdf/2307.09701">pdf</a>, <a href="https://arxiv.org/format/2307.09701">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Efficiency Pentathlon: A Standardized Arena for Efficiency Evaluation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Peng%2C+H">Hao Peng</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Q">Qingqing Cao</a>, <a href="/search/cs?searchtype=author&query=Dodge%2C+J">Jesse Dodge</a>, <a href="/search/cs?searchtype=author&query=Peters%2C+M+E">Matthew E. Peters</a>, <a href="/search/cs?searchtype=author&query=Fernandez%2C+J">Jared Fernandez</a>, <a href="/search/cs?searchtype=author&query=Sherborne%2C+T">Tom Sherborne</a>, <a href="/search/cs?searchtype=author&query=Lo%2C+K">Kyle Lo</a>, <a href="/search/cs?searchtype=author&query=Skjonsberg%2C+S">Sam Skjonsberg</a>, <a href="/search/cs?searchtype=author&query=Strubell%2C+E">Emma Strubell</a>, <a href="/search/cs?searchtype=author&query=Plessas%2C+D">Darrell Plessas</a>, <a href="/search/cs?searchtype=author&query=Beltagy%2C+I">Iz Beltagy</a>, <a href="/search/cs?searchtype=author&query=Walsh%2C+E+P">Evan Pete Walsh</a>, <a href="/search/cs?searchtype=author&query=Smith%2C+N+A">Noah A. Smith</a>, <a href="/search/cs?searchtype=author&query=Hajishirzi%2C+H">Hannaneh Hajishirzi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.09701v1-abstract-short" style="display: inline;"> Rising computational demands of modern natural language processing (NLP) systems have increased the barrier to entry for cutting-edge research while posing serious environmental concerns. Yet, progress on model efficiency has been impeded by practical challenges in model evaluation and comparison. For example, hardware is challenging to control due to disparate levels of accessibility across diffe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.09701v1-abstract-full').style.display = 'inline'; document.getElementById('2307.09701v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.09701v1-abstract-full" style="display: none;"> Rising computational demands of modern natural language processing (NLP) systems have increased the barrier to entry for cutting-edge research while posing serious environmental concerns. Yet, progress on model efficiency has been impeded by practical challenges in model evaluation and comparison. For example, hardware is challenging to control due to disparate levels of accessibility across different institutions. Moreover, improvements in metrics such as FLOPs often fail to translate to progress in real-world applications. In response, we introduce Pentathlon, a benchmark for holistic and realistic evaluation of model efficiency. Pentathlon focuses on inference, which accounts for a majority of the compute in a model's lifecycle. It offers a strictly-controlled hardware platform, and is designed to mirror real-world applications scenarios. It incorporates a suite of metrics that target different aspects of efficiency, including latency, throughput, memory overhead, and energy consumption. Pentathlon also comes with a software library that can be seamlessly integrated into any codebase and enable evaluation. As a standardized and centralized evaluation platform, Pentathlon can drastically reduce the workload to make fair and reproducible efficiency comparisons. While initially focused on natural language processing (NLP) models, Pentathlon is designed to allow flexible extension to other fields. We envision Pentathlon will stimulate algorithmic innovations in building efficient models, and foster an increased awareness of the social and environmental implications in the development of future-generation NLP models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.09701v1-abstract-full').style.display = 'none'; document.getElementById('2307.09701v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.00101">arXiv:2307.00101</a> <span> [<a href="https://arxiv.org/pdf/2307.00101">pdf</a>, <a href="https://arxiv.org/format/2307.00101">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Queer People are People First: Deconstructing Sexual Identity Stereotypes in Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dhingra%2C+H">Harnoor Dhingra</a>, <a href="/search/cs?searchtype=author&query=Jayashanker%2C+P">Preetiha Jayashanker</a>, <a href="/search/cs?searchtype=author&query=Moghe%2C+S">Sayali Moghe</a>, <a href="/search/cs?searchtype=author&query=Strubell%2C+E">Emma Strubell</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.00101v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) are trained primarily on minimally processed web text, which exhibits the same wide range of social biases held by the humans who created that content. Consequently, text generated by LLMs can inadvertently perpetuate stereotypes towards marginalized groups, like the LGBTQIA+ community. In this paper, we perform a comparative study of how LLMs generate text describing… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.00101v1-abstract-full').style.display = 'inline'; document.getElementById('2307.00101v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.00101v1-abstract-full" style="display: none;"> Large Language Models (LLMs) are trained primarily on minimally processed web text, which exhibits the same wide range of social biases held by the humans who created that content. Consequently, text generated by LLMs can inadvertently perpetuate stereotypes towards marginalized groups, like the LGBTQIA+ community. In this paper, we perform a comparative study of how LLMs generate text describing people with different sexual identities. Analyzing bias in the text generated by an LLM using regard score shows measurable bias against queer people. We then show that a post-hoc method based on chain-of-thought prompting using SHAP analysis can increase the regard of the sentence, representing a promising approach towards debiasing the output of LLMs in this setting. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.00101v1-abstract-full').style.display = 'none'; document.getElementById('2307.00101v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to Queer in AI Workshop at ACL 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.16900">arXiv:2306.16900</a> <span> [<a href="https://arxiv.org/pdf/2306.16900">pdf</a>, <a href="https://arxiv.org/format/2306.16900">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Surveying (Dis)Parities and Concerns of Compute Hungry NLP Research </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lee%2C+J">Ji-Ung Lee</a>, <a href="/search/cs?searchtype=author&query=Puerto%2C+H">Haritz Puerto</a>, <a href="/search/cs?searchtype=author&query=van+Aken%2C+B">Betty van Aken</a>, <a href="/search/cs?searchtype=author&query=Arase%2C+Y">Yuki Arase</a>, <a href="/search/cs?searchtype=author&query=Forde%2C+J+Z">Jessica Zosa Forde</a>, <a href="/search/cs?searchtype=author&query=Derczynski%2C+L">Leon Derczynski</a>, <a href="/search/cs?searchtype=author&query=R%C3%BCckl%C3%A9%2C+A">Andreas R眉ckl茅</a>, <a href="/search/cs?searchtype=author&query=Gurevych%2C+I">Iryna Gurevych</a>, <a href="/search/cs?searchtype=author&query=Schwartz%2C+R">Roy Schwartz</a>, <a href="/search/cs?searchtype=author&query=Strubell%2C+E">Emma Strubell</a>, <a href="/search/cs?searchtype=author&query=Dodge%2C+J">Jesse Dodge</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.16900v2-abstract-short" style="display: inline;"> Many recent improvements in NLP stem from the development and use of large pre-trained language models (PLMs) with billions of parameters. Large model sizes makes computational cost one of the main limiting factors for training and evaluating such models; and has raised severe concerns about the sustainability, reproducibility, and inclusiveness for researching PLMs. These concerns are often based… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.16900v2-abstract-full').style.display = 'inline'; document.getElementById('2306.16900v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.16900v2-abstract-full" style="display: none;"> Many recent improvements in NLP stem from the development and use of large pre-trained language models (PLMs) with billions of parameters. Large model sizes makes computational cost one of the main limiting factors for training and evaluating such models; and has raised severe concerns about the sustainability, reproducibility, and inclusiveness for researching PLMs. These concerns are often based on personal experiences and observations. However, there had not been any large-scale surveys that investigate them. In this work, we provide a first attempt to quantify these concerns regarding three topics, namely, environmental impact, equity, and impact on peer reviewing. By conducting a survey with 312 participants from the NLP community, we capture existing (dis)parities between different and within groups with respect to seniority, academia, and industry; and their impact on the peer reviewing process. For each topic, we provide an analysis and devise recommendations to mitigate found disparities, some of which already successfully implemented. Finally, we discuss additional concerns raised by many participants in free-text responses. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.16900v2-abstract-full').style.display = 'none'; document.getElementById('2306.16900v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.14864">arXiv:2305.14864</a> <span> [<a href="https://arxiv.org/pdf/2305.14864">pdf</a>, <a href="https://arxiv.org/format/2305.14864">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Just CHOP: Embarrassingly Simple LLM Compression </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jha%2C+A+H">Ananya Harsh Jha</a>, <a href="/search/cs?searchtype=author&query=Sherborne%2C+T">Tom Sherborne</a>, <a href="/search/cs?searchtype=author&query=Walsh%2C+E+P">Evan Pete Walsh</a>, <a href="/search/cs?searchtype=author&query=Groeneveld%2C+D">Dirk Groeneveld</a>, <a href="/search/cs?searchtype=author&query=Strubell%2C+E">Emma Strubell</a>, <a href="/search/cs?searchtype=author&query=Beltagy%2C+I">Iz Beltagy</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.14864v3-abstract-short" style="display: inline;"> Large language models (LLMs) enable unparalleled few- and zero-shot reasoning capabilities but at a high computational footprint. A growing assortment of methods for compression promises to reduce the computational burden of LLMs in deployment, but so far, only quantization approaches have been demonstrated to be effective for LLM compression while maintaining zero-shot performance. A critical ste… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.14864v3-abstract-full').style.display = 'inline'; document.getElementById('2305.14864v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.14864v3-abstract-full" style="display: none;"> Large language models (LLMs) enable unparalleled few- and zero-shot reasoning capabilities but at a high computational footprint. A growing assortment of methods for compression promises to reduce the computational burden of LLMs in deployment, but so far, only quantization approaches have been demonstrated to be effective for LLM compression while maintaining zero-shot performance. A critical step in the compression process, the pretrain-then-finetune paradigm, has largely been overlooked when adapting existing pruning strategies to LLMs or proposing new ones. In this work, we show that embarrassingly simple layer pruning coupled with an extended language model pretraining as the finetuning phase produces state-of-the-art results against structured and even semi-structured compression of models at a 7B scale while being more inference efficient. We call this method LayerChop, where we deterministically remove layers from a model followed by task-agnostic finetuning of the remaining weights by continued self-supervised pretraining. At this scale, we also show how distillation, which has been super effective in task-agnostic compression of smaller BERT-style models, becomes inefficient against our simple pruning technique. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.14864v3-abstract-full').style.display = 'none'; document.getElementById('2305.14864v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 6 figures, 6 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.12634">arXiv:2305.12634</a> <span> [<a href="https://arxiv.org/pdf/2305.12634">pdf</a>, <a href="https://arxiv.org/format/2305.12634">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Data-efficient Active Learning for Structured Prediction with Partial Annotation and Self-Training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhisong Zhang</a>, <a href="/search/cs?searchtype=author&query=Strubell%2C+E">Emma Strubell</a>, <a href="/search/cs?searchtype=author&query=Hovy%2C+E">Eduard Hovy</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.12634v2-abstract-short" style="display: inline;"> In this work we propose a pragmatic method that reduces the annotation cost for structured label spaces using active learning. Our approach leverages partial annotation, which reduces labeling costs for structured outputs by selecting only the most informative sub-structures for annotation. We also utilize self-training to incorporate the current model's automatic predictions as pseudo-labels for… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.12634v2-abstract-full').style.display = 'inline'; document.getElementById('2305.12634v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.12634v2-abstract-full" style="display: none;"> In this work we propose a pragmatic method that reduces the annotation cost for structured label spaces using active learning. Our approach leverages partial annotation, which reduces labeling costs for structured outputs by selecting only the most informative sub-structures for annotation. We also utilize self-training to incorporate the current model's automatic predictions as pseudo-labels for un-annotated sub-structures. A key challenge in effectively combining partial annotation with self-training to reduce annotation cost is determining which sub-structures to select to label. To address this challenge, we adopt an error estimator to adaptively decide the partial selection ratio according to the current model's capability. In evaluations spanning four structured prediction tasks, we show that our combination of partial annotation and self-training using an adaptive selection ratio reduces annotation cost over strong full annotation baselines under a fair comparison scheme that takes reading time into consideration. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.12634v2-abstract-full').style.display = 'none'; document.getElementById('2305.12634v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Findings of EMNLP 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.00131">arXiv:2305.00131</a> <span> [<a href="https://arxiv.org/pdf/2305.00131">pdf</a>, <a href="https://arxiv.org/format/2305.00131">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Regularizing Self-training for Unsupervised Domain Adaptation via Structural Constraints </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Das%2C+R">Rajshekhar Das</a>, <a href="/search/cs?searchtype=author&query=Francis%2C+J">Jonathan Francis</a>, <a href="/search/cs?searchtype=author&query=Mehta%2C+S+V">Sanket Vaibhav Mehta</a>, <a href="/search/cs?searchtype=author&query=Oh%2C+J">Jean Oh</a>, <a href="/search/cs?searchtype=author&query=Strubell%2C+E">Emma Strubell</a>, <a href="/search/cs?searchtype=author&query=Moura%2C+J">Jose Moura</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.00131v1-abstract-short" style="display: inline;"> Self-training based on pseudo-labels has emerged as a dominant approach for addressing conditional distribution shifts in unsupervised domain adaptation (UDA) for semantic segmentation problems. A notable drawback, however, is that this family of approaches is susceptible to erroneous pseudo labels that arise from confirmation biases in the source domain and that manifest as nuisance factors in th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.00131v1-abstract-full').style.display = 'inline'; document.getElementById('2305.00131v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.00131v1-abstract-full" style="display: none;"> Self-training based on pseudo-labels has emerged as a dominant approach for addressing conditional distribution shifts in unsupervised domain adaptation (UDA) for semantic segmentation problems. A notable drawback, however, is that this family of approaches is susceptible to erroneous pseudo labels that arise from confirmation biases in the source domain and that manifest as nuisance factors in the target domain. A possible source for this mismatch is the reliance on only photometric cues provided by RGB image inputs, which may ultimately lead to sub-optimal adaptation. To mitigate the effect of mismatched pseudo-labels, we propose to incorporate structural cues from auxiliary modalities, such as depth, to regularise conventional self-training objectives. Specifically, we introduce a contrastive pixel-level objectness constraint that pulls the pixel representations within a region of an object instance closer, while pushing those from different object categories apart. To obtain object regions consistent with the true underlying object, we extract information from both depth maps and RGB-images in the form of multimodal clustering. Crucially, the objectness constraint is agnostic to the ground-truth semantic labels and, hence, appropriate for unsupervised domain adaptation. In this work, we show that our regularizer significantly improves top performing self-training methods (by up to $2$ points) in various UDA benchmarks for semantic segmentation. We include all code in the supplementary. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.00131v1-abstract-full').style.display = 'none'; document.getElementById('2305.00131v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2302.06117">arXiv:2302.06117</a> <span> [<a href="https://arxiv.org/pdf/2302.06117">pdf</a>, <a href="https://arxiv.org/format/2302.06117">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> The Framework Tax: Disparities Between Inference Efficiency in NLP Research and Deployment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fernandez%2C+J">Jared Fernandez</a>, <a href="/search/cs?searchtype=author&query=Kahn%2C+J">Jacob Kahn</a>, <a href="/search/cs?searchtype=author&query=Na%2C+C">Clara Na</a>, <a href="/search/cs?searchtype=author&query=Bisk%2C+Y">Yonatan Bisk</a>, <a href="/search/cs?searchtype=author&query=Strubell%2C+E">Emma Strubell</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2302.06117v2-abstract-short" style="display: inline;"> Increased focus on the computational efficiency of NLP systems has motivated the design of efficient model architectures and improvements to underlying hardware accelerators. However, the resulting increases in computational throughput and reductions in floating point operations have not directly translated to improvements in wall-clock inference latency. We demonstrate that these discrepancies ca… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.06117v2-abstract-full').style.display = 'inline'; document.getElementById('2302.06117v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2302.06117v2-abstract-full" style="display: none;"> Increased focus on the computational efficiency of NLP systems has motivated the design of efficient model architectures and improvements to underlying hardware accelerators. However, the resulting increases in computational throughput and reductions in floating point operations have not directly translated to improvements in wall-clock inference latency. We demonstrate that these discrepancies can be largely attributed to bottlenecks introduced by deep learning frameworks. We denote this phenomenon as the \textit{framework tax}, and observe that the disparity is growing as hardware speed increases over time. In this work, we examine this phenomenon through a series of case studies analyzing the effects of model design decisions, framework paradigms, and hardware platforms on total model latency. Code is available at https://github.com/JaredFern/Framework-Tax. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.06117v2-abstract-full').style.display = 'none'; document.getElementById('2302.06117v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">EMNLP 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2212.10381">arXiv:2212.10381</a> <span> [<a href="https://arxiv.org/pdf/2212.10381">pdf</a>, <a href="https://arxiv.org/format/2212.10381">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> To Adapt or to Annotate: Challenges and Interventions for Domain Adaptation in Open-Domain Question Answering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dua%2C+D">Dheeru Dua</a>, <a href="/search/cs?searchtype=author&query=Strubell%2C+E">Emma Strubell</a>, <a href="/search/cs?searchtype=author&query=Singh%2C+S">Sameer Singh</a>, <a href="/search/cs?searchtype=author&query=Verga%2C+P">Pat Verga</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2212.10381v1-abstract-short" style="display: inline;"> Recent advances in open-domain question answering (ODQA) have demonstrated impressive accuracy on standard Wikipedia style benchmarks. However, it is less clear how robust these models are and how well they perform when applied to real-world applications in drastically different domains. While there has been some work investigating how well ODQA models perform when tested for out-of-domain (OOD) g… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.10381v1-abstract-full').style.display = 'inline'; document.getElementById('2212.10381v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2212.10381v1-abstract-full" style="display: none;"> Recent advances in open-domain question answering (ODQA) have demonstrated impressive accuracy on standard Wikipedia style benchmarks. However, it is less clear how robust these models are and how well they perform when applied to real-world applications in drastically different domains. While there has been some work investigating how well ODQA models perform when tested for out-of-domain (OOD) generalization, these studies have been conducted only under conservative shifts in data distribution and typically focus on a single component (ie. retrieval) rather than an end-to-end system. In response, we propose a more realistic and challenging domain shift evaluation setting and, through extensive experiments, study end-to-end model performance. We find that not only do models fail to generalize, but high retrieval scores often still yield poor answer prediction accuracy. We then categorize different types of shifts and propose techniques that, when presented with a new dataset, predict if intervention methods are likely to be successful. Finally, using insights from this analysis, we propose and evaluate several intervention methods which improve end-to-end answer F1 score by up to 24 points. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.10381v1-abstract-full').style.display = 'none'; document.getElementById('2212.10381v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2212.09744">arXiv:2212.09744</a> <span> [<a href="https://arxiv.org/pdf/2212.09744">pdf</a>, <a href="https://arxiv.org/format/2212.09744">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> DSI++: Updating Transformer Memory with New Documents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mehta%2C+S+V">Sanket Vaibhav Mehta</a>, <a href="/search/cs?searchtype=author&query=Gupta%2C+J">Jai Gupta</a>, <a href="/search/cs?searchtype=author&query=Tay%2C+Y">Yi Tay</a>, <a href="/search/cs?searchtype=author&query=Dehghani%2C+M">Mostafa Dehghani</a>, <a href="/search/cs?searchtype=author&query=Tran%2C+V+Q">Vinh Q. Tran</a>, <a href="/search/cs?searchtype=author&query=Rao%2C+J">Jinfeng Rao</a>, <a href="/search/cs?searchtype=author&query=Najork%2C+M">Marc Najork</a>, <a href="/search/cs?searchtype=author&query=Strubell%2C+E">Emma Strubell</a>, <a href="/search/cs?searchtype=author&query=Metzler%2C+D">Donald Metzler</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2212.09744v3-abstract-short" style="display: inline;"> Differentiable Search Indices (DSIs) encode a corpus of documents in model parameters and use the same model to answer user queries directly. Despite the strong performance of DSI models, deploying them in situations where the corpus changes over time is computationally expensive because reindexing the corpus requires re-training the model. In this work, we introduce DSI++, a continual learning ch… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.09744v3-abstract-full').style.display = 'inline'; document.getElementById('2212.09744v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2212.09744v3-abstract-full" style="display: none;"> Differentiable Search Indices (DSIs) encode a corpus of documents in model parameters and use the same model to answer user queries directly. Despite the strong performance of DSI models, deploying them in situations where the corpus changes over time is computationally expensive because reindexing the corpus requires re-training the model. In this work, we introduce DSI++, a continual learning challenge for DSI to incrementally index new documents while being able to answer queries related to both previously and newly indexed documents. Across different model scales and document identifier representations, we show that continual indexing of new documents leads to considerable forgetting of previously indexed documents. We also hypothesize and verify that the model experiences forgetting events during training, leading to unstable learning. To mitigate these issues, we investigate two approaches. The first focuses on modifying the training dynamics. Flatter minima implicitly alleviate forgetting, so we optimize for flatter loss basins and show that the model stably memorizes more documents ($+12\%$). Next, we introduce a generative memory to sample pseudo-queries for documents and supplement them during continual indexing to prevent forgetting for the retrieval task. Extensive experiments on novel continual indexing benchmarks based on Natural Questions (NQ) and MS MARCO demonstrate that our proposed solution mitigates forgetting significantly. Concretely, it improves the average Hits@10 by $+21.1\%$ over competitive baselines for NQ and requires $6$ times fewer model updates compared to re-training the DSI model for incrementally indexing five corpora in a sequence. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.09744v3-abstract-full').style.display = 'none'; document.getElementById('2212.09744v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at EMNLP 2023 main conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2212.05603">arXiv:2212.05603</a> <span> [<a href="https://arxiv.org/pdf/2212.05603">pdf</a>, <a href="https://arxiv.org/format/2212.05603">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Error-aware Quantization through Noise Tempering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zheng Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J+B">Juncheng B Li</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+S">Shuhui Qu</a>, <a href="/search/cs?searchtype=author&query=Metze%2C+F">Florian Metze</a>, <a href="/search/cs?searchtype=author&query=Strubell%2C+E">Emma Strubell</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2212.05603v1-abstract-short" style="display: inline;"> Quantization has become a predominant approach for model compression, enabling deployment of large models trained on GPUs onto smaller form-factor devices for inference. Quantization-aware training (QAT) optimizes model parameters with respect to the end task while simulating quantization error, leading to better performance than post-training quantization. Approximation of gradients through the n… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.05603v1-abstract-full').style.display = 'inline'; document.getElementById('2212.05603v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2212.05603v1-abstract-full" style="display: none;"> Quantization has become a predominant approach for model compression, enabling deployment of large models trained on GPUs onto smaller form-factor devices for inference. Quantization-aware training (QAT) optimizes model parameters with respect to the end task while simulating quantization error, leading to better performance than post-training quantization. Approximation of gradients through the non-differentiable quantization operator is typically achieved using the straight-through estimator (STE) or additive noise. However, STE-based methods suffer from instability due to biased gradients, whereas existing noise-based methods cannot reduce the resulting variance. In this work, we incorporate exponentially decaying quantization-error-aware noise together with a learnable scale of task loss gradient to approximate the effect of a quantization operator. We show this method combines gradient scale and quantization noise in a better optimized way, providing finer-grained estimation of gradients at each weight and activation layer's quantizer bin size. Our controlled noise also contains an implicit curvature term that could encourage flatter minima, which we show is indeed the case in our experiments. Experiments training ResNet architectures on the CIFAR-10, CIFAR-100 and ImageNet benchmarks show that our method obtains state-of-the-art top-1 classification accuracy for uniform (non mixed-precision) quantization, out-performing previous methods by 0.5-1.2% absolute. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.05603v1-abstract-full').style.display = 'none'; document.getElementById('2212.05603v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.04256">arXiv:2211.04256</a> <span> [<a href="https://arxiv.org/pdf/2211.04256">pdf</a>, <a href="https://arxiv.org/format/2211.04256">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Bridging Fairness and Environmental Sustainability in Natural Language Processing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hessenthaler%2C+M">Marius Hessenthaler</a>, <a href="/search/cs?searchtype=author&query=Strubell%2C+E">Emma Strubell</a>, <a href="/search/cs?searchtype=author&query=Hovy%2C+D">Dirk Hovy</a>, <a href="/search/cs?searchtype=author&query=Lauscher%2C+A">Anne Lauscher</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.04256v1-abstract-short" style="display: inline;"> Fairness and environmental impact are important research directions for the sustainable development of artificial intelligence. However, while each topic is an active research area in natural language processing (NLP), there is a surprising lack of research on the interplay between the two fields. This lacuna is highly problematic, since there is increasing evidence that an exclusive focus on fair… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.04256v1-abstract-full').style.display = 'inline'; document.getElementById('2211.04256v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.04256v1-abstract-full" style="display: none;"> Fairness and environmental impact are important research directions for the sustainable development of artificial intelligence. However, while each topic is an active research area in natural language processing (NLP), there is a surprising lack of research on the interplay between the two fields. This lacuna is highly problematic, since there is increasing evidence that an exclusive focus on fairness can actually hinder environmental sustainability, and vice versa. In this work, we shed light on this crucial intersection in NLP by (1) investigating the efficiency of current fairness approaches through surveying example methods for reducing unfair stereotypical bias from the literature, and (2) evaluating a common technique to reduce energy consumption (and thus environmental impact) of English NLP models, knowledge distillation (KD), for its impact on fairness. In this case study, we evaluate the effect of important KD factors, including layer and dimensionality reduction, with respect to: (a) performance on the distillation task (natural language inference and semantic similarity prediction), and (b) multiple measures and dimensions of stereotypical bias (e.g., gender bias measured via the Word Embedding Association Test). Our results lead us to clarify current assumptions regarding the effect of KD on unfair bias: contrary to other findings, we show that KD can actually decrease model fairness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.04256v1-abstract-full').style.display = 'none'; document.getElementById('2211.04256v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for publication at EMNLP 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2210.10109">arXiv:2210.10109</a> <span> [<a href="https://arxiv.org/pdf/2210.10109">pdf</a>, <a href="https://arxiv.org/format/2210.10109">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> A Survey of Active Learning for Natural Language Processing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhisong Zhang</a>, <a href="/search/cs?searchtype=author&query=Strubell%2C+E">Emma Strubell</a>, <a href="/search/cs?searchtype=author&query=Hovy%2C+E">Eduard Hovy</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2210.10109v2-abstract-short" style="display: inline;"> In this work, we provide a survey of active learning (AL) for its applications in natural language processing (NLP). In addition to a fine-grained categorization of query strategies, we also investigate several other important aspects of applying AL to NLP problems. These include AL for structured prediction tasks, annotation cost, model learning (especially with deep neural models), and starting… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.10109v2-abstract-full').style.display = 'inline'; document.getElementById('2210.10109v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2210.10109v2-abstract-full" style="display: none;"> In this work, we provide a survey of active learning (AL) for its applications in natural language processing (NLP). In addition to a fine-grained categorization of query strategies, we also investigate several other important aspects of applying AL to NLP problems. These include AL for structured prediction tasks, annotation cost, model learning (especially with deep neural models), and starting and stopping AL. Finally, we conclude with a discussion of related topics and future directions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.10109v2-abstract-full').style.display = 'none'; document.getElementById('2210.10109v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">EMNLP 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2210.07602">arXiv:2210.07602</a> <span> [<a href="https://arxiv.org/pdf/2210.07602">pdf</a>, <a href="https://arxiv.org/format/2210.07602">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Mention Annotations Alone Enable Efficient Domain Adaptation for Coreference Resolution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gandhi%2C+N">Nupoor Gandhi</a>, <a href="/search/cs?searchtype=author&query=Field%2C+A">Anjalie Field</a>, <a href="/search/cs?searchtype=author&query=Strubell%2C+E">Emma Strubell</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2210.07602v2-abstract-short" style="display: inline;"> Although recent neural models for coreference resolution have led to substantial improvements on benchmark datasets, transferring these models to new target domains containing out-of-vocabulary spans and requiring differing annotation schemes remains challenging. Typical approaches involve continued training on annotated target-domain data, but obtaining annotations is costly and time-consuming. W… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.07602v2-abstract-full').style.display = 'inline'; document.getElementById('2210.07602v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2210.07602v2-abstract-full" style="display: none;"> Although recent neural models for coreference resolution have led to substantial improvements on benchmark datasets, transferring these models to new target domains containing out-of-vocabulary spans and requiring differing annotation schemes remains challenging. Typical approaches involve continued training on annotated target-domain data, but obtaining annotations is costly and time-consuming. We show that annotating mentions alone is nearly twice as fast as annotating full coreference chains. Accordingly, we propose a method for efficiently adapting coreference models, which includes a high-precision mention detection objective and requires annotating only mentions in the target domain. Extensive evaluation across three English coreference datasets: CoNLL-2012 (news/conversation), i2b2/VA (medical notes), and previously unstudied child welfare notes, reveals that our approach facilitates annotation-efficient transfer and results in a 7-14% improvement in average F1 without increasing annotator time. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.07602v2-abstract-full').style.display = 'none'; document.getElementById('2210.07602v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2210.07171">arXiv:2210.07171</a> <span> [<a href="https://arxiv.org/pdf/2210.07171">pdf</a>, <a href="https://arxiv.org/format/2210.07171">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> SQuAT: Sharpness- and Quantization-Aware Training for BERT </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zheng Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J+B">Juncheng B Li</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+S">Shuhui Qu</a>, <a href="/search/cs?searchtype=author&query=Metze%2C+F">Florian Metze</a>, <a href="/search/cs?searchtype=author&query=Strubell%2C+E">Emma Strubell</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2210.07171v1-abstract-short" style="display: inline;"> Quantization is an effective technique to reduce memory footprint, inference latency, and power consumption of deep learning models. However, existing quantization methods suffer from accuracy degradation compared to full-precision (FP) models due to the errors introduced by coarse gradient estimation through non-differentiable quantization layers. The existence of sharp local minima in the loss l… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.07171v1-abstract-full').style.display = 'inline'; document.getElementById('2210.07171v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2210.07171v1-abstract-full" style="display: none;"> Quantization is an effective technique to reduce memory footprint, inference latency, and power consumption of deep learning models. However, existing quantization methods suffer from accuracy degradation compared to full-precision (FP) models due to the errors introduced by coarse gradient estimation through non-differentiable quantization layers. The existence of sharp local minima in the loss landscapes of overparameterized models (e.g., Transformers) tends to aggravate such performance penalty in low-bit (2, 4 bits) settings. In this work, we propose sharpness- and quantization-aware training (SQuAT), which would encourage the model to converge to flatter minima while performing quantization-aware training. Our proposed method alternates training between sharpness objective and step-size objective, which could potentially let the model learn the most suitable parameter update magnitude to reach convergence near-flat minima. Extensive experiments show that our method can consistently outperform state-of-the-art quantized BERT models under 2, 3, and 4-bit settings on GLUE benchmarks by 1%, and can sometimes even outperform full precision (32-bit) models. Our experiments on empirical measurement of sharpness also suggest that our method would lead to flatter minima compared to other quantization methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.07171v1-abstract-full').style.display = 'none'; document.getElementById('2210.07171v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2209.00099">arXiv:2209.00099</a> <span> [<a href="https://arxiv.org/pdf/2209.00099">pdf</a>, <a href="https://arxiv.org/format/2209.00099">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Efficient Methods for Natural Language Processing: A Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Treviso%2C+M">Marcos Treviso</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+J">Ji-Ung Lee</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+T">Tianchu Ji</a>, <a href="/search/cs?searchtype=author&query=van+Aken%2C+B">Betty van Aken</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Q">Qingqing Cao</a>, <a href="/search/cs?searchtype=author&query=Ciosici%2C+M+R">Manuel R. Ciosici</a>, <a href="/search/cs?searchtype=author&query=Hassid%2C+M">Michael Hassid</a>, <a href="/search/cs?searchtype=author&query=Heafield%2C+K">Kenneth Heafield</a>, <a href="/search/cs?searchtype=author&query=Hooker%2C+S">Sara Hooker</a>, <a href="/search/cs?searchtype=author&query=Raffel%2C+C">Colin Raffel</a>, <a href="/search/cs?searchtype=author&query=Martins%2C+P+H">Pedro H. Martins</a>, <a href="/search/cs?searchtype=author&query=Martins%2C+A+F+T">Andr茅 F. T. Martins</a>, <a href="/search/cs?searchtype=author&query=Forde%2C+J+Z">Jessica Zosa Forde</a>, <a href="/search/cs?searchtype=author&query=Milder%2C+P">Peter Milder</a>, <a href="/search/cs?searchtype=author&query=Simpson%2C+E">Edwin Simpson</a>, <a href="/search/cs?searchtype=author&query=Slonim%2C+N">Noam Slonim</a>, <a href="/search/cs?searchtype=author&query=Dodge%2C+J">Jesse Dodge</a>, <a href="/search/cs?searchtype=author&query=Strubell%2C+E">Emma Strubell</a>, <a href="/search/cs?searchtype=author&query=Balasubramanian%2C+N">Niranjan Balasubramanian</a>, <a href="/search/cs?searchtype=author&query=Derczynski%2C+L">Leon Derczynski</a>, <a href="/search/cs?searchtype=author&query=Gurevych%2C+I">Iryna Gurevych</a>, <a href="/search/cs?searchtype=author&query=Schwartz%2C+R">Roy Schwartz</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2209.00099v2-abstract-short" style="display: inline;"> Recent work in natural language processing (NLP) has yielded appealing results from scaling model parameters and training data; however, using only scale to improve performance means that resource consumption also grows. Such resources include data, time, storage, or energy, all of which are naturally limited and unevenly distributed. This motivates research into efficient methods that require few… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.00099v2-abstract-full').style.display = 'inline'; document.getElementById('2209.00099v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2209.00099v2-abstract-full" style="display: none;"> Recent work in natural language processing (NLP) has yielded appealing results from scaling model parameters and training data; however, using only scale to improve performance means that resource consumption also grows. Such resources include data, time, storage, or energy, all of which are naturally limited and unevenly distributed. This motivates research into efficient methods that require fewer resources to achieve similar results. This survey synthesizes and relates current methods and findings in efficient NLP. We aim to provide both guidance for conducting NLP under limited resources, and point towards promising research directions for developing more efficient methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.00099v2-abstract-full').style.display = 'none'; document.getElementById('2209.00099v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 August, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at TACL, pre publication version</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2206.05229">arXiv:2206.05229</a> <span> [<a href="https://arxiv.org/pdf/2206.05229">pdf</a>, <a href="https://arxiv.org/format/2206.05229">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Measuring the Carbon Intensity of AI in Cloud Instances </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dodge%2C+J">Jesse Dodge</a>, <a href="/search/cs?searchtype=author&query=Prewitt%2C+T">Taylor Prewitt</a>, <a href="/search/cs?searchtype=author&query=Combes%2C+R+T+D">Remi Tachet Des Combes</a>, <a href="/search/cs?searchtype=author&query=Odmark%2C+E">Erika Odmark</a>, <a href="/search/cs?searchtype=author&query=Schwartz%2C+R">Roy Schwartz</a>, <a href="/search/cs?searchtype=author&query=Strubell%2C+E">Emma Strubell</a>, <a href="/search/cs?searchtype=author&query=Luccioni%2C+A+S">Alexandra Sasha Luccioni</a>, <a href="/search/cs?searchtype=author&query=Smith%2C+N+A">Noah A. Smith</a>, <a href="/search/cs?searchtype=author&query=DeCario%2C+N">Nicole DeCario</a>, <a href="/search/cs?searchtype=author&query=Buchanan%2C+W">Will Buchanan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2206.05229v1-abstract-short" style="display: inline;"> By providing unprecedented access to computational resources, cloud computing has enabled rapid growth in technologies such as machine learning, the computational demands of which incur a high energy cost and a commensurate carbon footprint. As a result, recent scholarship has called for better estimates of the greenhouse gas impact of AI: data scientists today do not have easy or reliable access… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.05229v1-abstract-full').style.display = 'inline'; document.getElementById('2206.05229v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2206.05229v1-abstract-full" style="display: none;"> By providing unprecedented access to computational resources, cloud computing has enabled rapid growth in technologies such as machine learning, the computational demands of which incur a high energy cost and a commensurate carbon footprint. As a result, recent scholarship has called for better estimates of the greenhouse gas impact of AI: data scientists today do not have easy or reliable access to measurements of this information, precluding development of actionable tactics. Cloud providers presenting information about software carbon intensity to users is a fundamental stepping stone towards minimizing emissions. In this paper, we provide a framework for measuring software carbon intensity, and propose to measure operational carbon emissions by using location-based and time-specific marginal emissions data per energy unit. We provide measurements of operational software carbon intensity for a set of modern models for natural language processing and computer vision, and a wide range of model sizes, including pretraining of a 6.1 billion parameter language model. We then evaluate a suite of approaches for reducing emissions on the Microsoft Azure cloud compute platform: using cloud instances in different geographic regions, using cloud instances at different times of day, and dynamically pausing cloud instances when the marginal carbon intensity is above a certain threshold. We confirm previous results that the geographic region of the data center plays a significant role in the carbon intensity for a given cloud instance, and find that choosing an appropriate region can have the largest operational emissions reduction impact. We also show that the time of day has notable impact on operational software carbon intensity. Finally, we conclude with recommendations for how machine learning practitioners can use software carbon intensity information to reduce environmental impact. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.05229v1-abstract-full').style.display = 'none'; document.getElementById('2206.05229v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 June, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">In ACM Conference on Fairness, Accountability, and Transparency (ACM FAccT) 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2205.12694">arXiv:2205.12694</a> <span> [<a href="https://arxiv.org/pdf/2205.12694">pdf</a>, <a href="https://arxiv.org/format/2205.12694">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.18653/v1/2022.findings-emnlp.361">10.18653/v1/2022.findings-emnlp.361 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Train Flat, Then Compress: Sharpness-Aware Minimization Learns More Compressible Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Na%2C+C">Clara Na</a>, <a href="/search/cs?searchtype=author&query=Mehta%2C+S+V">Sanket Vaibhav Mehta</a>, <a href="/search/cs?searchtype=author&query=Strubell%2C+E">Emma Strubell</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2205.12694v2-abstract-short" style="display: inline;"> Model compression by way of parameter pruning, quantization, or distillation has recently gained popularity as an approach for reducing the computational requirements of modern deep neural network models for NLP. Inspired by prior works suggesting a connection between simpler, more generalizable models and those that lie within wider loss basins, we hypothesize that optimizing for flat minima shou… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.12694v2-abstract-full').style.display = 'inline'; document.getElementById('2205.12694v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2205.12694v2-abstract-full" style="display: none;"> Model compression by way of parameter pruning, quantization, or distillation has recently gained popularity as an approach for reducing the computational requirements of modern deep neural network models for NLP. Inspired by prior works suggesting a connection between simpler, more generalizable models and those that lie within wider loss basins, we hypothesize that optimizing for flat minima should lead to simpler parameterizations and thus more compressible models. We propose to combine sharpness-aware minimization (SAM) with various task-specific model compression methods, including iterative magnitude pruning (IMP), structured pruning with a distillation objective, and post-training dynamic quantization. Empirically, we show that optimizing for flatter minima consistently leads to greater compressibility of parameters compared to vanilla Adam when fine-tuning BERT models, with little to no loss in accuracy on the GLUE text classification and SQuAD question answering benchmarks. Moreover, SAM finds superior winning tickets during IMP that 1) are amenable to vanilla Adam optimization, and 2) transfer more effectively across tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.12694v2-abstract-full').style.display = 'none'; document.getElementById('2205.12694v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">EMNLP 2022 Findings, 28 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2112.09153">arXiv:2112.09153</a> <span> [<a href="https://arxiv.org/pdf/2112.09153">pdf</a>, <a href="https://arxiv.org/format/2112.09153">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> An Empirical Investigation of the Role of Pre-training in Lifelong Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mehta%2C+S+V">Sanket Vaibhav Mehta</a>, <a href="/search/cs?searchtype=author&query=Patil%2C+D">Darshan Patil</a>, <a href="/search/cs?searchtype=author&query=Chandar%2C+S">Sarath Chandar</a>, <a href="/search/cs?searchtype=author&query=Strubell%2C+E">Emma Strubell</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2112.09153v2-abstract-short" style="display: inline;"> The lifelong learning paradigm in machine learning is an attractive alternative to the more prominent isolated learning scheme not only due to its resemblance to biological learning but also its potential to reduce energy waste by obviating excessive model re-training. A key challenge to this paradigm is the phenomenon of catastrophic forgetting. With the increasing popularity and success of pre-t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2112.09153v2-abstract-full').style.display = 'inline'; document.getElementById('2112.09153v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2112.09153v2-abstract-full" style="display: none;"> The lifelong learning paradigm in machine learning is an attractive alternative to the more prominent isolated learning scheme not only due to its resemblance to biological learning but also its potential to reduce energy waste by obviating excessive model re-training. A key challenge to this paradigm is the phenomenon of catastrophic forgetting. With the increasing popularity and success of pre-trained models in machine learning, we pose the question: What role does pre-training play in lifelong learning, specifically with respect to catastrophic forgetting? We investigate existing methods in the context of large, pre-trained models and evaluate their performance on a variety of text and image classification tasks, including a large-scale study using a novel data set of 15 diverse NLP tasks. Across all settings, we observe that generic pre-training implicitly alleviates the effects of catastrophic forgetting when learning multiple tasks sequentially compared to randomly initialized models. We then further investigate why pre-training alleviates forgetting in this setting. We study this phenomenon by analyzing the loss landscape, finding that pre-trained weights appear to ease forgetting by leading to wider minima. Based on this insight, we propose jointly optimizing for current task loss and loss basin sharpness to explicitly encourage wider basins during sequential fine-tuning. We show that this optimization approach outperforms several state-of-the-art task-sequential continual learning algorithms across multiple settings, occasionally even without retaining a memory that scales in size with the number of tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2112.09153v2-abstract-full').style.display = 'none'; document.getElementById('2112.09153v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 December, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Journal of Machine Learning Research 24 (2023) 1-50 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2110.08467">arXiv:2110.08467</a> <span> [<a href="https://arxiv.org/pdf/2110.08467">pdf</a>, <a href="https://arxiv.org/format/2110.08467">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Improving Compositional Generalization with Self-Training for Data-to-Text Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mehta%2C+S+V">Sanket Vaibhav Mehta</a>, <a href="/search/cs?searchtype=author&query=Rao%2C+J">Jinfeng Rao</a>, <a href="/search/cs?searchtype=author&query=Tay%2C+Y">Yi Tay</a>, <a href="/search/cs?searchtype=author&query=Kale%2C+M">Mihir Kale</a>, <a href="/search/cs?searchtype=author&query=Parikh%2C+A+P">Ankur P. Parikh</a>, <a href="/search/cs?searchtype=author&query=Strubell%2C+E">Emma Strubell</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2110.08467v2-abstract-short" style="display: inline;"> Data-to-text generation focuses on generating fluent natural language responses from structured meaning representations (MRs). Such representations are compositional and it is costly to collect responses for all possible combinations of atomic meaning schemata, thereby necessitating few-shot generalization to novel MRs. In this work, we systematically study the compositional generalization of the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.08467v2-abstract-full').style.display = 'inline'; document.getElementById('2110.08467v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2110.08467v2-abstract-full" style="display: none;"> Data-to-text generation focuses on generating fluent natural language responses from structured meaning representations (MRs). Such representations are compositional and it is costly to collect responses for all possible combinations of atomic meaning schemata, thereby necessitating few-shot generalization to novel MRs. In this work, we systematically study the compositional generalization of the state-of-the-art T5 models in few-shot data-to-text tasks. We show that T5 models fail to generalize to unseen MRs, and we propose a template-based input representation that considerably improves the model's generalization capability. To further improve the model's performance, we propose an approach based on self-training using fine-tuned BLEURT for pseudo response selection. On the commonly-used SGD and Weather benchmarks, the proposed self-training approach improves tree accuracy by 46%+ and reduces the slot error rates by 73%+ over the strong T5 baselines in few-shot settings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.08467v2-abstract-full').style.display = 'none'; document.getElementById('2110.08467v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 April, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at ACL 2022 main conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2104.09835">arXiv:2104.09835</a> <span> [<a href="https://arxiv.org/pdf/2104.09835">pdf</a>, <a href="https://arxiv.org/format/2104.09835">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> WiFiMod: Transformer-based Indoor Human Mobility Modeling using Passive Sensing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Trivedi%2C+A">Amee Trivedi</a>, <a href="/search/cs?searchtype=author&query=Silverstein%2C+K">Kate Silverstein</a>, <a href="/search/cs?searchtype=author&query=Strubell%2C+E">Emma Strubell</a>, <a href="/search/cs?searchtype=author&query=Iyyer%2C+M">Mohit Iyyer</a>, <a href="/search/cs?searchtype=author&query=Shenoy%2C+P">Prashant Shenoy</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2104.09835v3-abstract-short" style="display: inline;"> Modeling human mobility has a wide range of applications from urban planning to simulations of disease spread. It is well known that humans spend 80% of their time indoors but modeling indoor human mobility is challenging due to three main reasons: (i) the absence of easily acquirable, reliable, low-cost indoor mobility datasets, (ii) high prediction space in modeling the frequent indoor mobility,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.09835v3-abstract-full').style.display = 'inline'; document.getElementById('2104.09835v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2104.09835v3-abstract-full" style="display: none;"> Modeling human mobility has a wide range of applications from urban planning to simulations of disease spread. It is well known that humans spend 80% of their time indoors but modeling indoor human mobility is challenging due to three main reasons: (i) the absence of easily acquirable, reliable, low-cost indoor mobility datasets, (ii) high prediction space in modeling the frequent indoor mobility, and (iii) multi-scalar periodicity and correlations in mobility. To deal with all these challenges, we propose WiFiMod, a Transformer-based, data-driven approach that models indoor human mobility at multiple spatial scales using WiFi system logs. WiFiMod takes as input enterprise WiFi system logs to extract human mobility trajectories from smartphone digital traces. Next, for each extracted trajectory, we identify the mobility features at multiple spatial scales, macro, and micro, to design a multi-modal embedding Transformer that predicts user mobility for several hours to an entire day across multiple spatial granularities. Multi-modal embedding captures the mobility periodicity and correlations across various scales while Transformers capture long-term mobility dependencies boosting model prediction performance. This approach significantly reduces the prediction space by first predicting macro mobility, then modeling indoor scale mobility, micro-mobility, conditioned on the estimated macro mobility distribution, thereby using the topological constraint of the macro-scale. Experimental results show that WiFiMod achieves a prediction accuracy of at least 10% points higher than the current state-of-art models. Additionally, we present 3 real-world applications of WiFiMod - (i) predict high-density hot pockets for policy-making decisions for COVID19 or ILI, (ii) generate a realistic simulation of indoor mobility, (iii) design personal assistants. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.09835v3-abstract-full').style.display = 'none'; document.getElementById('2104.09835v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 July, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 April, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1906.02243">arXiv:1906.02243</a> <span> [<a href="https://arxiv.org/pdf/1906.02243">pdf</a>, <a href="https://arxiv.org/ps/1906.02243">ps</a>, <a href="https://arxiv.org/format/1906.02243">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Energy and Policy Considerations for Deep Learning in NLP </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Strubell%2C+E">Emma Strubell</a>, <a href="/search/cs?searchtype=author&query=Ganesh%2C+A">Ananya Ganesh</a>, <a href="/search/cs?searchtype=author&query=McCallum%2C+A">Andrew McCallum</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1906.02243v1-abstract-short" style="display: inline;"> Recent progress in hardware and methodology for training neural networks has ushered in a new generation of large networks trained on abundant data. These models have obtained notable gains in accuracy across many NLP tasks. However, these accuracy improvements depend on the availability of exceptionally large computational resources that necessitate similarly substantial energy consumption. As a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1906.02243v1-abstract-full').style.display = 'inline'; document.getElementById('1906.02243v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1906.02243v1-abstract-full" style="display: none;"> Recent progress in hardware and methodology for training neural networks has ushered in a new generation of large networks trained on abundant data. These models have obtained notable gains in accuracy across many NLP tasks. However, these accuracy improvements depend on the availability of exceptionally large computational resources that necessitate similarly substantial energy consumption. As a result these models are costly to train and develop, both financially, due to the cost of hardware and electricity or cloud compute time, and environmentally, due to the carbon footprint required to fuel modern tensor processing hardware. In this paper we bring this issue to the attention of NLP researchers by quantifying the approximate financial and environmental costs of training a variety of recently successful neural network models for NLP. Based on these findings, we propose actionable recommendations to reduce costs and improve equity in NLP research and practice. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1906.02243v1-abstract-full').style.display = 'none'; document.getElementById('1906.02243v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 June, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">In the 57th Annual Meeting of the Association for Computational Linguistics (ACL). Florence, Italy. July 2019</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1905.06939">arXiv:1905.06939</a> <span> [<a href="https://arxiv.org/pdf/1905.06939">pdf</a>, <a href="https://arxiv.org/format/1905.06939">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> The Materials Science Procedural Text Corpus: Annotating Materials Synthesis Procedures with Shallow Semantic Structures </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mysore%2C+S">Sheshera Mysore</a>, <a href="/search/cs?searchtype=author&query=Jensen%2C+Z">Zach Jensen</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+E">Edward Kim</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+K">Kevin Huang</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+H">Haw-Shiuan Chang</a>, <a href="/search/cs?searchtype=author&query=Strubell%2C+E">Emma Strubell</a>, <a href="/search/cs?searchtype=author&query=Flanigan%2C+J">Jeffrey Flanigan</a>, <a href="/search/cs?searchtype=author&query=McCallum%2C+A">Andrew McCallum</a>, <a href="/search/cs?searchtype=author&query=Olivetti%2C+E">Elsa Olivetti</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1905.06939v2-abstract-short" style="display: inline;"> Materials science literature contains millions of materials synthesis procedures described in unstructured natural language text. Large-scale analysis of these synthesis procedures would facilitate deeper scientific understanding of materials synthesis and enable automated synthesis planning. Such analysis requires extracting structured representations of synthesis procedures from the raw text as… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1905.06939v2-abstract-full').style.display = 'inline'; document.getElementById('1905.06939v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1905.06939v2-abstract-full" style="display: none;"> Materials science literature contains millions of materials synthesis procedures described in unstructured natural language text. Large-scale analysis of these synthesis procedures would facilitate deeper scientific understanding of materials synthesis and enable automated synthesis planning. Such analysis requires extracting structured representations of synthesis procedures from the raw text as a first step. To facilitate the training and evaluation of synthesis extraction models, we introduce a dataset of 230 synthesis procedures annotated by domain experts with labeled graphs that express the semantics of the synthesis sentences. The nodes in this graph are synthesis operations and their typed arguments, and labeled edges specify relations between the nodes. We describe this new resource in detail and highlight some specific challenges to annotating scientific text with shallow semantic structure. We make the corpus available to the community to promote further research and development of scientific information extraction systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1905.06939v2-abstract-full').style.display = 'none'; document.getElementById('1905.06939v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 July, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 May, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted as a long paper at the Linguistic Annotation Workshop (LAW) at ACL 2019</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1901.00032">arXiv:1901.00032</a> <span> [<a href="https://arxiv.org/pdf/1901.00032">pdf</a>, <a href="https://arxiv.org/format/1901.00032">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Materials Science">cond-mat.mtrl-sci</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Inorganic Materials Synthesis Planning with Literature-Trained Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kim%2C+E">Edward Kim</a>, <a href="/search/cs?searchtype=author&query=Jensen%2C+Z">Zach Jensen</a>, <a href="/search/cs?searchtype=author&query=van+Grootel%2C+A">Alexander van Grootel</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+K">Kevin Huang</a>, <a href="/search/cs?searchtype=author&query=Staib%2C+M">Matthew Staib</a>, <a href="/search/cs?searchtype=author&query=Mysore%2C+S">Sheshera Mysore</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+H">Haw-Shiuan Chang</a>, <a href="/search/cs?searchtype=author&query=Strubell%2C+E">Emma Strubell</a>, <a href="/search/cs?searchtype=author&query=McCallum%2C+A">Andrew McCallum</a>, <a href="/search/cs?searchtype=author&query=Jegelka%2C+S">Stefanie Jegelka</a>, <a href="/search/cs?searchtype=author&query=Olivetti%2C+E">Elsa Olivetti</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1901.00032v2-abstract-short" style="display: inline;"> Leveraging new data sources is a key step in accelerating the pace of materials design and discovery. To complement the strides in synthesis planning driven by historical, experimental, and computed data, we present an automated method for connecting scientific literature to synthesis insights. Starting from natural language text, we apply word embeddings from language models, which are fed into a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1901.00032v2-abstract-full').style.display = 'inline'; document.getElementById('1901.00032v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1901.00032v2-abstract-full" style="display: none;"> Leveraging new data sources is a key step in accelerating the pace of materials design and discovery. To complement the strides in synthesis planning driven by historical, experimental, and computed data, we present an automated method for connecting scientific literature to synthesis insights. Starting from natural language text, we apply word embeddings from language models, which are fed into a named entity recognition model, upon which a conditional variational autoencoder is trained to generate syntheses for arbitrary materials. We show the potential of this technique by predicting precursors for two perovskite materials, using only training data published over a decade prior to their first reported syntheses. We demonstrate that the model learns representations of materials corresponding to synthesis-related properties, and that the model's behavior complements existing thermodynamic knowledge. Finally, we apply the model to perform synthesizability screening for proposed novel perovskite compounds. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1901.00032v2-abstract-full').style.display = 'none'; document.getElementById('1901.00032v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 December, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Added new funding support to the acknowledgments section in this version</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1811.04773">arXiv:1811.04773</a> <span> [<a href="https://arxiv.org/pdf/1811.04773">pdf</a>, <a href="https://arxiv.org/format/1811.04773">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Syntax Helps ELMo Understand Semantics: Is Syntax Still Relevant in a Deep Neural Architecture for SRL? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Strubell%2C+E">Emma Strubell</a>, <a href="/search/cs?searchtype=author&query=McCallum%2C+A">Andrew McCallum</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1811.04773v1-abstract-short" style="display: inline;"> Do unsupervised methods for learning rich, contextualized token representations obviate the need for explicit modeling of linguistic structure in neural network models for semantic role labeling (SRL)? We address this question by incorporating the massively successful ELMo embeddings (Peters et al., 2018) into LISA (Strubell et al., 2018), a strong, linguistically-informed neural network architect… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1811.04773v1-abstract-full').style.display = 'inline'; document.getElementById('1811.04773v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1811.04773v1-abstract-full" style="display: none;"> Do unsupervised methods for learning rich, contextualized token representations obviate the need for explicit modeling of linguistic structure in neural network models for semantic role labeling (SRL)? We address this question by incorporating the massively successful ELMo embeddings (Peters et al., 2018) into LISA (Strubell et al., 2018), a strong, linguistically-informed neural network architecture for SRL. In experiments on the CoNLL-2005 shared task we find that though ELMo out-performs typical word embeddings, beginning to close the gap in F1 between LISA with predicted and gold syntactic parses, syntactically-informed models still out-perform syntax-free models when both use ELMo, especially on out-of-domain data. Our results suggest that linguistic structures are indeed still relevant in this golden age of deep learning for NLP. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1811.04773v1-abstract-full').style.display = 'none'; document.getElementById('1811.04773v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2018. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">In Proceedings of the Workshop on the Relevance of Linguistic Structure in Neural Architectures for NLP, ACL 2018</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1804.08199">arXiv:1804.08199</a> <span> [<a href="https://arxiv.org/pdf/1804.08199">pdf</a>, <a href="https://arxiv.org/format/1804.08199">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Linguistically-Informed Self-Attention for Semantic Role Labeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Strubell%2C+E">Emma Strubell</a>, <a href="/search/cs?searchtype=author&query=Verga%2C+P">Patrick Verga</a>, <a href="/search/cs?searchtype=author&query=Andor%2C+D">Daniel Andor</a>, <a href="/search/cs?searchtype=author&query=Weiss%2C+D">David Weiss</a>, <a href="/search/cs?searchtype=author&query=McCallum%2C+A">Andrew McCallum</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1804.08199v3-abstract-short" style="display: inline;"> Current state-of-the-art semantic role labeling (SRL) uses a deep neural network with no explicit linguistic features. However, prior work has shown that gold syntax trees can dramatically improve SRL decoding, suggesting the possibility of increased accuracy from explicit modeling of syntax. In this work, we present linguistically-informed self-attention (LISA): a neural network model that combin… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1804.08199v3-abstract-full').style.display = 'inline'; document.getElementById('1804.08199v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1804.08199v3-abstract-full" style="display: none;"> Current state-of-the-art semantic role labeling (SRL) uses a deep neural network with no explicit linguistic features. However, prior work has shown that gold syntax trees can dramatically improve SRL decoding, suggesting the possibility of increased accuracy from explicit modeling of syntax. In this work, we present linguistically-informed self-attention (LISA): a neural network model that combines multi-head self-attention with multi-task learning across dependency parsing, part-of-speech tagging, predicate detection and SRL. Unlike previous models which require significant pre-processing to prepare linguistic features, LISA can incorporate syntax using merely raw tokens as input, encoding the sequence only once to simultaneously perform parsing, predicate detection and role labeling for all predicates. Syntax is incorporated by training one attention head to attend to syntactic parents for each token. Moreover, if a high-quality syntactic parse is already available, it can be beneficially injected at test time without re-training our SRL model. In experiments on CoNLL-2005 SRL, LISA achieves new state-of-the-art performance for a model using predicted predicates and standard word embeddings, attaining 2.5 F1 absolute higher than the previous state-of-the-art on newswire and more than 3.5 F1 on out-of-domain data, nearly 10% reduction in error. On ConLL-2012 English SRL we also show an improvement of more than 2.5 F1. LISA also out-performs the state-of-the-art with contextually-encoded (ELMo) word representations, by nearly 1.0 F1 on news and more than 2.0 F1 on out-of-domain text. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1804.08199v3-abstract-full').style.display = 'none'; document.getElementById('1804.08199v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2018; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 April, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2018. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">In Conference on Empirical Methods in Natural Language Processing (EMNLP). Brussels, Belgium. October 2018</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1802.10569">arXiv:1802.10569</a> <span> [<a href="https://arxiv.org/pdf/1802.10569">pdf</a>, <a href="https://arxiv.org/format/1802.10569">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Simultaneously Self-Attending to All Mentions for Full-Abstract Biological Relation Extraction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Verga%2C+P">Patrick Verga</a>, <a href="/search/cs?searchtype=author&query=Strubell%2C+E">Emma Strubell</a>, <a href="/search/cs?searchtype=author&query=McCallum%2C+A">Andrew McCallum</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1802.10569v1-abstract-short" style="display: inline;"> Most work in relation extraction forms a prediction by looking at a short span of text within a single sentence containing a single entity pair mention. This approach often does not consider interactions across mentions, requires redundant computation for each mention pair, and ignores relationships expressed across sentence boundaries. These problems are exacerbated by the document- (rather than… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1802.10569v1-abstract-full').style.display = 'inline'; document.getElementById('1802.10569v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1802.10569v1-abstract-full" style="display: none;"> Most work in relation extraction forms a prediction by looking at a short span of text within a single sentence containing a single entity pair mention. This approach often does not consider interactions across mentions, requires redundant computation for each mention pair, and ignores relationships expressed across sentence boundaries. These problems are exacerbated by the document- (rather than sentence-) level annotation common in biological text. In response, we propose a model which simultaneously predicts relationships between all mention pairs in a document. We form pairwise predictions over entire paper abstracts using an efficient self-attention encoder. All-pairs mention scores allow us to perform multi-instance learning by aggregating over mentions to form entity pair representations. We further adapt to settings without mention-level annotation by jointly training to predict named entities and adding a corpus of weakly labeled data. In experiments on two Biocreative benchmark datasets, we achieve state of the art performance on the Biocreative V Chemical Disease Relation dataset for models without external KB resources. We also introduce a new dataset an order of magnitude larger than existing human-annotated biological information extraction datasets and more accurate than distantly supervised alternatives. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1802.10569v1-abstract-full').style.display = 'none'; document.getElementById('1802.10569v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 February, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2018. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NAACL 2018</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1711.06872">arXiv:1711.06872</a> <span> [<a href="https://arxiv.org/pdf/1711.06872">pdf</a>, <a href="https://arxiv.org/format/1711.06872">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Automatically Extracting Action Graphs from Materials Science Synthesis Procedures </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mysore%2C+S">Sheshera Mysore</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+E">Edward Kim</a>, <a href="/search/cs?searchtype=author&query=Strubell%2C+E">Emma Strubell</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+A">Ao Liu</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+H">Haw-Shiuan Chang</a>, <a href="/search/cs?searchtype=author&query=Kompella%2C+S">Srikrishna Kompella</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+K">Kevin Huang</a>, <a href="/search/cs?searchtype=author&query=McCallum%2C+A">Andrew McCallum</a>, <a href="/search/cs?searchtype=author&query=Olivetti%2C+E">Elsa Olivetti</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1711.06872v2-abstract-short" style="display: inline;"> Computational synthesis planning approaches have achieved recent success in organic chemistry, where tabulated synthesis procedures are readily available for supervised learning. The syntheses of inorganic materials, however, exist primarily as natural language narratives contained within scientific journal articles. This synthesis information must first be extracted from the text in order to enab… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1711.06872v2-abstract-full').style.display = 'inline'; document.getElementById('1711.06872v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1711.06872v2-abstract-full" style="display: none;"> Computational synthesis planning approaches have achieved recent success in organic chemistry, where tabulated synthesis procedures are readily available for supervised learning. The syntheses of inorganic materials, however, exist primarily as natural language narratives contained within scientific journal articles. This synthesis information must first be extracted from the text in order to enable analogous synthesis planning methods for inorganic materials. In this work, we present a system for automatically extracting structured representations of synthesis procedures from the texts of materials science journal articles that describe explicit, experimental syntheses of inorganic compounds. We define the structured representation as a set of linked events made up of extracted scientific entities and evaluate two unsupervised approaches for extracting these structures on expert-annotated articles: a strong heuristic baseline and a generative model of procedural text. We also evaluate a variety of supervised models for extracting scientific entities. Our results provide insight into the nature of the data and directions for further work in this exciting new area of research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1711.06872v2-abstract-full').style.display = 'none'; document.getElementById('1711.06872v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 November, 2017; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 November, 2017; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2017. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NIPS Workshop on Machine Learning for Molecules and Materials</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1710.08312">arXiv:1710.08312</a> <span> [<a href="https://arxiv.org/pdf/1710.08312">pdf</a>, <a href="https://arxiv.org/format/1710.08312">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Attending to All Mention Pairs for Full Abstract Biological Relation Extraction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Verga%2C+P">Patrick Verga</a>, <a href="/search/cs?searchtype=author&query=Strubell%2C+E">Emma Strubell</a>, <a href="/search/cs?searchtype=author&query=Shai%2C+O">Ofer Shai</a>, <a href="/search/cs?searchtype=author&query=McCallum%2C+A">Andrew McCallum</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1710.08312v2-abstract-short" style="display: inline;"> Most work in relation extraction forms a prediction by looking at a short span of text within a single sentence containing a single entity pair mention. However, many relation types, particularly in biomedical text, are expressed across sentences or require a large context to disambiguate. We propose a model to consider all mention and entity pairs simultaneously in order to make a prediction. We… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1710.08312v2-abstract-full').style.display = 'inline'; document.getElementById('1710.08312v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1710.08312v2-abstract-full" style="display: none;"> Most work in relation extraction forms a prediction by looking at a short span of text within a single sentence containing a single entity pair mention. However, many relation types, particularly in biomedical text, are expressed across sentences or require a large context to disambiguate. We propose a model to consider all mention and entity pairs simultaneously in order to make a prediction. We encode full paper abstracts using an efficient self-attention encoder and form pairwise predictions between all mentions with a bi-affine operation. An entity-pair wise pooling aggregates mention pair scores to make a final prediction while alleviating training noise by performing within document multi-instance learning. We improve our model's performance by jointly training the model to predict named entities and adding an additional corpus of weakly labeled data. We demonstrate our model's effectiveness by achieving the state of the art on the Biocreative V Chemical Disease Relation dataset for models without KB resources, outperforming ensembles of models which use hand-crafted features and additional linguistic resources. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1710.08312v2-abstract-full').style.display = 'none'; document.getElementById('1710.08312v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2017; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 October, 2017; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2017. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6th Workshop on Automated Knowledge Base Construction (AKBC)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1705.00403">arXiv:1705.00403</a> <span> [<a href="https://arxiv.org/pdf/1705.00403">pdf</a>, <a href="https://arxiv.org/format/1705.00403">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Dependency Parsing with Dilated Iterated Graph CNNs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Strubell%2C+E">Emma Strubell</a>, <a href="/search/cs?searchtype=author&query=McCallum%2C+A">Andrew McCallum</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1705.00403v2-abstract-short" style="display: inline;"> Dependency parses are an effective way to inject linguistic knowledge into many downstream tasks, and many practitioners wish to efficiently parse sentences at scale. Recent advances in GPU hardware have enabled neural networks to achieve significant gains over the previous best models, these models still fail to leverage GPUs' capability for massive parallelism due to their requirement of sequent… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1705.00403v2-abstract-full').style.display = 'inline'; document.getElementById('1705.00403v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1705.00403v2-abstract-full" style="display: none;"> Dependency parses are an effective way to inject linguistic knowledge into many downstream tasks, and many practitioners wish to efficiently parse sentences at scale. Recent advances in GPU hardware have enabled neural networks to achieve significant gains over the previous best models, these models still fail to leverage GPUs' capability for massive parallelism due to their requirement of sequential processing of the sentence. In response, we propose Dilated Iterated Graph Convolutional Neural Networks (DIG-CNNs) for graph-based dependency parsing, a graph convolutional architecture that allows for efficient end-to-end GPU parsing. In experiments on the English Penn TreeBank benchmark, we show that DIG-CNNs perform on par with some of the best neural network parsers. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1705.00403v2-abstract-full').style.display = 'none'; document.getElementById('1705.00403v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 July, 2017; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 April, 2017; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2017. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">2nd Workshop on Structured Prediction for Natural Language Processing (at EMNLP '17)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1702.02098">arXiv:1702.02098</a> <span> [<a href="https://arxiv.org/pdf/1702.02098">pdf</a>, <a href="https://arxiv.org/format/1702.02098">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Fast and Accurate Entity Recognition with Iterated Dilated Convolutions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Strubell%2C+E">Emma Strubell</a>, <a href="/search/cs?searchtype=author&query=Verga%2C+P">Patrick Verga</a>, <a href="/search/cs?searchtype=author&query=Belanger%2C+D">David Belanger</a>, <a href="/search/cs?searchtype=author&query=McCallum%2C+A">Andrew McCallum</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1702.02098v3-abstract-short" style="display: inline;"> Today when many practitioners run basic NLP on the entire web and large-volume traffic, faster methods are paramount to saving time and energy costs. Recent advances in GPU hardware have led to the emergence of bi-directional LSTMs as a standard method for obtaining per-token vector representations serving as input to labeling tasks such as NER (often followed by prediction in a linear-chain CRF).… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1702.02098v3-abstract-full').style.display = 'inline'; document.getElementById('1702.02098v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1702.02098v3-abstract-full" style="display: none;"> Today when many practitioners run basic NLP on the entire web and large-volume traffic, faster methods are paramount to saving time and energy costs. Recent advances in GPU hardware have led to the emergence of bi-directional LSTMs as a standard method for obtaining per-token vector representations serving as input to labeling tasks such as NER (often followed by prediction in a linear-chain CRF). Though expressive and accurate, these models fail to fully exploit GPU parallelism, limiting their computational efficiency. This paper proposes a faster alternative to Bi-LSTMs for NER: Iterated Dilated Convolutional Neural Networks (ID-CNNs), which have better capacity than traditional CNNs for large context and structured prediction. Unlike LSTMs whose sequential processing on sentences of length N requires O(N) time even in the face of parallelism, ID-CNNs permit fixed-depth convolutions to run in parallel across entire documents. We describe a distinct combination of network structure, parameter sharing and training procedures that enable dramatic 14-20x test-time speedups while retaining accuracy comparable to the Bi-LSTM-CRF. Moreover, ID-CNNs trained to aggregate context from the entire document are even more accurate while maintaining 8x faster test time speeds. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1702.02098v3-abstract-full').style.display = 'none'; document.getElementById('1702.02098v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 July, 2017; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 February, 2017; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2017. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">In Conference on Empirical Methods in Natural Language Processing (EMNLP). Copenhagen, Denmark. September 2017</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1511.06396">arXiv:1511.06396</a> <span> [<a href="https://arxiv.org/pdf/1511.06396">pdf</a>, <a href="https://arxiv.org/format/1511.06396">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Multilingual Relation Extraction using Compositional Universal Schema </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Verga%2C+P">Patrick Verga</a>, <a href="/search/cs?searchtype=author&query=Belanger%2C+D">David Belanger</a>, <a href="/search/cs?searchtype=author&query=Strubell%2C+E">Emma Strubell</a>, <a href="/search/cs?searchtype=author&query=Roth%2C+B">Benjamin Roth</a>, <a href="/search/cs?searchtype=author&query=McCallum%2C+A">Andrew McCallum</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1511.06396v2-abstract-short" style="display: inline;"> Universal schema builds a knowledge base (KB) of entities and relations by jointly embedding all relation types from input KBs as well as textual patterns expressing relations from raw text. In most previous applications of universal schema, each textual pattern is represented as a single embedding, preventing generalization to unseen patterns. Recent work employs a neural network to capture patte… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1511.06396v2-abstract-full').style.display = 'inline'; document.getElementById('1511.06396v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1511.06396v2-abstract-full" style="display: none;"> Universal schema builds a knowledge base (KB) of entities and relations by jointly embedding all relation types from input KBs as well as textual patterns expressing relations from raw text. In most previous applications of universal schema, each textual pattern is represented as a single embedding, preventing generalization to unseen patterns. Recent work employs a neural network to capture patterns' compositional semantics, providing generalization to all possible input text. In response, this paper introduces significant further improvements to the coverage and flexibility of universal schema relation extraction: predictions for entities unseen in training and multilingual transfer learning to domains with no annotation. We evaluate our model through extensive experiments on the English and Spanish TAC KBP benchmark, outperforming the top system from TAC 2013 slot-filling using no handwritten patterns or additional annotation. We also consider a multilingual setting in which English training data entities overlap with the seed KB, but Spanish text does not. Despite having no annotation for Spanish data, we train an accurate predictor, with additional improvements obtained by tying word embeddings across languages. Furthermore, we find that multilingual training improves English relation extraction accuracy. Our approach is thus suited to broad-coverage automated knowledge base construction in a variety of languages and domains. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1511.06396v2-abstract-full').style.display = 'none'; document.getElementById('1511.06396v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 March, 2016; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 November, 2015; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2015. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to NAACL 2016</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Strubell%2C+E&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Strubell%2C+E&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Strubell%2C+E&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository