CINXE.COM

Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;14 of 14 results for author: <span class="mathjax">Hooper, C</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Hooper%2C+C">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Hooper, C"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Hooper%2C+C&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Hooper, C"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09688">arXiv:2411.09688</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.09688">pdf</a>, <a href="https://arxiv.org/format/2411.09688">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Squeezed Attention: Accelerating Long Context Length LLM Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hooper%2C+C">Coleman Hooper</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+S">Sehoon Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Mohammadzadeh%2C+H">Hiva Mohammadzadeh</a>, <a href="/search/cs?searchtype=author&amp;query=Maheswaran%2C+M">Monishwaran Maheswaran</a>, <a href="/search/cs?searchtype=author&amp;query=Paik%2C+J">June Paik</a>, <a href="/search/cs?searchtype=author&amp;query=Mahoney%2C+M+W">Michael W. Mahoney</a>, <a href="/search/cs?searchtype=author&amp;query=Keutzer%2C+K">Kurt Keutzer</a>, <a href="/search/cs?searchtype=author&amp;query=Gholami%2C+A">Amir Gholami</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09688v1-abstract-short" style="display: inline;"> Emerging Large Language Model (LLM) applications require long input prompts to perform complex downstream tasks like document analysis and code generation. For these long context length applications, the length of the input prompt poses a significant challenge in terms of inference efficiency since the inference costs increase linearly with sequence length. However, for many of these applications,&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09688v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09688v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09688v1-abstract-full" style="display: none;"> Emerging Large Language Model (LLM) applications require long input prompts to perform complex downstream tasks like document analysis and code generation. For these long context length applications, the length of the input prompt poses a significant challenge in terms of inference efficiency since the inference costs increase linearly with sequence length. However, for many of these applications, much of the context in the prompt is fixed across different user inputs, thereby providing the opportunity to perform offline optimizations to process user inputs quickly, as they are received. In this work, we propose Squeezed Attention as a mechanism to accelerate LLM applications where a large portion of the input prompt is fixed. We first leverage K-means clustering offline to group the keys for the fixed context based on semantic similarity and represent each cluster with a single centroid value. During inference, we compare query tokens from the user input with the centroids to predict which of the keys from the fixed context are semantically relevant and need to be loaded during inference. We then compute exact attention using only these important keys from the fixed context, thereby reducing bandwidth and computational costs. We also extend our method to use a hierarchical centroid lookup to identify important keys, which can reduce the complexity of attention from linear to logarithmic with respect to the context length. We implement optimized Triton kernels for centroid comparison and sparse FlashAttention with important keys, achieving more than 4x speedups during both the prefill and generation phases for long-context inference. Furthermore, we have extensively evaluated our method on various long-context benchmarks including LongBench, where it achieves a 3x reduction in KV cache budget without accuracy loss and up to an 8x reduction with &lt;0.5 point accuracy gap for various models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09688v1-abstract-full').style.display = 'none'; document.getElementById('2411.09688v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.00608">arXiv:2409.00608</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.00608">pdf</a>, <a href="https://arxiv.org/format/2409.00608">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> TinyAgent: Function Calling at the Edge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Erdogan%2C+L+E">Lutfi Eren Erdogan</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+N">Nicholas Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Jha%2C+S">Siddharth Jha</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+S">Sehoon Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Tabrizi%2C+R">Ryan Tabrizi</a>, <a href="/search/cs?searchtype=author&amp;query=Moon%2C+S">Suhong Moon</a>, <a href="/search/cs?searchtype=author&amp;query=Hooper%2C+C">Coleman Hooper</a>, <a href="/search/cs?searchtype=author&amp;query=Anumanchipalli%2C+G">Gopala Anumanchipalli</a>, <a href="/search/cs?searchtype=author&amp;query=Keutzer%2C+K">Kurt Keutzer</a>, <a href="/search/cs?searchtype=author&amp;query=Gholami%2C+A">Amir Gholami</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.00608v3-abstract-short" style="display: inline;"> Recent large language models (LLMs) have enabled the development of advanced agentic systems that can integrate various tools and APIs to fulfill user queries through function calling. However, the deployment of these LLMs on the edge has not been explored since they typically require cloud-based infrastructure due to their substantial model size and computational demands. To this end, we present&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.00608v3-abstract-full').style.display = 'inline'; document.getElementById('2409.00608v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.00608v3-abstract-full" style="display: none;"> Recent large language models (LLMs) have enabled the development of advanced agentic systems that can integrate various tools and APIs to fulfill user queries through function calling. However, the deployment of these LLMs on the edge has not been explored since they typically require cloud-based infrastructure due to their substantial model size and computational demands. To this end, we present TinyAgent, an end-to-end framework for training and deploying task-specific small language model agents capable of function calling for driving agentic systems at the edge. We first show how to enable accurate function calling for open-source models via the LLMCompiler framework. We then systematically curate a high-quality dataset for function calling, which we use to fine-tune two small language models, TinyAgent-1.1B and 7B. For efficient inference, we introduce a novel tool retrieval method to reduce the input prompt length and utilize quantization to further accelerate the inference speed. As a driving application, we demonstrate a local Siri-like system for Apple&#39;s MacBook that can execute user commands through text or voice input. Our results show that our models can achieve, and even surpass, the function-calling capabilities of larger models like GPT-4-Turbo, while being fully deployed at the edge. We open-source our dataset, models, and installable package and provide a demo video for our MacBook assistant agent. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.00608v3-abstract-full').style.display = 'none'; document.getElementById('2409.00608v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">EMNLP 2024 Demo</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.13463">arXiv:2407.13463</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.13463">pdf</a>, <a href="https://arxiv.org/format/2407.13463">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> End-To-End Clinical Trial Matching with Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ferber%2C+D">Dyke Ferber</a>, <a href="/search/cs?searchtype=author&amp;query=Hilgers%2C+L">Lars Hilgers</a>, <a href="/search/cs?searchtype=author&amp;query=Wiest%2C+I+C">Isabella C. Wiest</a>, <a href="/search/cs?searchtype=author&amp;query=Le%C3%9Fmann%2C+M">Marie-Elisabeth Le脽mann</a>, <a href="/search/cs?searchtype=author&amp;query=Clusmann%2C+J">Jan Clusmann</a>, <a href="/search/cs?searchtype=author&amp;query=Neidlinger%2C+P">Peter Neidlinger</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+J">Jiefu Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=W%C3%B6lflein%2C+G">Georg W枚lflein</a>, <a href="/search/cs?searchtype=author&amp;query=Lammert%2C+J">Jacqueline Lammert</a>, <a href="/search/cs?searchtype=author&amp;query=Tschochohei%2C+M">Maximilian Tschochohei</a>, <a href="/search/cs?searchtype=author&amp;query=B%C3%B6hme%2C+H">Heiko B枚hme</a>, <a href="/search/cs?searchtype=author&amp;query=J%C3%A4ger%2C+D">Dirk J盲ger</a>, <a href="/search/cs?searchtype=author&amp;query=Aldea%2C+M">Mihaela Aldea</a>, <a href="/search/cs?searchtype=author&amp;query=Truhn%2C+D">Daniel Truhn</a>, <a href="/search/cs?searchtype=author&amp;query=H%C3%B6per%2C+C">Christiane H枚per</a>, <a href="/search/cs?searchtype=author&amp;query=Kather%2C+J+N">Jakob Nikolas Kather</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.13463v1-abstract-short" style="display: inline;"> Matching cancer patients to clinical trials is essential for advancing treatment and patient care. However, the inconsistent format of medical free text documents and complex trial eligibility criteria make this process extremely challenging and time-consuming for physicians. We investigated whether the entire trial matching process - from identifying relevant trials among 105,600 oncology-related&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.13463v1-abstract-full').style.display = 'inline'; document.getElementById('2407.13463v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.13463v1-abstract-full" style="display: none;"> Matching cancer patients to clinical trials is essential for advancing treatment and patient care. However, the inconsistent format of medical free text documents and complex trial eligibility criteria make this process extremely challenging and time-consuming for physicians. We investigated whether the entire trial matching process - from identifying relevant trials among 105,600 oncology-related clinical trials on clinicaltrials.gov to generating criterion-level eligibility matches - could be automated using Large Language Models (LLMs). Using GPT-4o and a set of 51 synthetic Electronic Health Records (EHRs), we demonstrate that our approach identifies relevant candidate trials in 93.3% of cases and achieves a preliminary accuracy of 88.0% when matching patient-level information at the criterion level against a baseline defined by human experts. Utilizing LLM feedback reveals that 39.3% criteria that were initially considered incorrect are either ambiguous or inaccurately annotated, leading to a total model accuracy of 92.7% after refining our human baseline. In summary, we present an end-to-end pipeline for clinical trial matching using LLMs, demonstrating high precision in screening and matching trials to individual patients, even outperforming the performance of qualified medical doctors. Our fully end-to-end pipeline can operate autonomously or with human supervision and is not restricted to oncology, offering a scalable solution for enhancing patient-trial matching in real-world settings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.13463v1-abstract-full').style.display = 'none'; document.getElementById('2407.13463v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">149 pages, including Supplements. 3 Main Figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.14123">arXiv:2403.14123</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.14123">pdf</a>, <a href="https://arxiv.org/format/2403.14123">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> AI and Memory Wall </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gholami%2C+A">Amir Gholami</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+Z">Zhewei Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+S">Sehoon Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Hooper%2C+C">Coleman Hooper</a>, <a href="/search/cs?searchtype=author&amp;query=Mahoney%2C+M+W">Michael W. Mahoney</a>, <a href="/search/cs?searchtype=author&amp;query=Keutzer%2C+K">Kurt Keutzer</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.14123v1-abstract-short" style="display: inline;"> The availability of unprecedented unsupervised training data, along with neural scaling laws, has resulted in an unprecedented surge in model size and compute requirements for serving/training LLMs. However, the main performance bottleneck is increasingly shifting to memory bandwidth. Over the past 20 years, peak server hardware FLOPS has been scaling at 3.0x/2yrs, outpacing the growth of DRAM and&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.14123v1-abstract-full').style.display = 'inline'; document.getElementById('2403.14123v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.14123v1-abstract-full" style="display: none;"> The availability of unprecedented unsupervised training data, along with neural scaling laws, has resulted in an unprecedented surge in model size and compute requirements for serving/training LLMs. However, the main performance bottleneck is increasingly shifting to memory bandwidth. Over the past 20 years, peak server hardware FLOPS has been scaling at 3.0x/2yrs, outpacing the growth of DRAM and interconnect bandwidth, which have only scaled at 1.6 and 1.4 times every 2 years, respectively. This disparity has made memory, rather than compute, the primary bottleneck in AI applications, particularly in serving. Here, we analyze encoder and decoder Transformer models and show how memory bandwidth can become the dominant bottleneck for decoder models. We argue for a redesign in model architecture, training, and deployment strategies to overcome this memory limitation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.14123v1-abstract-full').style.display = 'none'; document.getElementById('2403.14123v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published in IEEE Micro Journal</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.18079">arXiv:2401.18079</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2401.18079">pdf</a>, <a href="https://arxiv.org/format/2401.18079">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> KVQuant: Towards 10 Million Context Length LLM Inference with KV Cache Quantization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hooper%2C+C">Coleman Hooper</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+S">Sehoon Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Mohammadzadeh%2C+H">Hiva Mohammadzadeh</a>, <a href="/search/cs?searchtype=author&amp;query=Mahoney%2C+M+W">Michael W. Mahoney</a>, <a href="/search/cs?searchtype=author&amp;query=Shao%2C+Y+S">Yakun Sophia Shao</a>, <a href="/search/cs?searchtype=author&amp;query=Keutzer%2C+K">Kurt Keutzer</a>, <a href="/search/cs?searchtype=author&amp;query=Gholami%2C+A">Amir Gholami</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.18079v5-abstract-short" style="display: inline;"> LLMs are seeing growing use for applications which require large context windows, and with these large context windows KV cache activations surface as the dominant contributor to memory consumption during inference. Quantization is a promising approach for compressing KV cache activations; however, existing solutions fail to represent activations accurately in sub-4-bit precision. Our work, KVQuan&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.18079v5-abstract-full').style.display = 'inline'; document.getElementById('2401.18079v5-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.18079v5-abstract-full" style="display: none;"> LLMs are seeing growing use for applications which require large context windows, and with these large context windows KV cache activations surface as the dominant contributor to memory consumption during inference. Quantization is a promising approach for compressing KV cache activations; however, existing solutions fail to represent activations accurately in sub-4-bit precision. Our work, KVQuant, facilitates low precision KV cache quantization by incorporating several novel methods: (i) Per-Channel Key Quantization, where we adjust the dimension along which we quantize the Key activations to better match the distribution; (ii) Pre-RoPE Key Quantization, where we quantize Key activations before the rotary positional embedding to mitigate its impact on quantization; (iii) Non-Uniform KV Cache Quantization, where we derive per-layer sensitivity-weighted non-uniform datatypes that better represent the distributions; and (iv) Per-Vector Dense-and-Sparse Quantization, where we isolate outliers separately for each vector to minimize skews in quantization ranges. By applying our method to the LLaMA, Llama-2, Llama-3, and Mistral models, we achieve &lt; 0.1 perplexity degradation with 3-bit quantization on both Wikitext-2 and C4, outperforming existing approaches. Our method enables serving LLaMA-7B with a context length of up to 1 million on a single A100-80GB GPU and up to 10 million on an 8-GPU system. We develop custom CUDA kernels for KVQuant, showing that we can achieve up to ~1.7x speedups, compared to baseline fp16 matrix-vector multiplications, for the LLaMA-7B model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.18079v5-abstract-full').style.display = 'none'; document.getElementById('2401.18079v5-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.07886">arXiv:2401.07886</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2401.07886">pdf</a>, <a href="https://arxiv.org/format/2401.07886">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> Learned Best-Effort LLM Serving </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jha%2C+S">Siddharth Jha</a>, <a href="/search/cs?searchtype=author&amp;query=Hooper%2C+C">Coleman Hooper</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xiaoxuan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+S">Sehoon Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Keutzer%2C+K">Kurt Keutzer</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.07886v2-abstract-short" style="display: inline;"> Many applications must provide low-latency LLM service to users or risk unacceptable user experience. However, over-provisioning resources to serve fluctuating request patterns is often prohibitively expensive. In this work, we present a best-effort serving system that employs deep reinforcement learning to adjust service quality based on the task distribution and system load. Our best-effort syst&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.07886v2-abstract-full').style.display = 'inline'; document.getElementById('2401.07886v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.07886v2-abstract-full" style="display: none;"> Many applications must provide low-latency LLM service to users or risk unacceptable user experience. However, over-provisioning resources to serve fluctuating request patterns is often prohibitively expensive. In this work, we present a best-effort serving system that employs deep reinforcement learning to adjust service quality based on the task distribution and system load. Our best-effort system can maintain availability with over 10x higher client request rates, serves above 96% of peak performance 4.1x more often, and serves above 98% of peak performance 2.3x more often than static serving on unpredictable workloads. Our learned router is robust to shifts in both the arrival and task distribution. Compared to static serving, learned best-effort serving allows for cost-efficient serving through increased hardware utility. Additionally, we argue that learned best-effort LLM serving is applicable in wide variety of settings and provides application developers great flexibility to meet their specific needs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.07886v2-abstract-full').style.display = 'none'; document.getElementById('2401.07886v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Es-FoMo @ ICML 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.03285">arXiv:2311.03285</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2311.03285">pdf</a>, <a href="https://arxiv.org/format/2311.03285">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> S-LoRA: Serving Thousands of Concurrent LoRA Adapters </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sheng%2C+Y">Ying Sheng</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+S">Shiyi Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+D">Dacheng Li</a>, <a href="/search/cs?searchtype=author&amp;query=Hooper%2C+C">Coleman Hooper</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+N">Nicholas Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Shuo Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Chou%2C+C">Christopher Chou</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+B">Banghua Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+L">Lianmin Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Keutzer%2C+K">Kurt Keutzer</a>, <a href="/search/cs?searchtype=author&amp;query=Gonzalez%2C+J+E">Joseph E. Gonzalez</a>, <a href="/search/cs?searchtype=author&amp;query=Stoica%2C+I">Ion Stoica</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.03285v3-abstract-short" style="display: inline;"> The &#34;pretrain-then-finetune&#34; paradigm is commonly adopted in the deployment of large language models. Low-Rank Adaptation (LoRA), a parameter-efficient fine-tuning method, is often employed to adapt a base model to a multitude of tasks, resulting in a substantial collection of LoRA adapters derived from one base model. We observe that this paradigm presents significant opportunities for batched in&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.03285v3-abstract-full').style.display = 'inline'; document.getElementById('2311.03285v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.03285v3-abstract-full" style="display: none;"> The &#34;pretrain-then-finetune&#34; paradigm is commonly adopted in the deployment of large language models. Low-Rank Adaptation (LoRA), a parameter-efficient fine-tuning method, is often employed to adapt a base model to a multitude of tasks, resulting in a substantial collection of LoRA adapters derived from one base model. We observe that this paradigm presents significant opportunities for batched inference during serving. To capitalize on these opportunities, we present S-LoRA, a system designed for the scalable serving of many LoRA adapters. S-LoRA stores all adapters in the main memory and fetches the adapters used by the currently running queries to the GPU memory. To efficiently use the GPU memory and reduce fragmentation, S-LoRA proposes Unified Paging. Unified Paging uses a unified memory pool to manage dynamic adapter weights with different ranks and KV cache tensors with varying sequence lengths. Additionally, S-LoRA employs a novel tensor parallelism strategy and highly optimized custom CUDA kernels for heterogeneous batching of LoRA computation. Collectively, these features enable S-LoRA to serve thousands of LoRA adapters on a single GPU or across multiple GPUs with a small overhead. Compared to state-of-the-art libraries such as HuggingFace PEFT and vLLM (with naive support of LoRA serving), S-LoRA can improve the throughput by up to 4 times and increase the number of served adapters by several orders of magnitude. As a result, S-LoRA enables scalable serving of many task-specific fine-tuned models and offers the potential for large-scale customized fine-tuning services. The code is available at https://github.com/S-LoRA/S-LoRA <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.03285v3-abstract-full').style.display = 'none'; document.getElementById('2311.03285v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.12371">arXiv:2310.12371</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2310.12371">pdf</a>, <a href="https://arxiv.org/format/2310.12371">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Property-Aware Multi-Speaker Data Simulation: A Probabilistic Modelling Technique for Synthetic Data Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Park%2C+T+J">Tae Jin Park</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+H">He Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Hooper%2C+C">Coleman Hooper</a>, <a href="/search/cs?searchtype=author&amp;query=Koluguri%2C+N">Nithin Koluguri</a>, <a href="/search/cs?searchtype=author&amp;query=Dhawan%2C+K">Kunal Dhawan</a>, <a href="/search/cs?searchtype=author&amp;query=Jukic%2C+A">Ante Jukic</a>, <a href="/search/cs?searchtype=author&amp;query=Balam%2C+J">Jagadeesh Balam</a>, <a href="/search/cs?searchtype=author&amp;query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.12371v1-abstract-short" style="display: inline;"> We introduce a sophisticated multi-speaker speech data simulator, specifically engineered to generate multi-speaker speech recordings. A notable feature of this simulator is its capacity to modulate the distribution of silence and overlap via the adjustment of statistical parameters. This capability offers a tailored training environment for developing neural models suited for speaker diarization&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.12371v1-abstract-full').style.display = 'inline'; document.getElementById('2310.12371v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.12371v1-abstract-full" style="display: none;"> We introduce a sophisticated multi-speaker speech data simulator, specifically engineered to generate multi-speaker speech recordings. A notable feature of this simulator is its capacity to modulate the distribution of silence and overlap via the adjustment of statistical parameters. This capability offers a tailored training environment for developing neural models suited for speaker diarization and voice activity detection. The acquisition of substantial datasets for speaker diarization often presents a significant challenge, particularly in multi-speaker scenarios. Furthermore, the precise time stamp annotation of speech data is a critical factor for training both speaker diarization and voice activity detection. Our proposed multi-speaker simulator tackles these problems by generating large-scale audio mixtures that maintain statistical properties closely aligned with the input parameters. We demonstrate that the proposed multi-speaker simulator generates audio mixtures with statistical properties that closely align with the input parameters derived from real-world statistics. Additionally, we present the effectiveness of speaker diarization and voice activity detection models, which have been trained exclusively on the generated simulated datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.12371v1-abstract-full').style.display = 'none'; document.getElementById('2310.12371v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> CHiME-7 Workshop 2023 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.12072">arXiv:2310.12072</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2310.12072">pdf</a>, <a href="https://arxiv.org/format/2310.12072">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> SPEED: Speculative Pipelined Execution for Efficient Decoding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hooper%2C+C">Coleman Hooper</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+S">Sehoon Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Mohammadzadeh%2C+H">Hiva Mohammadzadeh</a>, <a href="/search/cs?searchtype=author&amp;query=Genc%2C+H">Hasan Genc</a>, <a href="/search/cs?searchtype=author&amp;query=Keutzer%2C+K">Kurt Keutzer</a>, <a href="/search/cs?searchtype=author&amp;query=Gholami%2C+A">Amir Gholami</a>, <a href="/search/cs?searchtype=author&amp;query=Shao%2C+S">Sophia Shao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.12072v2-abstract-short" style="display: inline;"> Generative Large Language Models (LLMs) based on the Transformer architecture have recently emerged as a dominant foundation model for a wide range of Natural Language Processing tasks. Nevertheless, their application in real-time scenarios has been highly restricted due to the significant inference latency associated with these models. This is particularly pronounced due to the autoregressive nat&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.12072v2-abstract-full').style.display = 'inline'; document.getElementById('2310.12072v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.12072v2-abstract-full" style="display: none;"> Generative Large Language Models (LLMs) based on the Transformer architecture have recently emerged as a dominant foundation model for a wide range of Natural Language Processing tasks. Nevertheless, their application in real-time scenarios has been highly restricted due to the significant inference latency associated with these models. This is particularly pronounced due to the autoregressive nature of generative LLM inference, where tokens are generated sequentially since each token depends on all previous output tokens. It is therefore challenging to achieve any token-level parallelism, making inference extremely memory-bound. In this work, we propose SPEED, which improves inference efficiency by speculatively executing multiple future tokens in parallel with the current token using predicted values based on early-layer hidden states. For Transformer decoders that employ parameter sharing, the memory operations for the tokens executing in parallel can be amortized, which allows us to accelerate generative LLM inference. We demonstrate the efficiency of our method in terms of latency reduction relative to model accuracy and demonstrate how speculation allows for training deeper decoders with parameter sharing with minimal runtime overhead. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.12072v2-abstract-full').style.display = 'none'; document.getElementById('2310.12072v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS Workshop on Efficient Natural Language and Speech Processing (2023)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.07629">arXiv:2306.07629</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2306.07629">pdf</a>, <a href="https://arxiv.org/format/2306.07629">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> SqueezeLLM: Dense-and-Sparse Quantization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kim%2C+S">Sehoon Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Hooper%2C+C">Coleman Hooper</a>, <a href="/search/cs?searchtype=author&amp;query=Gholami%2C+A">Amir Gholami</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Z">Zhen Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xiuyu Li</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+S">Sheng Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Mahoney%2C+M+W">Michael W. Mahoney</a>, <a href="/search/cs?searchtype=author&amp;query=Keutzer%2C+K">Kurt Keutzer</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.07629v4-abstract-short" style="display: inline;"> Generative Large Language Models (LLMs) have demonstrated remarkable results for a wide range of tasks. However, deploying these models for inference has been a significant challenge due to their unprecedented resource requirements. This has forced existing deployment frameworks to use multi-GPU inference pipelines, which are often complex and costly, or to use smaller and less performant models.&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.07629v4-abstract-full').style.display = 'inline'; document.getElementById('2306.07629v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.07629v4-abstract-full" style="display: none;"> Generative Large Language Models (LLMs) have demonstrated remarkable results for a wide range of tasks. However, deploying these models for inference has been a significant challenge due to their unprecedented resource requirements. This has forced existing deployment frameworks to use multi-GPU inference pipelines, which are often complex and costly, or to use smaller and less performant models. In this work, we demonstrate that the main bottleneck for generative inference with LLMs is memory bandwidth, rather than compute, specifically for single batch inference. While quantization has emerged as a promising solution by representing weights with reduced precision, previous efforts have often resulted in notable performance degradation. To address this, we introduce SqueezeLLM, a post-training quantization framework that not only enables lossless compression to ultra-low precisions of up to 3-bit, but also achieves higher quantization performance under the same memory constraint. Our framework incorporates two novel ideas: (i) sensitivity-based non-uniform quantization, which searches for the optimal bit precision assignment based on second-order information; and (ii) the Dense-and-Sparse decomposition that stores outliers and sensitive weight values in an efficient sparse format. When applied to the LLaMA models, our 3-bit quantization significantly reduces the perplexity gap from the FP16 baseline by up to 2.1x as compared to the state-of-the-art methods with the same memory requirement. Furthermore, when deployed on an A6000 GPU, our quantized models achieve up to 2.3x speedup compared to the baseline. Our code is available at https://github.com/SqueezeAILab/SqueezeLLM. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.07629v4-abstract-full').style.display = 'none'; document.getElementById('2306.07629v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICML 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2302.14017">arXiv:2302.14017</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2302.14017">pdf</a>, <a href="https://arxiv.org/format/2302.14017">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Full Stack Optimization of Transformer Inference: a Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kim%2C+S">Sehoon Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Hooper%2C+C">Coleman Hooper</a>, <a href="/search/cs?searchtype=author&amp;query=Wattanawong%2C+T">Thanakul Wattanawong</a>, <a href="/search/cs?searchtype=author&amp;query=Kang%2C+M">Minwoo Kang</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+R">Ruohan Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Genc%2C+H">Hasan Genc</a>, <a href="/search/cs?searchtype=author&amp;query=Dinh%2C+G">Grace Dinh</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Q">Qijing Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Keutzer%2C+K">Kurt Keutzer</a>, <a href="/search/cs?searchtype=author&amp;query=Mahoney%2C+M+W">Michael W. Mahoney</a>, <a href="/search/cs?searchtype=author&amp;query=Shao%2C+Y+S">Yakun Sophia Shao</a>, <a href="/search/cs?searchtype=author&amp;query=Gholami%2C+A">Amir Gholami</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2302.14017v1-abstract-short" style="display: inline;"> Recent advances in state-of-the-art DNN architecture design have been moving toward Transformer models. These models achieve superior accuracy across a wide range of applications. This trend has been consistent over the past several years since Transformer models were originally introduced. However, the amount of compute and bandwidth required for inference of recent Transformer models is growing&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.14017v1-abstract-full').style.display = 'inline'; document.getElementById('2302.14017v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2302.14017v1-abstract-full" style="display: none;"> Recent advances in state-of-the-art DNN architecture design have been moving toward Transformer models. These models achieve superior accuracy across a wide range of applications. This trend has been consistent over the past several years since Transformer models were originally introduced. However, the amount of compute and bandwidth required for inference of recent Transformer models is growing at a significant rate, and this has made their deployment in latency-sensitive applications challenging. As such, there has been an increased focus on making Transformer models more efficient, with methods that range from changing the architecture design, all the way to developing dedicated domain-specific accelerators. In this work, we survey different approaches for efficient Transformer inference, including: (i) analysis and profiling of the bottlenecks in existing Transformer architectures and their similarities and differences with previous convolutional models; (ii) implications of Transformer architecture on hardware, including the impact of non-linear operations such as Layer Normalization, Softmax, and GELU, as well as linear operations, on hardware design; (iii) approaches for optimizing a fixed Transformer architecture; (iv) challenges in finding the right mapping and scheduling of operations for Transformer models; and (v) approaches for optimizing Transformer models by adapting the architecture using neural architecture search. Finally, we perform a case study by applying the surveyed optimizations on Gemmini, the open-source, full-stack DNN accelerator generator, and we show how each of these approaches can yield improvements, compared to previous benchmark results on Gemmini. Among other things, we find that a full-stack co-design approach with the aforementioned methods can result in up to 88.7x speedup with a minimal performance degradation for Transformer inference. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.14017v1-abstract-full').style.display = 'none'; document.getElementById('2302.14017v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Presented in Workshop on Architecture and System Support for Transformer Models (ASSYST) at ISCA 2023 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2105.01134">arXiv:2105.01134</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2105.01134">pdf</a>, <a href="https://arxiv.org/format/2105.01134">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Quantifying and Maximizing the Benefits of Back-End Noise Adaption on Attention-Based Speech Recognition Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hooper%2C+C">Coleman Hooper</a>, <a href="/search/cs?searchtype=author&amp;query=Tambe%2C+T">Thierry Tambe</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+G">Gu-Yeon Wei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2105.01134v2-abstract-short" style="display: inline;"> This work analyzes how attention-based Bidirectional Long Short-Term Memory (BLSTM) models adapt to noise-augmented speech. We identify crucial components for noise adaptation in BLSTM models by freezing model components during fine-tuning. We first freeze larger model subnetworks and then pursue a fine-grained freezing approach in the encoder after identifying its importance for noise adaptation.&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2105.01134v2-abstract-full').style.display = 'inline'; document.getElementById('2105.01134v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2105.01134v2-abstract-full" style="display: none;"> This work analyzes how attention-based Bidirectional Long Short-Term Memory (BLSTM) models adapt to noise-augmented speech. We identify crucial components for noise adaptation in BLSTM models by freezing model components during fine-tuning. We first freeze larger model subnetworks and then pursue a fine-grained freezing approach in the encoder after identifying its importance for noise adaptation. The first encoder layer is shown to be crucial for noise adaptation, and the weights are shown to be more important than the other layers. Appreciable accuracy benefits are identified when fine-tuning on a target noisy environment from a model pretrained with noisy speech relative to fine-tuning from a model pretrained with only clean speech when tested on the target noisy environment. For this analysis, we produce our own dataset augmentation tool and it is open-sourced to encourage future efforts in exploring noise adaptation in ASR. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2105.01134v2-abstract-full').style.display = 'none'; document.getElementById('2105.01134v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 September, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 May, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to ENLSP 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2011.14203">arXiv:2011.14203</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2011.14203">pdf</a>, <a href="https://arxiv.org/format/2011.14203">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> EdgeBERT: Sentence-Level Energy Optimizations for Latency-Aware Multi-Task NLP Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tambe%2C+T">Thierry Tambe</a>, <a href="/search/cs?searchtype=author&amp;query=Hooper%2C+C">Coleman Hooper</a>, <a href="/search/cs?searchtype=author&amp;query=Pentecost%2C+L">Lillian Pentecost</a>, <a href="/search/cs?searchtype=author&amp;query=Jia%2C+T">Tianyu Jia</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+E">En-Yu Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Donato%2C+M">Marco Donato</a>, <a href="/search/cs?searchtype=author&amp;query=Sanh%2C+V">Victor Sanh</a>, <a href="/search/cs?searchtype=author&amp;query=Whatmough%2C+P+N">Paul N. Whatmough</a>, <a href="/search/cs?searchtype=author&amp;query=Rush%2C+A+M">Alexander M. Rush</a>, <a href="/search/cs?searchtype=author&amp;query=Brooks%2C+D">David Brooks</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+G">Gu-Yeon Wei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2011.14203v5-abstract-short" style="display: inline;"> Transformer-based language models such as BERT provide significant accuracy improvement for a multitude of natural language processing (NLP) tasks. However, their hefty computational and memory demands make them challenging to deploy to resource-constrained edge platforms with strict latency requirements. We present EdgeBERT, an in-depth algorithm-hardware co-design for latency-aware energy optimi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2011.14203v5-abstract-full').style.display = 'inline'; document.getElementById('2011.14203v5-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2011.14203v5-abstract-full" style="display: none;"> Transformer-based language models such as BERT provide significant accuracy improvement for a multitude of natural language processing (NLP) tasks. However, their hefty computational and memory demands make them challenging to deploy to resource-constrained edge platforms with strict latency requirements. We present EdgeBERT, an in-depth algorithm-hardware co-design for latency-aware energy optimization for multi-task NLP. EdgeBERT employs entropy-based early exit predication in order to perform dynamic voltage-frequency scaling (DVFS), at a sentence granularity, for minimal energy consumption while adhering to a prescribed target latency. Computation and memory footprint overheads are further alleviated by employing a calibrated combination of adaptive attention span, selective network pruning, and floating-point quantization. Furthermore, in order to maximize the synergistic benefits of these algorithms in always-on and intermediate edge computing settings, we specialize a 12nm scalable hardware accelerator system, integrating a fast-switching low-dropout voltage regulator (LDO), an all-digital phase-locked loop (ADPLL), as well as, high-density embedded non-volatile memories (eNVMs) wherein the sparse floating-point bit encodings of the shared multi-task parameters are carefully stored. Altogether, latency-aware multi-task NLP inference acceleration on the EdgeBERT hardware system generates up to 7x, 2.5x, and 53x lower energy compared to the conventional inference without early stopping, the latency-unbounded early exit approach, and CUDA adaptations on an Nvidia Jetson Tegra X2 mobile GPU, respectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2011.14203v5-abstract-full').style.display = 'none'; document.getElementById('2011.14203v5-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 September, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 November, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages plus references. Paper to appear at the 54th IEEE/ACM International Symposium on Microarchitecture (MICRO 2021)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1307.0516">arXiv:1307.0516</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1307.0516">pdf</a>, <a href="https://arxiv.org/format/1307.0516">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Adaptation and Self-Organizing Systems">nlin.AO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Physics and Society">physics.soc-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Populations and Evolution">q-bio.PE</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.3390/e15114932">10.3390/e15114932 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Dynamical Structure of a Traditional Amazonian Social Network </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hooper%2C+P+L">Paul L. Hooper</a>, <a href="/search/cs?searchtype=author&amp;query=DeDeo%2C+S">Simon DeDeo</a>, <a href="/search/cs?searchtype=author&amp;query=Hooper%2C+A+E+C">Ann E. Caldwell Hooper</a>, <a href="/search/cs?searchtype=author&amp;query=Gurven%2C+M">Michael Gurven</a>, <a href="/search/cs?searchtype=author&amp;query=Kaplan%2C+H+S">Hillard S. Kaplan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1307.0516v2-abstract-short" style="display: inline;"> Reciprocity is a vital feature of social networks, but relatively little is known about its temporal structure or the mechanisms underlying its persistence in real world behavior. In pursuit of these two questions, we study the stationary and dynamical signals of reciprocity in a network of manioc beer (Spanish: chicha; Tsimane&#39;: shocdye&#39;) drinking events in a Tsimane&#39; village in lowland Bolivia.&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1307.0516v2-abstract-full').style.display = 'inline'; document.getElementById('1307.0516v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1307.0516v2-abstract-full" style="display: none;"> Reciprocity is a vital feature of social networks, but relatively little is known about its temporal structure or the mechanisms underlying its persistence in real world behavior. In pursuit of these two questions, we study the stationary and dynamical signals of reciprocity in a network of manioc beer (Spanish: chicha; Tsimane&#39;: shocdye&#39;) drinking events in a Tsimane&#39; village in lowland Bolivia. At the stationary level, our analysis reveals that social exchange within the community is heterogeneously patterned according to kinship and spatial proximity. A positive relationship between the frequencies at which two families host each other, controlling for kinship and proximity, provides evidence for stationary reciprocity. Our analysis of the dynamical structure of this network presents a novel method for the study of conditional, or non-stationary, reciprocity effects. We find evidence that short-timescale reciprocity (within three days) is present among non- and distant-kin pairs; conversely, we find that levels of cooperation among close kin can be accounted for on the stationary hypothesis alone. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1307.0516v2-abstract-full').style.display = 'none'; document.getElementById('1307.0516v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2013; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 July, 2013; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2013. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">24 pages, 6 figures, 1 table; expanded results and discussion sections; matches published version</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Entropy 2013, 15, 4932-4955 </p> </li> </ol> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

Pages: 1 2 3 4 5 6 7 8 9 10