CINXE.COM
Search | arXiv e-print repository
<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–19 of 19 results for author: <span class="mathjax">Ruwase, O</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Ruwase%2C+O">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Ruwase, O"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Ruwase%2C+O&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Ruwase, O"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.08948">arXiv:2412.08948</a> <span> [<a href="https://arxiv.org/pdf/2412.08948">pdf</a>, <a href="https://arxiv.org/format/2412.08948">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Mojito: Motion Trajectory and Intensity Control for Video Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+X">Xuehai He</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shuohang Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jianwei Yang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xiaoxia Wu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yiping Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+K">Kuan Wang</a>, <a href="/search/cs?searchtype=author&query=Zhan%2C+Z">Zheng Zhan</a>, <a href="/search/cs?searchtype=author&query=Ruwase%2C+O">Olatunji Ruwase</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+Y">Yelong Shen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X+E">Xin Eric Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.08948v2-abstract-short" style="display: inline;"> Recent advancements in diffusion models have shown great promise in producing high-quality video content. However, efficiently training video diffusion models capable of integrating directional guidance and controllable motion intensity remains a challenging and under-explored area. To tackle these challenges, this paper introduces Mojito, a diffusion model that incorporates both motion trajectory… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.08948v2-abstract-full').style.display = 'inline'; document.getElementById('2412.08948v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.08948v2-abstract-full" style="display: none;"> Recent advancements in diffusion models have shown great promise in producing high-quality video content. However, efficiently training video diffusion models capable of integrating directional guidance and controllable motion intensity remains a challenging and under-explored area. To tackle these challenges, this paper introduces Mojito, a diffusion model that incorporates both motion trajectory and intensity control for text-to-video generation. Specifically, Mojito features a Directional Motion Control (DMC) module that leverages cross-attention to efficiently direct the generated object's motion without training, alongside a Motion Intensity Modulator (MIM) that uses optical flow maps generated from videos to guide varying levels of motion intensity. Extensive experiments demonstrate Mojito's effectiveness in achieving precise trajectory and intensity control with high computational efficiency, generating motion patterns that closely match specified directions and intensities, providing realistic dynamics that align well with natural motion in real-world scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.08948v2-abstract-full').style.display = 'none'; document.getElementById('2412.08948v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.15241">arXiv:2409.15241</a> <span> [<a href="https://arxiv.org/pdf/2409.15241">pdf</a>, <a href="https://arxiv.org/format/2409.15241">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Domino: Eliminating Communication in LLM Training via Generic Tensor Slicing and Overlapping </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guanhua Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chengming Zhang</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+Z">Zheyu Shen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+A">Ang Li</a>, <a href="/search/cs?searchtype=author&query=Ruwase%2C+O">Olatunji Ruwase</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.15241v1-abstract-short" style="display: inline;"> Given the popularity of generative AI, Large Language Models (LLMs) often consume hundreds or thousands of GPUs for parallelizing and accelerating the training process. Communication overhead becomes more pronounced when training LLMs at scale. To eliminate communication overhead in distributed LLM training, we propose Domino, which provides a generic scheme to hide communication behind computatio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.15241v1-abstract-full').style.display = 'inline'; document.getElementById('2409.15241v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.15241v1-abstract-full" style="display: none;"> Given the popularity of generative AI, Large Language Models (LLMs) often consume hundreds or thousands of GPUs for parallelizing and accelerating the training process. Communication overhead becomes more pronounced when training LLMs at scale. To eliminate communication overhead in distributed LLM training, we propose Domino, which provides a generic scheme to hide communication behind computation. By breaking data dependency of a single batch training into smaller independent pieces, Domino pipelines these independent pieces training and provides generic strategy of fine-grained communication and computation overlapping. Extensive results show that, comparing with Megatron-LM, Domino achieves up to 1.3x speedup for LLM training on Nvidia DGX-H100 GPUs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.15241v1-abstract-full').style.display = 'none'; document.getElementById('2409.15241v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.16978">arXiv:2408.16978</a> <span> [<a href="https://arxiv.org/pdf/2408.16978">pdf</a>, <a href="https://arxiv.org/format/2408.16978">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Training Ultra Long Context Language Model with Fully Pipelined Distributed Transformer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yao%2C+J">Jinghan Yao</a>, <a href="/search/cs?searchtype=author&query=Jacobs%2C+S+A">Sam Ade Jacobs</a>, <a href="/search/cs?searchtype=author&query=Tanaka%2C+M">Masahiro Tanaka</a>, <a href="/search/cs?searchtype=author&query=Ruwase%2C+O">Olatunji Ruwase</a>, <a href="/search/cs?searchtype=author&query=Shafi%2C+A">Aamir Shafi</a>, <a href="/search/cs?searchtype=author&query=Subramoni%2C+H">Hari Subramoni</a>, <a href="/search/cs?searchtype=author&query=Panda%2C+D+K">Dhabaleswar K. Panda</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.16978v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) with long context capabilities are integral to complex tasks in natural language processing and computational biology, such as text generation and protein sequence analysis. However, training LLMs directly on extremely long contexts demands considerable GPU resources and increased memory, leading to higher costs and greater complexity. Alternative approaches that intro… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.16978v1-abstract-full').style.display = 'inline'; document.getElementById('2408.16978v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.16978v1-abstract-full" style="display: none;"> Large Language Models (LLMs) with long context capabilities are integral to complex tasks in natural language processing and computational biology, such as text generation and protein sequence analysis. However, training LLMs directly on extremely long contexts demands considerable GPU resources and increased memory, leading to higher costs and greater complexity. Alternative approaches that introduce long context capabilities via downstream finetuning or adaptations impose significant design limitations. In this paper, we propose Fully Pipelined Distributed Transformer (FPDT) for efficiently training long-context LLMs with extreme hardware efficiency. For GPT and Llama models, we achieve a 16x increase in sequence length that can be trained on the same hardware compared to current state-of-the-art solutions. With our dedicated sequence chunk pipeline design, we can now train 8B LLM with 2 million sequence length on only 4 GPUs, while also maintaining over 55% of MFU. Our proposed FPDT is agnostic to existing training techniques and is proven to work efficiently across different LLM models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.16978v1-abstract-full').style.display = 'none'; document.getElementById('2408.16978v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.18820">arXiv:2406.18820</a> <span> [<a href="https://arxiv.org/pdf/2406.18820">pdf</a>, <a href="https://arxiv.org/format/2406.18820">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Universal Checkpointing: Efficient and Flexible Checkpointing for Large Scale Distributed Training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lian%2C+X">Xinyu Lian</a>, <a href="/search/cs?searchtype=author&query=Jacobs%2C+S+A">Sam Ade Jacobs</a>, <a href="/search/cs?searchtype=author&query=Kurilenko%2C+L">Lev Kurilenko</a>, <a href="/search/cs?searchtype=author&query=Tanaka%2C+M">Masahiro Tanaka</a>, <a href="/search/cs?searchtype=author&query=Bekman%2C+S">Stas Bekman</a>, <a href="/search/cs?searchtype=author&query=Ruwase%2C+O">Olatunji Ruwase</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Minjia Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.18820v2-abstract-short" style="display: inline;"> Existing checkpointing approaches seem ill-suited for distributed training even though hardware limitations make model parallelism, i.e., sharding model state across multiple accelerators, a requirement for model scaling. Consolidating distributed model state into a single checkpoint unacceptably slows down training, and is impractical at extreme scales. Distributed checkpoints, in contrast, are t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.18820v2-abstract-full').style.display = 'inline'; document.getElementById('2406.18820v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.18820v2-abstract-full" style="display: none;"> Existing checkpointing approaches seem ill-suited for distributed training even though hardware limitations make model parallelism, i.e., sharding model state across multiple accelerators, a requirement for model scaling. Consolidating distributed model state into a single checkpoint unacceptably slows down training, and is impractical at extreme scales. Distributed checkpoints, in contrast, are tightly coupled to the model parallelism and hardware configurations of the training run, and thus unusable on different configurations. To address this problem, we propose Universal Checkpointing, a technique that enables efficient checkpoint creation while providing the flexibility of resuming on arbitrary parallelism strategy and hardware configurations. Universal Checkpointing unlocks unprecedented capabilities for large-scale training such as improved resilience to hardware failures through continued training on remaining healthy hardware, and reduced training time through opportunistic exploitation of elastic capacity. The key insight of Universal Checkpointing is the selection of the optimal representation in each phase of the checkpointing life cycle: distributed representation for saving, and consolidated representation for loading. This is achieved using two key mechanisms. First, the universal checkpoint format, which consists of a consolidated representation of each model parameter and metadata for mapping parameter fragments into training ranks of arbitrary model-parallelism configuration. Second, the universal checkpoint language, a simple but powerful specification language for converting distributed checkpoints into the universal checkpoint format. Our evaluation demonstrates the effectiveness and generality of Universal Checkpointing on state-of-the-art model architectures and a wide range of parallelism techniques. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.18820v2-abstract-full').style.display = 'none'; document.getElementById('2406.18820v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.13768">arXiv:2406.13768</a> <span> [<a href="https://arxiv.org/pdf/2406.13768">pdf</a>, <a href="https://arxiv.org/format/2406.13768">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Performance">cs.PF</span> </div> </div> <p class="title is-5 mathjax"> FastPersist: Accelerating Model Checkpointing in Deep Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guanhua Wang</a>, <a href="/search/cs?searchtype=author&query=Ruwase%2C+O">Olatunji Ruwase</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+B">Bing Xie</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Yuxiong He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.13768v1-abstract-short" style="display: inline;"> Model checkpoints are critical Deep Learning (DL) artifacts that enable fault tolerance for training and downstream applications, such as inference. However, writing checkpoints to persistent storage, and other I/O aspects of DL training, are mostly ignored by compute-focused optimization efforts for faster training of rapidly growing models and datasets. Towards addressing this imbalance, we prop… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.13768v1-abstract-full').style.display = 'inline'; document.getElementById('2406.13768v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.13768v1-abstract-full" style="display: none;"> Model checkpoints are critical Deep Learning (DL) artifacts that enable fault tolerance for training and downstream applications, such as inference. However, writing checkpoints to persistent storage, and other I/O aspects of DL training, are mostly ignored by compute-focused optimization efforts for faster training of rapidly growing models and datasets. Towards addressing this imbalance, we propose FastPersist to accelerate checkpoint creation in DL training. FastPersist combines three novel techniques: (i) NVMe optimizations for faster checkpoint writes to SSDs, (ii) efficient write parallelism using the available SSDs in training environments, and (iii) overlapping checkpointing with independent training computations. Our evaluation using real world dense and sparse DL models shows that FastPersist creates checkpoints in persistent storage up to 116x faster than baseline, and enables per-iteration checkpointing with negligible overhead. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.13768v1-abstract-full').style.display = 'none'; document.getElementById('2406.13768v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.14219">arXiv:2404.14219</a> <span> [<a href="https://arxiv.org/pdf/2404.14219">pdf</a>, <a href="https://arxiv.org/format/2404.14219">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Phi-3 Technical Report: A Highly Capable Language Model Locally on Your Phone </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Abdin%2C+M">Marah Abdin</a>, <a href="/search/cs?searchtype=author&query=Aneja%2C+J">Jyoti Aneja</a>, <a href="/search/cs?searchtype=author&query=Awadalla%2C+H">Hany Awadalla</a>, <a href="/search/cs?searchtype=author&query=Awadallah%2C+A">Ahmed Awadallah</a>, <a href="/search/cs?searchtype=author&query=Awan%2C+A+A">Ammar Ahmad Awan</a>, <a href="/search/cs?searchtype=author&query=Bach%2C+N">Nguyen Bach</a>, <a href="/search/cs?searchtype=author&query=Bahree%2C+A">Amit Bahree</a>, <a href="/search/cs?searchtype=author&query=Bakhtiari%2C+A">Arash Bakhtiari</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+J">Jianmin Bao</a>, <a href="/search/cs?searchtype=author&query=Behl%2C+H">Harkirat Behl</a>, <a href="/search/cs?searchtype=author&query=Benhaim%2C+A">Alon Benhaim</a>, <a href="/search/cs?searchtype=author&query=Bilenko%2C+M">Misha Bilenko</a>, <a href="/search/cs?searchtype=author&query=Bjorck%2C+J">Johan Bjorck</a>, <a href="/search/cs?searchtype=author&query=Bubeck%2C+S">S茅bastien Bubeck</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+M">Martin Cai</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+Q">Qin Cai</a>, <a href="/search/cs?searchtype=author&query=Chaudhary%2C+V">Vishrav Chaudhary</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+D">Dong Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+D">Dongdong Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Weizhu Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yen-Chun Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yi-Ling Chen</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+H">Hao Cheng</a>, <a href="/search/cs?searchtype=author&query=Chopra%2C+P">Parul Chopra</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+X">Xiyang Dai</a> , et al. (104 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.14219v4-abstract-short" style="display: inline;"> We introduce phi-3-mini, a 3.8 billion parameter language model trained on 3.3 trillion tokens, whose overall performance, as measured by both academic benchmarks and internal testing, rivals that of models such as Mixtral 8x7B and GPT-3.5 (e.g., phi-3-mini achieves 69% on MMLU and 8.38 on MT-bench), despite being small enough to be deployed on a phone. Our training dataset is a scaled-up version… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.14219v4-abstract-full').style.display = 'inline'; document.getElementById('2404.14219v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.14219v4-abstract-full" style="display: none;"> We introduce phi-3-mini, a 3.8 billion parameter language model trained on 3.3 trillion tokens, whose overall performance, as measured by both academic benchmarks and internal testing, rivals that of models such as Mixtral 8x7B and GPT-3.5 (e.g., phi-3-mini achieves 69% on MMLU and 8.38 on MT-bench), despite being small enough to be deployed on a phone. Our training dataset is a scaled-up version of the one used for phi-2, composed of heavily filtered publicly available web data and synthetic data. The model is also further aligned for robustness, safety, and chat format. We also provide parameter-scaling results with a 7B, 14B models trained for 4.8T tokens, called phi-3-small, phi-3-medium, both significantly more capable than phi-3-mini (e.g., respectively 75%, 78% on MMLU, and 8.7, 8.9 on MT-bench). To enhance multilingual, multimodal, and long-context capabilities, we introduce three models in the phi-3.5 series: phi-3.5-mini, phi-3.5-MoE, and phi-3.5-Vision. The phi-3.5-MoE, a 16 x 3.8B MoE model with 6.6 billion active parameters, achieves superior performance in language reasoning, math, and code tasks compared to other open-source models of similar scale, such as Llama 3.1 and the Mixtral series, and on par with Gemini-1.5-Flash and GPT-4o-mini. Meanwhile, phi-3.5-Vision, a 4.2 billion parameter model derived from phi-3.5-mini, excels in reasoning tasks and is adept at handling both single-image and text prompts, as well as multi-image and text prompts. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.14219v4-abstract-full').style.display = 'none'; document.getElementById('2404.14219v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">24 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.04797">arXiv:2403.04797</a> <span> [<a href="https://arxiv.org/pdf/2403.04797">pdf</a>, <a href="https://arxiv.org/format/2403.04797">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Found in the Middle: How Language Models Use Long Contexts Better via Plug-and-Play Positional Encoding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhenyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+R">Runjin Chen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Shiwei Liu</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Z">Zhewei Yao</a>, <a href="/search/cs?searchtype=author&query=Ruwase%2C+O">Olatunji Ruwase</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+B">Beidi Chen</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xiaoxia Wu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhangyang Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.04797v1-abstract-short" style="display: inline;"> This paper aims to overcome the "lost-in-the-middle" challenge of large language models (LLMs). While recent advancements have successfully enabled LLMs to perform stable language modeling with up to 4 million tokens, the persistent difficulty faced by most LLMs in identifying relevant information situated in the middle of the context has not been adequately tackled. To address this problem, this… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.04797v1-abstract-full').style.display = 'inline'; document.getElementById('2403.04797v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.04797v1-abstract-full" style="display: none;"> This paper aims to overcome the "lost-in-the-middle" challenge of large language models (LLMs). While recent advancements have successfully enabled LLMs to perform stable language modeling with up to 4 million tokens, the persistent difficulty faced by most LLMs in identifying relevant information situated in the middle of the context has not been adequately tackled. To address this problem, this paper introduces Multi-scale Positional Encoding (Ms-PoE) which is a simple yet effective plug-and-play approach to enhance the capacity of LLMs to handle the relevant information located in the middle of the context, without fine-tuning or introducing any additional overhead. Ms-PoE leverages the position indice rescaling to relieve the long-term decay effect introduced by RoPE, while meticulously assigning distinct scaling ratios to different attention heads to preserve essential knowledge learned during the pre-training step, forming a multi-scale context fusion from short to long distance. Extensive experiments with a wide range of LLMs demonstrate the efficacy of our approach. Notably, Ms-PoE achieves an average accuracy gain of up to 3.8 on the Zero-SCROLLS benchmark over the original LLMs. Code are available at https://github.com/VITA-Group/Ms-PoE. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.04797v1-abstract-full').style.display = 'none'; document.getElementById('2403.04797v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.14112">arXiv:2401.14112</a> <span> [<a href="https://arxiv.org/pdf/2401.14112">pdf</a>, <a href="https://arxiv.org/format/2401.14112">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> </div> <p class="title is-5 mathjax"> FP6-LLM: Efficiently Serving Large Language Models Through FP6-Centric Algorithm-System Co-Design </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xia%2C+H">Haojun Xia</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Z">Zhen Zheng</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xiaoxia Wu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Shiyang Chen</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Z">Zhewei Yao</a>, <a href="/search/cs?searchtype=author&query=Youn%2C+S">Stephen Youn</a>, <a href="/search/cs?searchtype=author&query=Bakhtiari%2C+A">Arash Bakhtiari</a>, <a href="/search/cs?searchtype=author&query=Wyatt%2C+M">Michael Wyatt</a>, <a href="/search/cs?searchtype=author&query=Zhuang%2C+D">Donglin Zhuang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zhongzhu Zhou</a>, <a href="/search/cs?searchtype=author&query=Ruwase%2C+O">Olatunji Ruwase</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Yuxiong He</a>, <a href="/search/cs?searchtype=author&query=Song%2C+S+L">Shuaiwen Leon Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.14112v2-abstract-short" style="display: inline;"> Six-bit quantization (FP6) can effectively reduce the size of large language models (LLMs) and preserve the model quality consistently across varied applications. However, existing systems do not provide Tensor Core support for FP6 quantization and struggle to achieve practical performance improvements during LLM inference. It is challenging to support FP6 quantization on GPUs due to (1) unfriendl… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.14112v2-abstract-full').style.display = 'inline'; document.getElementById('2401.14112v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.14112v2-abstract-full" style="display: none;"> Six-bit quantization (FP6) can effectively reduce the size of large language models (LLMs) and preserve the model quality consistently across varied applications. However, existing systems do not provide Tensor Core support for FP6 quantization and struggle to achieve practical performance improvements during LLM inference. It is challenging to support FP6 quantization on GPUs due to (1) unfriendly memory access of model weights with irregular bit-width and (2) high runtime overhead of weight de-quantization. To address these problems, we propose TC-FPx, the first full-stack GPU kernel design scheme with unified Tensor Core support of float-point weights for various quantization bit-width. We integrate TC-FPx kernel into an existing inference system, providing new end-to-end support (called FP6-LLM) for quantized LLM inference, where better trade-offs between inference cost and model quality are achieved. Experiments show that FP6-LLM enables the inference of LLaMA-70b using only a single GPU, achieving 1.69x-2.65x higher normalized inference throughput than the FP16 baseline. The source code is publicly available at https://github.com/usyd-fsalab/fp6_llm. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.14112v2-abstract-full').style.display = 'none'; document.getElementById('2401.14112v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Adding URL link of the source code</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.08583">arXiv:2312.08583</a> <span> [<a href="https://arxiv.org/pdf/2312.08583">pdf</a>, <a href="https://arxiv.org/format/2312.08583">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> ZeroQuant(4+2): Redefining LLMs Quantization with a New FP6-Centric Strategy for Diverse Generative Tasks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xiaoxia Wu</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+H">Haojun Xia</a>, <a href="/search/cs?searchtype=author&query=Youn%2C+S">Stephen Youn</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Z">Zhen Zheng</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Shiyang Chen</a>, <a href="/search/cs?searchtype=author&query=Bakhtiari%2C+A">Arash Bakhtiari</a>, <a href="/search/cs?searchtype=author&query=Wyatt%2C+M">Michael Wyatt</a>, <a href="/search/cs?searchtype=author&query=Aminabadi%2C+R+Y">Reza Yazdani Aminabadi</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Yuxiong He</a>, <a href="/search/cs?searchtype=author&query=Ruwase%2C+O">Olatunji Ruwase</a>, <a href="/search/cs?searchtype=author&query=Song%2C+L">Leon Song</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Z">Zhewei Yao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.08583v2-abstract-short" style="display: inline;"> This study examines 4-bit quantization methods like GPTQ in large language models (LLMs), highlighting GPTQ's overfitting and limited enhancement in Zero-Shot tasks. While prior works merely focusing on zero-shot measurement, we extend task scope to more generative categories such as code generation and abstractive summarization, in which we found that INT4 quantization can significantly underperf… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.08583v2-abstract-full').style.display = 'inline'; document.getElementById('2312.08583v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.08583v2-abstract-full" style="display: none;"> This study examines 4-bit quantization methods like GPTQ in large language models (LLMs), highlighting GPTQ's overfitting and limited enhancement in Zero-Shot tasks. While prior works merely focusing on zero-shot measurement, we extend task scope to more generative categories such as code generation and abstractive summarization, in which we found that INT4 quantization can significantly underperform. However, simply shifting to higher precision formats like FP6 has been particularly challenging, thus overlooked, due to poor performance caused by the lack of sophisticated integration and system acceleration strategies on current AI hardware. Our results show that FP6, even with a coarse-grain quantization scheme, performs robustly across various algorithms and tasks, demonstrating its superiority in accuracy and versatility. Notably, with the FP6 quantization, \codestar-15B model performs comparably to its FP16 counterpart in code generation, and for smaller models like the 406M it closely matches their baselines in summarization. Neither can be achieved by INT4. To better accommodate various AI hardware and achieve the best system performance, we propose a novel 4+2 design for FP6 to achieve similar latency to the state-of-the-art INT4 fine-grain quantization. With our design, FP6 can become a promising solution to the current 4-bit quantization methods used in LLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.08583v2-abstract-full').style.display = 'none'; document.getElementById('2312.08583v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.14327">arXiv:2309.14327</a> <span> [<a href="https://arxiv.org/pdf/2309.14327">pdf</a>, <a href="https://arxiv.org/format/2309.14327">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> DeepSpeed-VisualChat: Multi-Round Multi-Image Interleave Chat via Multi-Modal Causal Attention </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yao%2C+Z">Zhewei Yao</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xiaoxia Wu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Conglong Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Minjia Zhang</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+H">Heyang Qin</a>, <a href="/search/cs?searchtype=author&query=Ruwase%2C+O">Olatunji Ruwase</a>, <a href="/search/cs?searchtype=author&query=Awan%2C+A+A">Ammar Ahmad Awan</a>, <a href="/search/cs?searchtype=author&query=Rajbhandari%2C+S">Samyam Rajbhandari</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Yuxiong He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.14327v3-abstract-short" style="display: inline;"> Most of the existing multi-modal models, hindered by their incapacity to adeptly manage interleaved image-and-text inputs in multi-image, multi-round dialogues, face substantial constraints in resource allocation for training and data accessibility, impacting their adaptability and scalability across varied interaction realms. To address this, we present the DeepSpeed-VisualChat framework, designe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.14327v3-abstract-full').style.display = 'inline'; document.getElementById('2309.14327v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.14327v3-abstract-full" style="display: none;"> Most of the existing multi-modal models, hindered by their incapacity to adeptly manage interleaved image-and-text inputs in multi-image, multi-round dialogues, face substantial constraints in resource allocation for training and data accessibility, impacting their adaptability and scalability across varied interaction realms. To address this, we present the DeepSpeed-VisualChat framework, designed to optimize Large Language Models (LLMs) by incorporating multi-modal capabilities, with a focus on enhancing the proficiency of Large Vision and Language Models in handling interleaved inputs. Our framework is notable for (1) its open-source support for multi-round and multi-image dialogues, (2) introducing an innovative multi-modal causal attention mechanism, and (3) utilizing data blending techniques on existing datasets to assure seamless interactions in multi-round, multi-image conversations. Compared to existing frameworks, DeepSpeed-VisualChat shows superior scalability up to 70B parameter language model size, representing a significant advancement in multi-modal language models and setting a solid foundation for future explorations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.14327v3-abstract-full').style.display = 'none'; document.getElementById('2309.14327v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.01320">arXiv:2308.01320</a> <span> [<a href="https://arxiv.org/pdf/2308.01320">pdf</a>, <a href="https://arxiv.org/format/2308.01320">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> DeepSpeed-Chat: Easy, Fast and Affordable RLHF Training of ChatGPT-like Models at All Scales </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yao%2C+Z">Zhewei Yao</a>, <a href="/search/cs?searchtype=author&query=Aminabadi%2C+R+Y">Reza Yazdani Aminabadi</a>, <a href="/search/cs?searchtype=author&query=Ruwase%2C+O">Olatunji Ruwase</a>, <a href="/search/cs?searchtype=author&query=Rajbhandari%2C+S">Samyam Rajbhandari</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xiaoxia Wu</a>, <a href="/search/cs?searchtype=author&query=Awan%2C+A+A">Ammar Ahmad Awan</a>, <a href="/search/cs?searchtype=author&query=Rasley%2C+J">Jeff Rasley</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Minjia Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Conglong Li</a>, <a href="/search/cs?searchtype=author&query=Holmes%2C+C">Connor Holmes</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zhongzhu Zhou</a>, <a href="/search/cs?searchtype=author&query=Wyatt%2C+M">Michael Wyatt</a>, <a href="/search/cs?searchtype=author&query=Smith%2C+M">Molly Smith</a>, <a href="/search/cs?searchtype=author&query=Kurilenko%2C+L">Lev Kurilenko</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+H">Heyang Qin</a>, <a href="/search/cs?searchtype=author&query=Tanaka%2C+M">Masahiro Tanaka</a>, <a href="/search/cs?searchtype=author&query=Che%2C+S">Shuai Che</a>, <a href="/search/cs?searchtype=author&query=Song%2C+S+L">Shuaiwen Leon Song</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Yuxiong He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.01320v1-abstract-short" style="display: inline;"> ChatGPT-like models have revolutionized various applications in artificial intelligence, from summarization and coding to translation, matching or even surpassing human performance. However, the current landscape lacks an accessible, efficient, and cost-effective end-to-end RLHF (Reinforcement Learning with Human Feedback) training pipeline for these powerful models, particularly when training at… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.01320v1-abstract-full').style.display = 'inline'; document.getElementById('2308.01320v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.01320v1-abstract-full" style="display: none;"> ChatGPT-like models have revolutionized various applications in artificial intelligence, from summarization and coding to translation, matching or even surpassing human performance. However, the current landscape lacks an accessible, efficient, and cost-effective end-to-end RLHF (Reinforcement Learning with Human Feedback) training pipeline for these powerful models, particularly when training at the scale of billions of parameters. This paper introduces DeepSpeed-Chat, a novel system that democratizes RLHF training, making it accessible to the AI community. DeepSpeed-Chat offers three key capabilities: an easy-to-use training and inference experience for ChatGPT-like models, a DeepSpeed-RLHF pipeline that replicates the training pipeline from InstructGPT, and a robust DeepSpeed-RLHF system that combines various optimizations for training and inference in a unified way. The system delivers unparalleled efficiency and scalability, enabling training of models with hundreds of billions of parameters in record time and at a fraction of the cost. With this development, DeepSpeed-Chat paves the way for broader access to advanced RLHF training, even for data scientists with limited resources, thereby fostering innovation and further development in the field of AI. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.01320v1-abstract-full').style.display = 'none'; document.getElementById('2308.01320v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.10209">arXiv:2306.10209</a> <span> [<a href="https://arxiv.org/pdf/2306.10209">pdf</a>, <a href="https://arxiv.org/format/2306.10209">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Performance">cs.PF</span> </div> </div> <p class="title is-5 mathjax"> ZeRO++: Extremely Efficient Collective Communication for Giant Model Training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guanhua Wang</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+H">Heyang Qin</a>, <a href="/search/cs?searchtype=author&query=Jacobs%2C+S+A">Sam Ade Jacobs</a>, <a href="/search/cs?searchtype=author&query=Holmes%2C+C">Connor Holmes</a>, <a href="/search/cs?searchtype=author&query=Rajbhandari%2C+S">Samyam Rajbhandari</a>, <a href="/search/cs?searchtype=author&query=Ruwase%2C+O">Olatunji Ruwase</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+F">Feng Yan</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+L">Lei Yang</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Yuxiong He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.10209v1-abstract-short" style="display: inline;"> Zero Redundancy Optimizer (ZeRO) has been used to train a wide range of large language models on massive GPUs clusters due to its ease of use, efficiency, and good scalability. However, when training on low-bandwidth clusters, or at scale which forces batch size per GPU to be small, ZeRO's effective throughput is limited because of high communication volume from gathering weights in forward pass,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.10209v1-abstract-full').style.display = 'inline'; document.getElementById('2306.10209v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.10209v1-abstract-full" style="display: none;"> Zero Redundancy Optimizer (ZeRO) has been used to train a wide range of large language models on massive GPUs clusters due to its ease of use, efficiency, and good scalability. However, when training on low-bandwidth clusters, or at scale which forces batch size per GPU to be small, ZeRO's effective throughput is limited because of high communication volume from gathering weights in forward pass, backward pass, and averaging gradients. This paper introduces three communication volume reduction techniques, which we collectively refer to as ZeRO++, targeting each of the communication collectives in ZeRO. First is block-quantization based all-gather. Second is data remapping that trades-off communication for more memory. Third is a novel all-to-all based quantized gradient averaging paradigm as replacement of reduce-scatter collective, which preserves accuracy despite communicating low precision data. Collectively, ZeRO++ reduces communication volume of ZeRO by 4x, enabling up to 2.16x better throughput at 384 GPU scale. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.10209v1-abstract-full').style.display = 'none'; document.getElementById('2306.10209v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.06318">arXiv:2303.06318</a> <span> [<a href="https://arxiv.org/pdf/2303.06318">pdf</a>, <a href="https://arxiv.org/format/2303.06318">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Performance">cs.PF</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3577193.3593704">10.1145/3577193.3593704 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> A Hybrid Tensor-Expert-Data Parallelism Approach to Optimize Mixture-of-Experts Training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Singh%2C+S">Siddharth Singh</a>, <a href="/search/cs?searchtype=author&query=Ruwase%2C+O">Olatunji Ruwase</a>, <a href="/search/cs?searchtype=author&query=Awan%2C+A+A">Ammar Ahmad Awan</a>, <a href="/search/cs?searchtype=author&query=Rajbhandari%2C+S">Samyam Rajbhandari</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Yuxiong He</a>, <a href="/search/cs?searchtype=author&query=Bhatele%2C+A">Abhinav Bhatele</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.06318v2-abstract-short" style="display: inline;"> Mixture-of-Experts (MoE) is a neural network architecture that adds sparsely activated expert blocks to a base model, increasing the number of parameters without impacting computational costs. However, current distributed deep learning frameworks are limited in their ability to train high-quality MoE models with large base models. In this work, we present DeepSpeed-TED, a novel, three-dimensional,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.06318v2-abstract-full').style.display = 'inline'; document.getElementById('2303.06318v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.06318v2-abstract-full" style="display: none;"> Mixture-of-Experts (MoE) is a neural network architecture that adds sparsely activated expert blocks to a base model, increasing the number of parameters without impacting computational costs. However, current distributed deep learning frameworks are limited in their ability to train high-quality MoE models with large base models. In this work, we present DeepSpeed-TED, a novel, three-dimensional, hybrid parallel algorithm that combines data, tensor, and expert parallelism to enable the training of MoE models with 4 to 8x larger base models than the current state-of-the-art. We also describe memory optimizations in the optimizer step, and communication optimizations that eliminate unnecessary data movement. We implement our approach in DeepSpeed and achieve speedups of 26% over a baseline (i.e. without our communication optimizations) when training a 40 billion parameter MoE model (6.7 billion base model with 16 experts) on 128 V100 GPUs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.06318v2-abstract-full').style.display = 'none'; document.getElementById('2303.06318v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.05100">arXiv:2211.05100</a> <span> [<a href="https://arxiv.org/pdf/2211.05100">pdf</a>, <a href="https://arxiv.org/format/2211.05100">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> BLOOM: A 176B-Parameter Open-Access Multilingual Language Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Workshop%2C+B">BigScience Workshop</a>, <a href="/search/cs?searchtype=author&query=%3A"> :</a>, <a href="/search/cs?searchtype=author&query=Scao%2C+T+L">Teven Le Scao</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+A">Angela Fan</a>, <a href="/search/cs?searchtype=author&query=Akiki%2C+C">Christopher Akiki</a>, <a href="/search/cs?searchtype=author&query=Pavlick%2C+E">Ellie Pavlick</a>, <a href="/search/cs?searchtype=author&query=Ili%C4%87%2C+S">Suzana Ili膰</a>, <a href="/search/cs?searchtype=author&query=Hesslow%2C+D">Daniel Hesslow</a>, <a href="/search/cs?searchtype=author&query=Castagn%C3%A9%2C+R">Roman Castagn茅</a>, <a href="/search/cs?searchtype=author&query=Luccioni%2C+A+S">Alexandra Sasha Luccioni</a>, <a href="/search/cs?searchtype=author&query=Yvon%2C+F">Fran莽ois Yvon</a>, <a href="/search/cs?searchtype=author&query=Gall%C3%A9%2C+M">Matthias Gall茅</a>, <a href="/search/cs?searchtype=author&query=Tow%2C+J">Jonathan Tow</a>, <a href="/search/cs?searchtype=author&query=Rush%2C+A+M">Alexander M. Rush</a>, <a href="/search/cs?searchtype=author&query=Biderman%2C+S">Stella Biderman</a>, <a href="/search/cs?searchtype=author&query=Webson%2C+A">Albert Webson</a>, <a href="/search/cs?searchtype=author&query=Ammanamanchi%2C+P+S">Pawan Sasanka Ammanamanchi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+T">Thomas Wang</a>, <a href="/search/cs?searchtype=author&query=Sagot%2C+B">Beno卯t Sagot</a>, <a href="/search/cs?searchtype=author&query=Muennighoff%2C+N">Niklas Muennighoff</a>, <a href="/search/cs?searchtype=author&query=del+Moral%2C+A+V">Albert Villanova del Moral</a>, <a href="/search/cs?searchtype=author&query=Ruwase%2C+O">Olatunji Ruwase</a>, <a href="/search/cs?searchtype=author&query=Bawden%2C+R">Rachel Bawden</a>, <a href="/search/cs?searchtype=author&query=Bekman%2C+S">Stas Bekman</a>, <a href="/search/cs?searchtype=author&query=McMillan-Major%2C+A">Angelina McMillan-Major</a> , et al. (369 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.05100v4-abstract-short" style="display: inline;"> Large language models (LLMs) have been shown to be able to perform new tasks based on a few demonstrations or natural language instructions. While these capabilities have led to widespread adoption, most LLMs are developed by resource-rich organizations and are frequently kept from the public. As a step towards democratizing this powerful technology, we present BLOOM, a 176B-parameter open-access… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.05100v4-abstract-full').style.display = 'inline'; document.getElementById('2211.05100v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.05100v4-abstract-full" style="display: none;"> Large language models (LLMs) have been shown to be able to perform new tasks based on a few demonstrations or natural language instructions. While these capabilities have led to widespread adoption, most LLMs are developed by resource-rich organizations and are frequently kept from the public. As a step towards democratizing this powerful technology, we present BLOOM, a 176B-parameter open-access language model designed and built thanks to a collaboration of hundreds of researchers. BLOOM is a decoder-only Transformer language model that was trained on the ROOTS corpus, a dataset comprising hundreds of sources in 46 natural and 13 programming languages (59 in total). We find that BLOOM achieves competitive performance on a wide variety of benchmarks, with stronger results after undergoing multitask prompted finetuning. To facilitate future research and applications using LLMs, we publicly release our models and code under the Responsible AI License. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.05100v4-abstract-full').style.display = 'none'; document.getElementById('2211.05100v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2207.00032">arXiv:2207.00032</a> <span> [<a href="https://arxiv.org/pdf/2207.00032">pdf</a>, <a href="https://arxiv.org/format/2207.00032">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Performance">cs.PF</span> </div> </div> <p class="title is-5 mathjax"> DeepSpeed Inference: Enabling Efficient Inference of Transformer Models at Unprecedented Scale </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Aminabadi%2C+R+Y">Reza Yazdani Aminabadi</a>, <a href="/search/cs?searchtype=author&query=Rajbhandari%2C+S">Samyam Rajbhandari</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Minjia Zhang</a>, <a href="/search/cs?searchtype=author&query=Awan%2C+A+A">Ammar Ahmad Awan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Cheng Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+D">Du Li</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+E">Elton Zheng</a>, <a href="/search/cs?searchtype=author&query=Rasley%2C+J">Jeff Rasley</a>, <a href="/search/cs?searchtype=author&query=Smith%2C+S">Shaden Smith</a>, <a href="/search/cs?searchtype=author&query=Ruwase%2C+O">Olatunji Ruwase</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Yuxiong He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2207.00032v1-abstract-short" style="display: inline;"> The past several years have witnessed the success of transformer-based models, and their scale and application scenarios continue to grow aggressively. The current landscape of transformer models is increasingly diverse: the model size varies drastically with the largest being of hundred-billion parameters; the model characteristics differ due to the sparsity introduced by the Mixture-of-Experts;… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.00032v1-abstract-full').style.display = 'inline'; document.getElementById('2207.00032v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2207.00032v1-abstract-full" style="display: none;"> The past several years have witnessed the success of transformer-based models, and their scale and application scenarios continue to grow aggressively. The current landscape of transformer models is increasingly diverse: the model size varies drastically with the largest being of hundred-billion parameters; the model characteristics differ due to the sparsity introduced by the Mixture-of-Experts; the target application scenarios can be latency-critical or throughput-oriented; the deployment hardware could be single- or multi-GPU systems with different types of memory and storage, etc. With such increasing diversity and the fast-evolving pace of transformer models, designing a highly performant and efficient inference system is extremely challenging. In this paper, we present DeepSpeed Inference, a comprehensive system solution for transformer model inference to address the above-mentioned challenges. DeepSpeed Inference consists of (1) a multi-GPU inference solution to minimize latency while maximizing the throughput of both dense and sparse transformer models when they fit in aggregate GPU memory, and (2) a heterogeneous inference solution that leverages CPU and NVMe memory in addition to the GPU memory and compute to enable high inference throughput with large models which do not fit in aggregate GPU memory. DeepSpeed Inference reduces latency by up to 7.3X over the state-of-the-art for latency-oriented scenarios and increases throughput by over 1.5x for throughput-oriented scenarios. Moreover, it enables trillion parameter scale inference under real-time latency constraints by leveraging hundreds of GPUs, an unprecedented scale for inference. It can inference 25x larger models than with GPU-only solutions, while delivering a high throughput of 84 TFLOPS (over $50\%$ of A6000 peak). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.00032v1-abstract-full').style.display = 'none'; document.getElementById('2207.00032v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 June, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2104.07857">arXiv:2104.07857</a> <span> [<a href="https://arxiv.org/pdf/2104.07857">pdf</a>, <a href="https://arxiv.org/format/2104.07857">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Performance">cs.PF</span> </div> </div> <p class="title is-5 mathjax"> ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Rajbhandari%2C+S">Samyam Rajbhandari</a>, <a href="/search/cs?searchtype=author&query=Ruwase%2C+O">Olatunji Ruwase</a>, <a href="/search/cs?searchtype=author&query=Rasley%2C+J">Jeff Rasley</a>, <a href="/search/cs?searchtype=author&query=Smith%2C+S">Shaden Smith</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Yuxiong He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2104.07857v1-abstract-short" style="display: inline;"> In the last three years, the largest dense deep learning models have grown over 1000x to reach hundreds of billions of parameters, while the GPU memory has only grown by 5x (16 GB to 80 GB). Therefore, the growth in model scale has been supported primarily though system innovations that allow large models to fit in the aggregate GPU memory of multiple GPUs. However, we are getting close to the GPU… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.07857v1-abstract-full').style.display = 'inline'; document.getElementById('2104.07857v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2104.07857v1-abstract-full" style="display: none;"> In the last three years, the largest dense deep learning models have grown over 1000x to reach hundreds of billions of parameters, while the GPU memory has only grown by 5x (16 GB to 80 GB). Therefore, the growth in model scale has been supported primarily though system innovations that allow large models to fit in the aggregate GPU memory of multiple GPUs. However, we are getting close to the GPU memory wall. It requires 800 NVIDIA V100 GPUs just to fit a trillion parameter model for training, and such clusters are simply out of reach for most data scientists. In addition, training models at that scale requires complex combinations of parallelism techniques that puts a big burden on the data scientists to refactor their model. In this paper we present ZeRO-Infinity, a novel heterogeneous system technology that leverages GPU, CPU, and NVMe memory to allow for unprecedented model scale on limited resources without requiring model code refactoring. At the same time it achieves excellent training throughput and scalability, unencumbered by the limited CPU or NVMe bandwidth. ZeRO-Infinity can fit models with tens and even hundreds of trillions of parameters for training on current generation GPU clusters. It can be used to fine-tune trillion parameter models on a single NVIDIA DGX-2 node, making large models more accessible. In terms of training throughput and scalability, it sustains over 25 petaflops on 512 NVIDIA V100 GPUs(40% of peak), while also demonstrating super linear scalability. An open source implementation of ZeRO-Infinity is available through DeepSpeed, a deep learning optimization library that makes distributed training easy, efficient, and effective. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.07857v1-abstract-full').style.display = 'none'; document.getElementById('2104.07857v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 April, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2101.06840">arXiv:2101.06840</a> <span> [<a href="https://arxiv.org/pdf/2101.06840">pdf</a>, <a href="https://arxiv.org/format/2101.06840">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> ZeRO-Offload: Democratizing Billion-Scale Model Training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ren%2C+J">Jie Ren</a>, <a href="/search/cs?searchtype=author&query=Rajbhandari%2C+S">Samyam Rajbhandari</a>, <a href="/search/cs?searchtype=author&query=Aminabadi%2C+R+Y">Reza Yazdani Aminabadi</a>, <a href="/search/cs?searchtype=author&query=Ruwase%2C+O">Olatunji Ruwase</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+S">Shuangyan Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Minjia Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+D">Dong Li</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Yuxiong He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2101.06840v1-abstract-short" style="display: inline;"> Large-scale model training has been a playing ground for a limited few requiring complex model refactoring and access to prohibitively expensive GPU clusters. ZeRO-Offload changes the large model training landscape by making large model training accessible to nearly everyone. It can train models with over 13 billion parameters on a single GPU, a 10x increase in size compared to popular framework s… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2101.06840v1-abstract-full').style.display = 'inline'; document.getElementById('2101.06840v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2101.06840v1-abstract-full" style="display: none;"> Large-scale model training has been a playing ground for a limited few requiring complex model refactoring and access to prohibitively expensive GPU clusters. ZeRO-Offload changes the large model training landscape by making large model training accessible to nearly everyone. It can train models with over 13 billion parameters on a single GPU, a 10x increase in size compared to popular framework such as PyTorch, and it does so without requiring any model change from the data scientists or sacrificing computational efficiency. ZeRO-Offload enables large model training by offloading data and compute to CPU. To preserve compute efficiency, it is designed to minimize the data movement to/from GPU, and reduce CPU compute time while maximizing memory savings on GPU. As a result, ZeRO-Offload can achieve 40 TFlops/GPU on a single NVIDIA V100 GPU for 10B parameter model compared to 30TF using PyTorch alone for a 1.4B parameter model, the largest that can be trained without running out of memory. ZeRO-Offload is also designed to scale on multiple-GPUs when available, offering near linear speedup on up to 128 GPUs. Additionally, it can work together with model parallelism to train models with over 70 billion parameters on a single DGX-2 box, a 4.5x increase in model size compared to using model parallelism alone. By combining compute and memory efficiency with ease-of-use, ZeRO-Offload democratizes large-scale model training making it accessible to even data scientists with access to just a single GPU. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2101.06840v1-abstract-full').style.display = 'none'; document.getElementById('2101.06840v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 January, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1911.01258">arXiv:1911.01258</a> <span> [<a href="https://arxiv.org/pdf/1911.01258">pdf</a>, <a href="https://arxiv.org/format/1911.01258">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Performance">cs.PF</span> </div> </div> <p class="title is-5 mathjax"> SHARP: An Adaptable, Energy-Efficient Accelerator for Recurrent Neural Network </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yazdani%2C+R">Reza Yazdani</a>, <a href="/search/cs?searchtype=author&query=Ruwase%2C+O">Olatunji Ruwase</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Minjia Zhang</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Yuxiong He</a>, <a href="/search/cs?searchtype=author&query=Arnau%2C+J">Jose-Maria Arnau</a>, <a href="/search/cs?searchtype=author&query=Gonzalez%2C+A">Antonio Gonzalez</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1911.01258v3-abstract-short" style="display: inline;"> The effectiveness of Recurrent Neural Networks (RNNs) for tasks such as Automatic Speech Recognition has fostered interest in RNN inference acceleration. Due to the recurrent nature and data dependencies of RNN computations, prior work has designed customized architectures specifically tailored to the computation pattern of RNN, getting high computation efficiency for certain chosen model sizes. H… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1911.01258v3-abstract-full').style.display = 'inline'; document.getElementById('1911.01258v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1911.01258v3-abstract-full" style="display: none;"> The effectiveness of Recurrent Neural Networks (RNNs) for tasks such as Automatic Speech Recognition has fostered interest in RNN inference acceleration. Due to the recurrent nature and data dependencies of RNN computations, prior work has designed customized architectures specifically tailored to the computation pattern of RNN, getting high computation efficiency for certain chosen model sizes. However, given that the dimensionality of RNNs varies a lot for different tasks, it is crucial to generalize this efficiency to diverse configurations. In this work, we identify adaptiveness as a key feature that is missing from today's RNN accelerators. In particular, we first show the problem of low resource-utilization and low adaptiveness for the state-of-the-art RNN implementations on GPU, FPGA and ASIC architectures. To solve these issues, we propose an intelligent tiled-based dispatching mechanism for increasing the adaptiveness of RNN computation, in order to efficiently handle the data dependencies. To do so, we propose Sharp as a hardware accelerator, which pipelines RNN computation using an effective scheduling scheme to hide most of the dependent serialization. Furthermore, Sharp employs dynamic reconfigurable architecture to adapt to the model's characteristics. Sharp achieves 2x, 2.8x, and 82x speedups on average, considering different RNN models and resource budgets, compared to the state-of-the-art ASIC, FPGA, and GPU implementations, respectively. Furthermore, we provide significant energy-reduction with respect to the previous solutions, due to the low power dissipation of Sharp (321 GFLOPS/Watt). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1911.01258v3-abstract-full').style.display = 'none'; document.getElementById('1911.01258v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 November, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2019. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1910.02054">arXiv:1910.02054</a> <span> [<a href="https://arxiv.org/pdf/1910.02054">pdf</a>, <a href="https://arxiv.org/format/1910.02054">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> ZeRO: Memory Optimizations Toward Training Trillion Parameter Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Rajbhandari%2C+S">Samyam Rajbhandari</a>, <a href="/search/cs?searchtype=author&query=Rasley%2C+J">Jeff Rasley</a>, <a href="/search/cs?searchtype=author&query=Ruwase%2C+O">Olatunji Ruwase</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Yuxiong He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1910.02054v3-abstract-short" style="display: inline;"> Large deep learning models offer significant accuracy gains, but training billions to trillions of parameters is challenging. Existing solutions such as data and model parallelisms exhibit fundamental limitations to fit these models into limited device memory, while obtaining computation, communication and development efficiency. We develop a novel solution, Zero Redundancy Optimizer (ZeRO), to op… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1910.02054v3-abstract-full').style.display = 'inline'; document.getElementById('1910.02054v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1910.02054v3-abstract-full" style="display: none;"> Large deep learning models offer significant accuracy gains, but training billions to trillions of parameters is challenging. Existing solutions such as data and model parallelisms exhibit fundamental limitations to fit these models into limited device memory, while obtaining computation, communication and development efficiency. We develop a novel solution, Zero Redundancy Optimizer (ZeRO), to optimize memory, vastly improving training speed while increasing the model size that can be efficiently trained. ZeRO eliminates memory redundancies in data- and model-parallel training while retaining low communication volume and high computational granularity, allowing us to scale the model size proportional to the number of devices with sustained high efficiency. Our analysis on memory requirements and communication volume demonstrates: ZeRO has the potential to scale beyond 1 Trillion parameters using today's hardware. We implement and evaluate ZeRO: it trains large models of over 100B parameter with super-linear speedup on 400 GPUs, achieving throughput of 15 Petaflops. This represents an 8x increase in model size and 10x increase in achievable performance over state-of-the-art. In terms of usability, ZeRO can train large models of up to 13B parameters (e.g., larger than Megatron GPT 8.3B and T5 11B) without requiring model parallelism which is harder for scientists to apply. Last but not the least, researchers have used the system breakthroughs of ZeRO to create the world's largest language model (Turing-NLG, 17B parameters) with record breaking accuracy. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1910.02054v3-abstract-full').style.display = 'none'; document.getElementById('1910.02054v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 May, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 October, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2019. </p> </li> </ol> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>