breadcrumbs__item--active"><span class="breadcrumbs__link" itemprop="name">Distributed Training</span><meta itemprop="position" content="2"></li></ul></nav><div class="tocCollapsible_ETCw theme-doc-toc-mobile tocMobile_ITEo"><button type="button" class="clean-btn tocCollapsibleButton_TO0P">On this page</button></div><div class="theme-doc-markdown markdown"><h1>Distributed Training</h1><p>Author: Shenggui Li, Siqi Mai</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="what-is-a-distributed-system">What is a distributed system?<a href="#what-is-a-distributed-system" class="hash-link" aria-label="Direct link to heading" title="Direct link to heading">​</a></h2><figure style="text-align:center"><img loading="lazy" src="https://s2.loli.net/2022/01/28/sE5daHf2ohIy9wX.png" class="img_ev3q"><figcaption>Image source: <a href="https://towardsdatascience.com/distributed-training-in-the-cloud-cloud-machine-learning-engine-9e264ddde27f" target="_blank" rel="noopener noreferrer">Towards Data Science</a></figcaption></figure><p>A distributed system consists of multiple software components which run on multiple machines. For example, the traditional database runs on a single machine. As the amount of data gets incredibly large, a single machine can no longer deliver desirable performance to the business, especially in situations such as Black Friday where network traffic can be unexpectedly high. To handle such pressure, modern high-performance database is designed to run on multiple machines, and they work together to provide high throughput and low latency to the user.</p><p>One important evaluation metric for distributed system is scalability. For example, when we run an application on 4 machines, we naturally expect that the application can run 4 times faster. However, due to communication overhead and difference in hardware performance, it is difficult to achieve linear speedup. Thus, it is important to consider how to make the application faster when we implement it. Algorithms of good design and system optimization can help to deliver good performance. Sometimes, it is even possible to achieve linear and super-linear speedup.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="why-we-need-distributed-training-for-machine-learning">Why we need distributed training for machine learning?<a href="#why-we-need-distributed-training-for-machine-learning" class="hash-link" aria-label="Direct link to heading" title="Direct link to heading">​</a></h2><p>Back in 2012, <a href="https://arxiv.org/abs/1404.5997" target="_blank" rel="noopener noreferrer">AlexNet</a> won the champion of the ImageNet competition, and it was trained on two GTX 580 3GB GPUs. Today, most models that appear in the top AI conferences are trained on multiple GPUs. Distributed training is definitely a common practice when researchers and engineers develop AI models. There are several reasons behind this trend.</p><ol><li>Model size increases rapidly. 
<a href="https://arxiv.org/abs/1512.03385" target="_blank" rel="noopener noreferrer">ResNet50</a> has 20 million parameters in 2015, <a href="https://arxiv.org/abs/1810.04805" target="_blank" rel="noopener noreferrer">BERT-Large</a> has 345 million parameters in 2018, <a href="https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf" target="_blank" rel="noopener noreferrer">GPT-2</a> has 1.5 billion parameters in 2018, and <a href="https://arxiv.org/abs/2005.14165" target="_blank" rel="noopener noreferrer">GPT-3</a> has 175 billion parameters in 2020. It is obvious that the model size grows exponentially with time. The current largest model has exceeded more than 1000 billion parameters. Super large models generally deliver more superior performance compared to their smaller counterparts.<figure style="text-align:center"><img loading="lazy" src="https://s2.loli.net/2022/01/28/sCyreJ9PF1EdZYf.jpg" class="img_ev3q"><figcaption>Image source: <a href="https://huggingface.co/blog/large-language-models" target="_blank" rel="noopener noreferrer">HuggingFace</a></figcaption></figure></li></ol><ol start="2"><li>Dataset size increases rapidly. For most machine learning developers, MNIST and CIFAR10 datasets are often the first few datasets on which they train their models. However, these datasets are very small compared to well-known ImageNet datasets. Google even has its own (unpublished) JFT-300M dataset which has around 300 million images, and this is close to 300 times larger than the ImageNet-1k dataset.</li></ol><ol start="3"><li>Computing power gets stronger. With the advancement in the semiconductor industry, graphics cards become more and more powerful. Due to its larger number of cores, GPU is the most common compute platform for deep learning. From K10 GPU in 2012 to A100 GPU in 2020, the computing power has increased several hundred times. This allows us to performance compute-intensive tasks faster and deep learning is exactly such a task.</li></ol><p>Nowadays, the model can be too large to fit into a single GPU, and the dataset can be large enough to train for a hundred days on a single GPU. Only by training our models on multiple GPUs with different parallelization techniques, we are able to speed up the training process and obtain results in a reasonable amount of time.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="basic-concepts-in-distributed-training">Basic Concepts in Distributed Training<a href="#basic-concepts-in-distributed-training" class="hash-link" aria-label="Direct link to heading" title="Direct link to heading">​</a></h2><p>Distributed training requires multiple machines/GPUs. During training, there will be communication among these devices. To understand distributed training better, there are several important terms to be made clear.</p><ul><li>host: host is the main device in the communication network. It is often required as an argument when initializing the distributed environment.</li><li>port: port here mainly refers to master port on the host for communication.</li><li>rank: the unique ID given to a device in the network.</li><li>world size: the number of devices in the network.</li><li>process group: a process group is a communication network which include a subset of the devices. There is always a default process group which contains all the devices. 
![A distributed system example](https://s2.loli.net/2022/01/28/qnNBKh8AjzgM5sY.png)

*A distributed system example*

To illustrate these concepts, let's assume we have 2 machines (also called nodes), each with 4 GPUs. When we initialize the distributed environment over these two machines, we essentially launch 8 processes (4 on each machine), each bound to one GPU.

Before initializing the distributed environment, we need to specify the host (master address) and the port (master port). In this example, we can let the host be node 0 and the port be a number such as 29500. All 8 processes then look up this address and port and connect to one another, and the default process group is created. The default process group has a world size of 8, with the following details:

| Process ID | Rank | Node index | GPU index |
| --- | --- | --- | --- |
| 0 | 0 | 0 | 0 |
| 1 | 1 | 0 | 1 |
| 2 | 2 | 0 | 2 |
| 3 | 3 | 0 | 3 |
| 4 | 4 | 1 | 0 |
| 5 | 5 | 1 | 1 |
| 6 | 6 | 1 | 2 |
| 7 | 7 | 1 | 3 |

We can also create a new process group containing any subset of the processes. For example, a group containing only the even-numbered processes would look like this:

| Process ID | Rank | Node index | GPU index |
| --- | --- | --- | --- |
| 0 | 0 | 0 | 0 |
| 2 | 1 | 0 | 2 |
| 4 | 2 | 1 | 0 |
| 6 | 3 | 1 | 2 |

**Please note that rank is relative to the process group, so one process can have a different rank in different process groups. The maximum rank is always `world size of the process group - 1`.**
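As a concrete illustration, here is a minimal sketch of creating the even-rank subgroup from the table above, again using plain `torch.distributed` rather than a Colossal-AI-specific API:

```python
import torch.distributed as dist

# Assumes the default process group (world size 8) is already initialized.
# new_group() is collective: every process must call it, even non-members.
even_group = dist.new_group(ranks=[0, 2, 4, 6])

global_rank = dist.get_rank()  # rank in the default group: 0..7
if global_rank % 2 == 0:
    # Rank within the subgroup: 0..3. For example, global rank 6 has
    # group rank 3, which equals the subgroup's world size (4) minus 1.
    group_rank = dist.get_rank(group=even_group)
```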
Within a process group, the processes can communicate in two ways:

1. peer-to-peer: one process sends data to another process.
2. collective: a group of processes perform operations such as scatter, gather, all-reduce, and broadcast together.

![Collective communication](https://s2.loli.net/2022/01/28/zTmlxgc3oeAdn97.png)

*Collective communication, source: [PyTorch distributed tutorial](https://pytorch.org/tutorials/intermediate/dist_tuto.html)*
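To contrast the two styles, here is a minimal sketch using plain `torch.distributed`; it assumes the default process group is already initialized and that each process has already selected its own GPU:

```python
import torch
import torch.distributed as dist

# Assumes dist.init_process_group(...) has run and each process has
# called torch.cuda.set_device(...) for the GPU it owns.
rank = dist.get_rank()
tensor = torch.full((4,), float(rank), device="cuda")

# 1. Peer-to-peer: exactly two processes are involved.
if rank == 0:
    dist.send(tensor, dst=1)   # rank 0 sends its tensor to rank 1
elif rank == 1:
    dist.recv(tensor, src=0)   # rank 1 receives into its own buffer

# 2. Collective: every process in the group participates. Afterwards,
#    every rank holds the element-wise sum of the tensors from all ranks.
dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
```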
href="https://www.hpc-ai.tech/blog" target="_blank" rel="noopener noreferrer" class="footer__link-item">Blog</a></li><li class="footer__item"><a href="https://twitter.com/HPCAITech" target="_blank" rel="noopener noreferrer" class="footer__link-item">Twitter</a></li></ul></div><div class="col footer__col"><div class="footer__title">About</div><ul class="footer__items clean-list"><li class="footer__item"><a href="https://www.hpc-ai.tech/" target="_blank" rel="noopener noreferrer" class="footer__link-item">Company</a></li><li class="footer__item"><a href="https://www.hpc-ai.tech/services" target="_blank" rel="noopener noreferrer" class="footer__link-item">Services</a></li><li class="footer__item"><a href="https://www.hpc-ai.tech/customers" target="_blank" rel="noopener noreferrer" class="footer__link-item">Customers</a></li></ul></div></div><div class="footer__bottom text--center"><div class="footer__copyright">Copyright © 2024 All Rights Reserved by HPC-AI Technology Inc.</div></div></div></footer></div> <script src="/assets/js/runtime~main.6b0f350a.js"></script> <script src="/assets/js/main.c6c31bc7.js"></script> </body> </html>

Pages: 1 2 3 4 5 6 7 8 9 10