<!-- CINXE.COM
Data Platform/Systems/Airflow - Wikitech -->
<!DOCTYPE html> <html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-disabled skin-theme-clientpref-day vector-toc-available" lang="en" dir="ltr"> <head> <meta charset="UTF-8"> <title>Data Platform/Systems/Airflow - Wikitech</title> <script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-disabled skin-theme-clientpref-day vector-toc-available";var cookie=document.cookie.match(/(?:^|; )labswikimwclientpreferences=([^;]+)/);if(cookie){cookie[1].split('%2C').forEach(function(pref){className=className.replace(new RegExp('(^| )'+pref.replace(/-clientpref-\w+$|[^\w-]+/g,'')+'-clientpref-\\w+( |$)'),'$1'+pref+'$2');});}document.documentElement.className=className;}());RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat": 
"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"db756254-7ffc-458c-950a-2f6126792b9b","wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Data_Platform/Systems/Airflow","wgTitle":"Data Platform/Systems/Airflow","wgCurRevisionId":2244312,"wgRevisionId":2244312,"wgArticleId":447901,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Data platform","Data platform systems"],"wgPageViewLanguage":"en","wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgRelevantPageName":"Data_Platform/Systems/Airflow","wgRelevantArticleId":447901,"wgIsProbablyEditable":false,"wgRelevantPageIsProbablyEditable":false,"wgRestrictionEdit":[],"wgRestrictionMove":[],"wgNoticeProject":"wikitech","wgCiteReferencePreviewsActive":true,"wgMediaViewerOnClick":true,"wgMediaViewerEnabledByDefault":true, "wgVisualEditor":{"pageLanguageCode":"en","pageLanguageDir":"ltr","pageVariantFallbacks":"en"},"wgMFDisplayWikibaseDescriptions":{"search":true,"watchlist":true,"tagline":false,"nearby":true},"wgWMESchemaEditAttemptStepOversample":false,"wgWMEPageLength":20000,"wgCentralAuthMobileDomain":false,"wgEditSubmitButtonLabelPublish":true,"wgDiscussionToolsFeaturesEnabled":{"replytool":true,"newtopictool":true,"sourcemodetoolbar":true,"topicsubscription":false,"autotopicsub":false,"visualenhancements":false,"visualenhancements_reply":false,"visualenhancements_pageframe":false},"wgDiscussionToolsFallbackEditMode":"visual","wgULSPosition":"personal","wgULSisCompactLinksEnabled":false,"wgVector2022LanguageInHeader":true,"wgULSisLanguageSelectorEmpty":false,"wgCheckUserClientHintsHeadersJsApi":["brands","architecture","bitness","fullVersionList","mobile","model","platform","platformVersion"],"wgSiteNoticeId":"2.0"};RLSTATE={"ext.globalCssJs.user.styles":"ready","site.styles":"ready","user.styles":
 "ready","ext.globalCssJs.user":"ready","user":"ready","user.options":"loading","ext.inputBox.styles":"ready","ext.pygments":"ready","ext.discussionTools.init.styles":"ready","oojs-ui-core.styles":"ready","oojs-ui.styles.indicators":"ready","mediawiki.widgets.styles":"ready","oojs-ui-core.icons":"ready","skins.vector.search.codex.styles":"ready","skins.vector.styles":"ready","skins.vector.icons":"ready","ext.wikimediamessages.styles":"ready","ext.visualEditor.desktopArticleTarget.noscript":"ready","ext.uls.pt":"ready","ext.dismissableSiteNotice.styles":"ready"};RLPAGEMODULES=["ext.pygments.view","mediawiki.page.media","site","mediawiki.page.ready","mediawiki.toc","skins.vector.js","ext.centralNotice.geoIP","ext.centralNotice.startUp","ext.gadget.site","ext.urlShortener.toolbar","ext.centralauth.centralautologin","mmv.bootstrap","ext.visualEditor.desktopArticleTarget.init","ext.visualEditor.targetLoader","ext.echo.centralauth","ext.discussionTools.init","ext.eventLogging", "ext.wikimediaEvents","ext.uls.interface","ext.checkUser.clientHints","ext.dismissableSiteNotice"];</script> <script>(RLQ=window.RLQ||[]).push(function(){mw.loader.impl(function(){return["user.options@12s5i",function($,jQuery,require,module){mw.user.tokens.set({"patrolToken":"+\\","watchToken":"+\\","csrfToken":"+\\"}); }];});});</script> <link rel="stylesheet" href="/w/load.php?lang=en&modules=ext.discussionTools.init.styles%7Cext.dismissableSiteNotice.styles%7Cext.inputBox.styles%7Cext.pygments%7Cext.uls.pt%7Cext.visualEditor.desktopArticleTarget.noscript%7Cext.wikimediamessages.styles%7Cmediawiki.widgets.styles%7Coojs-ui-core.icons%2Cstyles%7Coojs-ui.styles.indicators%7Cskins.vector.icons%2Cstyles%7Cskins.vector.search.codex.styles&only=styles&skin=vector-2022"> <script async="" src="/w/load.php?lang=en&modules=startup&only=scripts&raw=1&skin=vector-2022"></script> <meta name="ResourceLoaderDynamicStyles" content=""> <link rel="stylesheet" 
href="/w/load.php?lang=en&modules=site.styles&only=styles&skin=vector-2022"> <meta name="generator" content="MediaWiki 1.44.0-wmf.4"> <meta name="referrer" content="origin"> <meta name="referrer" content="origin-when-cross-origin"> <meta name="robots" content="max-image-preview:standard"> <meta name="format-detection" content="telephone=no"> <meta name="viewport" content="width=1120"> <meta property="og:title" content="Data Platform/Systems/Airflow - Wikitech"> <meta property="og:type" content="website"> <link rel="preconnect" href="//upload.wikimedia.org"> <link rel="icon" href="/static/favicon/wikitech.ico"> <link rel="search" type="application/opensearchdescription+xml" href="/w/rest.php/v1/search" title="Wikitech (en)"> <link rel="EditURI" type="application/rsd+xml" href="//wikitech.wikimedia.org/w/api.php?action=rsd"> <link rel="canonical" href="https://wikitech.wikimedia.org/wiki/Data_Platform/Systems/Airflow"> <link rel="license" href="https://creativecommons.org/licenses/by-sa/4.0/"> <link rel="alternate" type="application/atom+xml" title="Wikitech Atom feed" href="/w/index.php?title=Special:RecentChanges&feed=atom"> <link rel="dns-prefetch" href="//meta.wikimedia.org" /> <link rel="dns-prefetch" href="//login.wikimedia.org"> </head> <body class="ext-discussiontools-replytool-enabled ext-discussiontools-newtopictool-enabled ext-discussiontools-sourcemodetoolbar-enabled skin--responsive skin-vector skin-vector-search-vue mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-subject page-Data_Platform_Systems_Airflow rootpage-Data_Platform skin-vector-2022 action-view"><a class="mw-jump-link" href="#bodyContent">Jump to content</a> <div class="vector-header-container"> <header class="vector-header mw-header"> <div class="vector-header-start"> <nav class="vector-main-menu-landmark" aria-label="Site"> <div id="vector-main-menu-dropdown" class="vector-dropdown vector-main-menu-dropdown vector-button-flush-left vector-button-flush-right" > <input type="checkbox" 
id="vector-main-menu-dropdown-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-main-menu-dropdown" class="vector-dropdown-checkbox " aria-label="Main menu" > <label id="vector-main-menu-dropdown-label" for="vector-main-menu-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only " aria-hidden="true" ><span class="vector-icon mw-ui-icon-menu mw-ui-icon-wikimedia-menu"></span> <span class="vector-dropdown-label-text">Main menu</span> </label> <div class="vector-dropdown-content"> <div id="vector-main-menu-unpinned-container" class="vector-unpinned-container"> <div id="vector-main-menu" class="vector-main-menu vector-pinnable-element"> <div class="vector-pinnable-header vector-main-menu-pinnable-header vector-pinnable-header-unpinned" data-feature-name="main-menu-pinned" data-pinnable-element-id="vector-main-menu" data-pinned-container-id="vector-main-menu-pinned-container" data-unpinned-container-id="vector-main-menu-unpinned-container" > <div class="vector-pinnable-header-label">Main menu</div> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-pin-button" data-event-name="pinnable-header.vector-main-menu.pin">move to sidebar</button> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" data-event-name="pinnable-header.vector-main-menu.unpin">hide</button> </div> <div id="p-navigation" class="vector-menu mw-portlet mw-portlet-navigation" > <div class="vector-menu-heading"> Navigation </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="n-mainpage-description" class="mw-list-item"><a href="/wiki/Main_Page" title="Visit the main page [z]" accesskey="z"><span>Main page</span></a></li><li id="n-recentchanges" class="mw-list-item"><a href="/wiki/Special:RecentChanges" title="A list of recent changes in the wiki [r]" accesskey="r"><span>Recent 
changes</span></a></li><li id="n-Server-admin-log:-Prod" class="mw-list-item"><a href="/wiki/Server_Admin_Log"><span>Server admin log: Prod</span></a></li><li id="n-Admin-log:-RelEng" class="mw-list-item"><a href="/wiki/Release_Engineering/SAL"><span>Admin log: RelEng</span></a></li><li id="n-Incident-status" class="mw-list-item"><a href="/wiki/Incident_status"><span>Incident status</span></a></li><li id="n-Deployments" class="mw-list-item"><a href="/wiki/Deployments"><span>Deployments</span></a></li><li id="n-SRE-Team-Help" class="mw-list-item"><a href="/wiki/SRE/SRE_Team_requests"><span>SRE Team Help</span></a></li> </ul> </div> </div> <div id="p-Cloud_VPS_&_Toolforge" class="vector-menu mw-portlet mw-portlet-Cloud_VPS_Toolforge" > <div class="vector-menu-heading"> Cloud VPS & Toolforge </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="n-Cloud-VPS-portal" class="mw-list-item"><a href="/wiki/Portal:Cloud_VPS"><span>Cloud VPS portal</span></a></li><li id="n-Toolforge-portal" class="mw-list-item"><a href="/wiki/Portal:Toolforge"><span>Toolforge portal</span></a></li><li id="n-Request-VPS-project" class="mw-list-item"><a href="https://phabricator.wikimedia.org/project/view/2875/"><span>Request VPS project</span></a></li><li id="n-Admin-log:-Cloud-VPS" class="mw-list-item"><a href="/wiki/Cloud_VPS_Server_Admin_Log"><span>Admin log: Cloud VPS</span></a></li> </ul> </div> </div> </div> </div> </div> </div> </nav> <a href="/wiki/Main_Page" class="mw-logo"> <img class="mw-logo-icon" src="/static/images/icons/wikitech.svg" alt="" aria-hidden="true" height="50" width="50"> <span class="mw-logo-container skin-invert"> <img class="mw-logo-wordmark" alt="Wikitech" src="/static/images/mobile/copyright/wikitech-wordmark.svg" style="width: 8.75em; height: 1.6875em;"> </span> </a> </div> <div class="vector-header-end"> <div id="p-search" role="search" class="vector-search-box-vue vector-search-box-collapses vector-search-box-show-thumbnail 
vector-search-box-auto-expand-width vector-search-box"> <a href="/wiki/Special:Search" class="cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only search-toggle" title="Search Wikitech [f]" accesskey="f"><span class="vector-icon mw-ui-icon-search mw-ui-icon-wikimedia-search"></span> <span>Search</span> </a> <div class="vector-typeahead-search-container"> <div class="cdx-typeahead-search cdx-typeahead-search--show-thumbnail cdx-typeahead-search--auto-expand-width"> <form action="/w/index.php" id="searchform" class="cdx-search-input cdx-search-input--has-end-button"> <div id="simpleSearch" class="cdx-search-input__input-wrapper" data-search-loc="header-moved"> <div class="cdx-text-input cdx-text-input--has-start-icon"> <input class="cdx-text-input__input" type="search" name="search" placeholder="Search Wikitech" aria-label="Search Wikitech" autocapitalize="sentences" title="Search Wikitech [f]" accesskey="f" id="searchInput" > <span class="cdx-text-input__icon cdx-text-input__start-icon"></span> </div> <input type="hidden" name="title" value="Special:Search"> </div> <button class="cdx-button cdx-search-input__end-button">Search</button> </form> </div> </div> </div> <nav class="vector-user-links vector-user-links-wide" aria-label="Personal tools"> <div class="vector-user-links-main"> <div id="p-vector-user-menu-preferences" class="vector-menu mw-portlet" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="ca-uls" class="mw-list-item active user-links-collapsible-item"><a data-mw="interface" href="#" class="uls-trigger cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet"><span class="vector-icon mw-ui-icon-wikimedia-language mw-ui-icon-wikimedia-wikimedia-language"></span> <span>English</span></a> </li> </ul> </div> </div> <div id="p-vector-user-menu-userpage" class="vector-menu mw-portlet emptyPortlet" > <div class="vector-menu-content"> 
<ul class="vector-menu-content-list"> </ul> </div> </div> <nav class="vector-appearance-landmark" aria-label="Appearance"> <div id="vector-appearance-dropdown" class="vector-dropdown " title="Change the appearance of the page's font size, width, and color" > <input type="checkbox" id="vector-appearance-dropdown-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-appearance-dropdown" class="vector-dropdown-checkbox " aria-label="Appearance" > <label id="vector-appearance-dropdown-label" for="vector-appearance-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only " aria-hidden="true" ><span class="vector-icon mw-ui-icon-appearance mw-ui-icon-wikimedia-appearance"></span> <span class="vector-dropdown-label-text">Appearance</span> </label> <div class="vector-dropdown-content"> <div id="vector-appearance-unpinned-container" class="vector-unpinned-container"> </div> </div> </div> </nav> <div id="p-vector-user-menu-notifications" class="vector-menu mw-portlet emptyPortlet" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> </ul> </div> </div> <div id="p-vector-user-menu-overflow" class="vector-menu mw-portlet" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="pt-sitesupport-2" class="user-links-collapsible-item mw-list-item user-links-collapsible-item"><a data-mw="interface" href="https://donate.wikimedia.org/?utm_source=donate&utm_medium=sidebar&utm_campaign=spontaneous&uselang=en" class=""><span>Donate</span></a> </li> <li id="pt-login-2" class="user-links-collapsible-item mw-list-item user-links-collapsible-item"><a data-mw="interface" href="/w/index.php?title=Special:UserLogin&returnto=Data+Platform%2FSystems%2FAirflow" title="You are encouraged to log in; however, it is not mandatory [o]" accesskey="o" class=""><span>Log in</span></a> </li> </ul> </div> </div> </div> <div 
id="vector-user-links-dropdown" class="vector-dropdown vector-user-menu vector-button-flush-right vector-user-menu-logged-out user-links-collapsible-item" title="More options" > <input type="checkbox" id="vector-user-links-dropdown-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-user-links-dropdown" class="vector-dropdown-checkbox " aria-label="Personal tools" > <label id="vector-user-links-dropdown-label" for="vector-user-links-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only " aria-hidden="true" ><span class="vector-icon mw-ui-icon-ellipsis mw-ui-icon-wikimedia-ellipsis"></span> <span class="vector-dropdown-label-text">Personal tools</span> </label> <div class="vector-dropdown-content"> <div id="p-personal" class="vector-menu mw-portlet mw-portlet-personal user-links-collapsible-item" title="User menu" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="pt-sitesupport" class="user-links-collapsible-item mw-list-item"><a href="https://donate.wikimedia.org/?utm_source=donate&utm_medium=sidebar&utm_campaign=spontaneous&uselang=en"><span>Donate</span></a></li><li id="pt-login" class="user-links-collapsible-item mw-list-item"><a href="/w/index.php?title=Special:UserLogin&returnto=Data+Platform%2FSystems%2FAirflow" title="You are encouraged to log in; however, it is not mandatory [o]" accesskey="o"><span class="vector-icon mw-ui-icon-logIn mw-ui-icon-wikimedia-logIn"></span> <span>Log in</span></a></li> </ul> </div> </div> </div> </div> </nav> </div> </header> </div> <div class="mw-page-container"> <div class="mw-page-container-inner"> <div class="vector-sitenotice-container"> <div id="siteNotice"><div id="mw-dismissablenotice-anonplace"></div><script>(function(){var node=document.getElementById("mw-dismissablenotice-anonplace");if(node){node.outerHTML="\u003Cdiv 
class=\"mw-dismissable-notice\"\u003E\u003Cdiv class=\"mw-dismissable-notice-close\"\u003E[\u003Ca tabindex=\"0\" role=\"button\"\u003Edismiss\u003C/a\u003E]\u003C/div\u003E\u003Cdiv class=\"mw-dismissable-notice-body\"\u003E\u003C!-- CentralNotice --\u003E\u003Cdiv id=\"localNotice\" data-nosnippet=\"\"\u003E\u003Cdiv class=\"sitenotice\" lang=\"en\" dir=\"ltr\"\u003E\u003Ctable style=\"width: 75%; background-color: var(--background-color-warning-subtle, #fdf2d5); border: var(--border-subtle, 1px solid #987027); color: var(--color-base, #202122); border-radius: 10px; padding: 5px; margin: 0 auto;\"\u003E\n\u003Ctbody\u003E\u003Ctr\u003E\n\u003Ctd style=\"width:40px; height:40px; text-align:center; vertical-align:middle; padding: 2px;\"\u003E\u003Cspan typeof=\"mw:File\"\u003E\u003Ca href=\"/wiki/File:OOjs_UI_icon_alert-warning.svg\" class=\"mw-file-description\"\u003E\u003Cimg src=\"//upload.wikimedia.org/wikipedia/commons/thumb/3/3b/OOjs_UI_icon_alert-warning.svg/30px-OOjs_UI_icon_alert-warning.svg.png\" decoding=\"async\" width=\"30\" height=\"30\" class=\"mw-file-element\" srcset=\"//upload.wikimedia.org/wikipedia/commons/thumb/3/3b/OOjs_UI_icon_alert-warning.svg/45px-OOjs_UI_icon_alert-warning.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/3/3b/OOjs_UI_icon_alert-warning.svg/60px-OOjs_UI_icon_alert-warning.svg.png 2x\" data-file-width=\"20\" data-file-height=\"20\" /\u003E\u003C/a\u003E\u003C/span\u003E\n\u003C/td\u003E\n\u003Ctd style=\"text-align:center; vertical-align: middle; padding: 4px; max-height: 60px;\"\u003E\u003Cb\u003EWe are migrating Wikitech to \u003Ca href=\"/wiki/Wikitech/SUL-migration\" title=\"Wikitech/SUL-migration\"\u003ESUL\u003C/a\u003E!\u003C/b\u003E\n\u003Cp\u003E\u003Cb\u003EAction may be required for your \u003Ca href=\"/wiki/Wikitech/SUL-migration#What_You_Should_Do\" title=\"Wikitech/SUL-migration\"\u003E account\u003C/a\u003E!\u003C/b\u003E\n\u003C/p\u003E\u003Cp\u003E\u003Cb\u003ETrouble logging in? 
Please visit \u003Ca href=\"https://phabricator.wikimedia.org/T376267\" class=\"extiw\" title=\"phab:T376267\"\u003ET376267\u003C/a\u003E\u003C/b\u003E\n\u003C/p\u003E\n\u003C/td\u003E\u003C/tr\u003E\u003C/tbody\u003E\u003C/table\u003E\u003C/div\u003E\u003C/div\u003E\u003C/div\u003E\u003C/div\u003E";}}());</script></div> </div> <div class="vector-column-start"> <div class="vector-main-menu-container"> <div id="mw-navigation"> <nav id="mw-panel" class="vector-main-menu-landmark" aria-label="Site"> <div id="vector-main-menu-pinned-container" class="vector-pinned-container"> </div> </nav> </div> </div> <div class="vector-sticky-pinned-container"> <nav id="mw-panel-toc" aria-label="Contents" data-event-name="ui.sidebar-toc" class="mw-table-of-contents-container vector-toc-landmark"> <div id="vector-toc-pinned-container" class="vector-pinned-container"> <div id="vector-toc" class="vector-toc vector-pinnable-element"> <div class="vector-pinnable-header vector-toc-pinnable-header vector-pinnable-header-pinned" data-feature-name="toc-pinned" data-pinnable-element-id="vector-toc" > <h2 class="vector-pinnable-header-label">Contents</h2> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-pin-button" data-event-name="pinnable-header.vector-toc.pin">move to sidebar</button> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" data-event-name="pinnable-header.vector-toc.unpin">hide</button> </div> <ul class="vector-toc-contents" id="mw-panel-toc-list"> <li id="toc-mw-content-text" class="vector-toc-list-item vector-toc-level-1"> <a href="#" class="vector-toc-link"> <div class="vector-toc-text">Beginning</div> </a> </li> <li id="toc-Airflow_setup_and_conventions" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Airflow_setup_and_conventions"> <div class="vector-toc-text"> <span class="vector-toc-numb">1</span> <span>Airflow setup and conventions</span> 
</div> </a> <button aria-controls="toc-Airflow_setup_and_conventions-sublist" class="cdx-button cdx-button--weight-quiet cdx-button--icon-only vector-toc-toggle"> <span class="vector-icon mw-ui-icon-wikimedia-expand"></span> <span>Toggle Airflow setup and conventions subsection</span> </button> <ul id="toc-Airflow_setup_and_conventions-sublist" class="vector-toc-list"> <li id="toc-Authentication" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Authentication"> <div class="vector-toc-text"> <span class="vector-toc-numb">1.1</span> <span>Authentication</span> </div> </a> <ul id="toc-Authentication-sublist" class="vector-toc-list"> <li id="toc-Host-based_instances" class="vector-toc-list-item vector-toc-level-3"> <a class="vector-toc-link" href="#Host-based_instances"> <div class="vector-toc-text"> <span class="vector-toc-numb">1.1.1</span> <span>Host-based instances</span> </div> </a> <ul id="toc-Host-based_instances-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Kubernetes_instances" class="vector-toc-list-item vector-toc-level-3"> <a class="vector-toc-link" href="#Kubernetes_instances"> <div class="vector-toc-text"> <span class="vector-toc-numb">1.1.2</span> <span>Kubernetes instances</span> </div> </a> <ul id="toc-Kubernetes_instances-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> <li id="toc-Metadata_Database" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Metadata_Database"> <div class="vector-toc-text"> <span class="vector-toc-numb">1.2</span> <span>Metadata Database</span> </div> </a> <ul id="toc-Metadata_Database-sublist" class="vector-toc-list"> <li id="toc-Host-based_instances_2" class="vector-toc-list-item vector-toc-level-3"> <a class="vector-toc-link" href="#Host-based_instances_2"> <div class="vector-toc-text"> <span class="vector-toc-numb">1.2.1</span> <span>Host-based instances</span> </div> </a> <ul id="toc-Host-based_instances_2-sublist" 
class="vector-toc-list"> </ul> </li> <li id="toc-Kubernetes_instances_2" class="vector-toc-list-item vector-toc-level-3"> <a class="vector-toc-link" href="#Kubernetes_instances_2"> <div class="vector-toc-text"> <span class="vector-toc-numb">1.2.2</span> <span>Kubernetes instances</span> </div> </a> <ul id="toc-Kubernetes_instances_2-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> <li id="toc-Airflow_DAGs" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Airflow_DAGs"> <div class="vector-toc-text"> <span class="vector-toc-numb">1.3</span> <span>Airflow DAGs</span> </div> </a> <ul id="toc-Airflow_DAGs-sublist" class="vector-toc-list"> <li id="toc-Host-based_DAG_deployment" class="vector-toc-list-item vector-toc-level-3"> <a class="vector-toc-link" href="#Host-based_DAG_deployment"> <div class="vector-toc-text"> <span class="vector-toc-numb">1.3.1</span> <span>Host-based DAG deployment</span> </div> </a> <ul id="toc-Host-based_DAG_deployment-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Kubernetes_based_DAG_deployment" class="vector-toc-list-item vector-toc-level-3"> <a class="vector-toc-link" href="#Kubernetes_based_DAG_deployment"> <div class="vector-toc-text"> <span class="vector-toc-numb">1.3.2</span> <span>Kubernetes based DAG deployment</span> </div> </a> <ul id="toc-Kubernetes_based_DAG_deployment-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> <li id="toc-Skein" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Skein"> <div class="vector-toc-text"> <span class="vector-toc-numb">1.4</span> <span>Skein</span> </div> </a> <ul id="toc-Skein-sublist" class="vector-toc-list"> <li id="toc-I'm_getting_paged_for_a_Skein_certificate_about_to_expire" class="vector-toc-list-item vector-toc-level-3"> <a class="vector-toc-link" href="#I'm_getting_paged_for_a_Skein_certificate_about_to_expire"> <div class="vector-toc-text"> <span class="vector-toc-numb">1.4.1</span> <span>I'm 
getting paged for a Skein certificate about to expire</span> </div> </a> <ul id="toc-I'm_getting_paged_for_a_Skein_certificate_about_to_expire-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> </ul> </li> <li id="toc-See_also" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#See_also"> <div class="vector-toc-text"> <span class="vector-toc-numb">2</span> <span>See also</span> </div> </a> <ul id="toc-See_also-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Airflow_Instances" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Airflow_Instances"> <div class="vector-toc-text"> <span class="vector-toc-numb">3</span> <span>Airflow Instances</span> </div> </a> <ul id="toc-Airflow_Instances-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Airflow_on_Kubernetes" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Airflow_on_Kubernetes"> <div class="vector-toc-text"> <span class="vector-toc-numb">4</span> <span>Airflow on Kubernetes</span> </div> </a> <ul id="toc-Airflow_on_Kubernetes-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Airflow_Upgrades" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Airflow_Upgrades"> <div class="vector-toc-text"> <span class="vector-toc-numb">5</span> <span>Airflow Upgrades</span> </div> </a> <ul id="toc-Airflow_Upgrades-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Administration" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Administration"> <div class="vector-toc-text"> <span class="vector-toc-numb">6</span> <span>Administration</span> </div> </a> <button aria-controls="toc-Administration-sublist" class="cdx-button cdx-button--weight-quiet cdx-button--icon-only vector-toc-toggle"> 
<span class="vector-icon mw-ui-icon-wikimedia-expand"></span> <span>Toggle Administration subsection</span> </button> <ul id="toc-Administration-sublist" class="vector-toc-list"> <li id="toc-Overview_of_Data_Engineering's_Airflow_deployments" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Overview_of_Data_Engineering's_Airflow_deployments"> <div class="vector-toc-text"> <span class="vector-toc-numb">6.1</span> <span>Overview of Data Engineering's Airflow deployments</span> </div> </a> <ul id="toc-Overview_of_Data_Engineering's_Airflow_deployments-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Creating_a_new_Airflow_Instance" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Creating_a_new_Airflow_Instance"> <div class="vector-toc-text"> <span class="vector-toc-numb">6.2</span> <span>Creating a new Airflow Instance</span> </div> </a> <ul id="toc-Creating_a_new_Airflow_Instance-sublist" class="vector-toc-list"> <li id="toc-Prepare_airflow-dags_for_deployment_to_the_new_instance" class="vector-toc-list-item vector-toc-level-3"> <a class="vector-toc-link" href="#Prepare_airflow-dags_for_deployment_to_the_new_instance"> <div class="vector-toc-text"> <span class="vector-toc-numb">6.2.1</span> <span>Prepare airflow-dags for deployment to the new instance</span> </div> </a> <ul id="toc-Prepare_airflow-dags_for_deployment_to_the_new_instance-sublist" class="vector-toc-list"> <li id="toc-Create_the_instance_specific_dags_folder" class="vector-toc-list-item vector-toc-level-4"> <a class="vector-toc-link" href="#Create_the_instance_specific_dags_folder"> <div class="vector-toc-text"> <span class="vector-toc-numb">6.2.1.1</span> <span>Create the instance specific dags folder</span> </div> </a> <ul id="toc-Create_the_instance_specific_dags_folder-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Create_the_instance_specific_scap_repository" class="vector-toc-list-item vector-toc-level-4"> <a 
class="vector-toc-link" href="#Create_the_instance_specific_scap_repository"> <div class="vector-toc-text"> <span class="vector-toc-numb">6.2.1.2</span> <span>Create the instance specific scap repository</span> </div> </a> <ul id="toc-Create_the_instance_specific_scap_repository-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Create_a_scap_deployment_source" class="vector-toc-list-item vector-toc-level-4"> <a class="vector-toc-link" href="#Create_a_scap_deployment_source"> <div class="vector-toc-text"> <span class="vector-toc-numb">6.2.1.3</span> <span>Create a scap deployment source</span> </div> </a> <ul id="toc-Create_a_scap_deployment_source-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> <li id="toc-Create_the_Airflow_PostgreSQL_Database" class="vector-toc-list-item vector-toc-level-3"> <a class="vector-toc-link" href="#Create_the_Airflow_PostgreSQL_Database"> <div class="vector-toc-text"> <span class="vector-toc-numb">6.2.2</span> <span>Create the Airflow PostgreSQL Database</span> </div> </a> <ul id="toc-Create_the_Airflow_PostgreSQL_Database-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Configure_the_Airflow_instance_in_Puppet" class="vector-toc-list-item vector-toc-level-3"> <a class="vector-toc-link" href="#Configure_the_Airflow_instance_in_Puppet"> <div class="vector-toc-text"> <span class="vector-toc-numb">6.2.3</span> <span>Configure the Airflow instance in Puppet</span> </div> </a> <ul id="toc-Configure_the_Airflow_instance_in_Puppet-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Add_service_user_to_the_Yarn_production_queue" class="vector-toc-list-item vector-toc-level-3"> <a class="vector-toc-link" href="#Add_service_user_to_the_Yarn_production_queue"> <div class="vector-toc-text"> <span class="vector-toc-numb">6.2.4</span> <span>Add service user to the Yarn production queue</span> </div> </a> <ul id="toc-Add_service_user_to_the_Yarn_production_queue-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> 
</ul> </li> <li id="toc-Incident_reports_&_known_issues" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Incident_reports_&_known_issues"> <div class="vector-toc-text"> <span class="vector-toc-numb">7</span> <span>Incident reports & known issues</span> </div> </a> <ul id="toc-Incident_reports_&_known_issues-sublist" class="vector-toc-list"> </ul> </li> </ul> </div> </div> </nav> </div> </div> <div class="mw-content-container"> <main id="content" class="mw-body"> <header class="mw-body-header vector-page-titlebar"> <nav aria-label="Contents" class="vector-toc-landmark"> <div id="vector-page-titlebar-toc" class="vector-dropdown vector-page-titlebar-toc vector-button-flush-left" > <input type="checkbox" id="vector-page-titlebar-toc-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-page-titlebar-toc" class="vector-dropdown-checkbox " aria-label="Toggle the table of contents" > <label id="vector-page-titlebar-toc-label" for="vector-page-titlebar-toc-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only " aria-hidden="true" ><span class="vector-icon mw-ui-icon-listBullet mw-ui-icon-wikimedia-listBullet"></span> <span class="vector-dropdown-label-text">Toggle the table of contents</span> </label> <div class="vector-dropdown-content"> <div id="vector-page-titlebar-toc-unpinned-container" class="vector-unpinned-container"> </div> </div> </div> </nav> <h1 id="firstHeading" class="firstHeading mw-first-heading"><span class="mw-page-title-main">Data Platform/Systems/Airflow</span></h1> </header> <div class="vector-page-toolbar"> <div class="vector-page-toolbar-container"> <div id="left-navigation"> <nav aria-label="Namespaces"> <div id="p-associated-pages" class="vector-menu vector-menu-tabs mw-portlet mw-portlet-associated-pages" > <div class="vector-menu-content"> <ul 
class="vector-menu-content-list"> <li id="ca-nstab-main" class="selected vector-tab-noicon mw-list-item"><a href="/wiki/Data_Platform/Systems/Airflow" title="View the content page [c]" accesskey="c"><span>Page</span></a></li><li id="ca-talk" class="new vector-tab-noicon mw-list-item"><a href="/w/index.php?title=Talk:Data_Platform/Systems/Airflow&action=edit&redlink=1" rel="discussion" class="new" title="Discussion about the content page (page does not exist) [t]" accesskey="t"><span>Discussion</span></a></li> </ul> </div> </div> <div id="vector-variants-dropdown" class="vector-dropdown emptyPortlet" > <input type="checkbox" id="vector-variants-dropdown-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-variants-dropdown" class="vector-dropdown-checkbox " aria-label="Change language variant" > <label id="vector-variants-dropdown-label" for="vector-variants-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet" aria-hidden="true" ><span class="vector-dropdown-label-text">English</span> </label> <div class="vector-dropdown-content"> <div id="p-variants" class="vector-menu mw-portlet mw-portlet-variants emptyPortlet" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> </ul> </div> </div> </div> </div> </nav> </div> <div id="right-navigation" class="vector-collapsible"> <nav aria-label="Views"> <div id="p-views" class="vector-menu vector-menu-tabs mw-portlet mw-portlet-views" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="ca-view" class="selected vector-tab-noicon mw-list-item"><a href="/wiki/Data_Platform/Systems/Airflow"><span>Read</span></a></li><li id="ca-viewsource" class="vector-tab-noicon mw-list-item"><a href="/w/index.php?title=Data_Platform/Systems/Airflow&action=edit" title="This page is protected. 
You can view its source [e]" accesskey="e"><span>View source</span></a></li><li id="ca-history" class="vector-tab-noicon mw-list-item"><a href="/w/index.php?title=Data_Platform/Systems/Airflow&action=history" title="Past revisions of this page [h]" accesskey="h"><span>View history</span></a></li> </ul> </div> </div> </nav> <nav class="vector-page-tools-landmark" aria-label="Page tools"> <div id="vector-page-tools-dropdown" class="vector-dropdown vector-page-tools-dropdown" > <input type="checkbox" id="vector-page-tools-dropdown-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-page-tools-dropdown" class="vector-dropdown-checkbox " aria-label="Tools" > <label id="vector-page-tools-dropdown-label" for="vector-page-tools-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet" aria-hidden="true" ><span class="vector-dropdown-label-text">Tools</span> </label> <div class="vector-dropdown-content"> <div id="vector-page-tools-unpinned-container" class="vector-unpinned-container"> <div id="vector-page-tools" class="vector-page-tools vector-pinnable-element"> <div class="vector-pinnable-header vector-page-tools-pinnable-header vector-pinnable-header-unpinned" data-feature-name="page-tools-pinned" data-pinnable-element-id="vector-page-tools" data-pinned-container-id="vector-page-tools-pinned-container" data-unpinned-container-id="vector-page-tools-unpinned-container" > <div class="vector-pinnable-header-label">Tools</div> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-pin-button" data-event-name="pinnable-header.vector-page-tools.pin">move to sidebar</button> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" data-event-name="pinnable-header.vector-page-tools.unpin">hide</button> </div> <div id="p-cactions" class="vector-menu mw-portlet mw-portlet-cactions emptyPortlet 
vector-has-collapsible-items" title="More options" > <div class="vector-menu-heading"> Actions </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="ca-more-view" class="selected vector-more-collapsible-item mw-list-item"><a href="/wiki/Data_Platform/Systems/Airflow"><span>Read</span></a></li><li id="ca-more-viewsource" class="vector-more-collapsible-item mw-list-item"><a href="/w/index.php?title=Data_Platform/Systems/Airflow&action=edit"><span>View source</span></a></li><li id="ca-more-history" class="vector-more-collapsible-item mw-list-item"><a href="/w/index.php?title=Data_Platform/Systems/Airflow&action=history"><span>View history</span></a></li> </ul> </div> </div> <div id="p-tb" class="vector-menu mw-portlet mw-portlet-tb" > <div class="vector-menu-heading"> General </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="t-whatlinkshere" class="mw-list-item"><a href="/wiki/Special:WhatLinksHere/Data_Platform/Systems/Airflow" title="A list of all wiki pages that link here [j]" accesskey="j"><span>What links here</span></a></li><li id="t-recentchangeslinked" class="mw-list-item"><a href="/wiki/Special:RecentChangesLinked/Data_Platform/Systems/Airflow" rel="nofollow" title="Recent changes in pages linked from this page [k]" accesskey="k"><span>Related changes</span></a></li><li id="t-specialpages" class="mw-list-item"><a href="/wiki/Special:SpecialPages" title="A list of all special pages [q]" accesskey="q"><span>Special pages</span></a></li><li id="t-permalink" class="mw-list-item"><a href="/w/index.php?title=Data_Platform/Systems/Airflow&oldid=2244312" title="Permanent link to this revision of this page"><span>Permanent link</span></a></li><li id="t-info" class="mw-list-item"><a href="/w/index.php?title=Data_Platform/Systems/Airflow&action=info" title="More information about this page"><span>Page information</span></a></li><li id="t-cite" class="mw-list-item"><a 
href="/w/index.php?title=Special:CiteThisPage&page=Data_Platform%2FSystems%2FAirflow&id=2244312&wpFormIdentifier=titleform" title="Information on how to cite this page"><span>Cite this page</span></a></li><li id="t-urlshortener" class="mw-list-item"><a href="/w/index.php?title=Special:UrlShortener&url=https%3A%2F%2Fwikitech.wikimedia.org%2Fwiki%2FData_Platform%2FSystems%2FAirflow"><span>Get shortened URL</span></a></li><li id="t-urlshortener-qrcode" class="mw-list-item"><a href="/w/index.php?title=Special:QrCode&url=https%3A%2F%2Fwikitech.wikimedia.org%2Fwiki%2FData_Platform%2FSystems%2FAirflow"><span>Download QR code</span></a></li> </ul> </div> </div> <div id="p-coll-print_export" class="vector-menu mw-portlet mw-portlet-coll-print_export" > <div class="vector-menu-heading"> Print/export </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="coll-create_a_book" class="mw-list-item"><a href="/w/index.php?title=Special:Book&bookcmd=book_creator&referer=Data+Platform%2FSystems%2FAirflow"><span>Create a book</span></a></li><li id="coll-download-as-rl" class="mw-list-item"><a href="/w/index.php?title=Special:DownloadAsPdf&page=Data_Platform%2FSystems%2FAirflow&action=show-download-screen"><span>Download as PDF</span></a></li><li id="t-print" class="mw-list-item"><a href="/w/index.php?title=Data_Platform/Systems/Airflow&printable=yes" title="Printable version of this page [p]" accesskey="p"><span>Printable version</span></a></li> </ul> </div> </div> </div> </div> </div> </div> </nav> </div> </div> </div> <div class="vector-column-end"> <div class="vector-sticky-pinned-container"> <nav class="vector-page-tools-landmark" aria-label="Page tools"> <div id="vector-page-tools-pinned-container" class="vector-pinned-container"> </div> </nav> <nav class="vector-appearance-landmark" aria-label="Appearance"> <div id="vector-appearance-pinned-container" class="vector-pinned-container"> <div id="vector-appearance" class="vector-appearance 
vector-pinnable-element"> <div class="vector-pinnable-header vector-appearance-pinnable-header vector-pinnable-header-pinned" data-feature-name="appearance-pinned" data-pinnable-element-id="vector-appearance" data-pinned-container-id="vector-appearance-pinned-container" data-unpinned-container-id="vector-appearance-unpinned-container" > <div class="vector-pinnable-header-label">Appearance</div> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-pin-button" data-event-name="pinnable-header.vector-appearance.pin">move to sidebar</button> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" data-event-name="pinnable-header.vector-appearance.unpin">hide</button> </div> </div> </div> </nav> </div> </div> <div id="bodyContent" class="vector-body" aria-labelledby="firstHeading" data-mw-ve-target-container> <div class="vector-body-before-content"> <div class="mw-indicators"> </div> <div id="siteSub" class="noprint">From Wikitech</div> </div> <div id="contentSub"><div id="mw-content-subtitle"><div class="subpages">< <bdi dir="ltr"><a href="/wiki/Data_Platform" title="Data Platform">Data Platform</a></bdi> | <bdi dir="ltr"><a href="/wiki/Data_Platform/Systems" title="Data Platform/Systems">Systems</a></bdi></div></div></div> <div id="mw-content-text" class="mw-body-content"><div class="mw-content-ltr mw-parser-output" lang="en" dir="ltr"><style data-mw-deduplicate="TemplateStyles:r2211903">.mw-parser-output .note{background-position:left 7px top 50%;padding:0.5em 0.5em 0.5em 40px;margin:0.5em 0;overflow:hidden;background-color:#f8f9fa;color:#333;background-repeat:no-repeat;border:1px solid #ddd}.mw-parser-output .note-inline{display:inline-block;vertical-align:middle}.mw-parser-output 
.note-info{background-color:#eaf3ff;color:#333;background-image:url("https://upload.wikimedia.org/wikipedia/commons/e/ec/OOjs_UI_icon_information-progressive.svg");background-size:25px;border-color:#a3caff;padding-left:40px;min-height:25px}.mw-parser-output .note-reminder{background-color:#fff9ea;color:#333;background-image:url("https://upload.wikimedia.org/wikipedia/commons/a/a8/OOjs_UI_icon_lightbulb-yellow.svg");background-size:25px;border-color:#fc3;min-height:25px}.mw-parser-output .note-warn{background-color:#fff9ea;color:#333;background-image:url("https://upload.wikimedia.org/wikipedia/commons/3/3b/OOjs_UI_icon_alert-warning.svg");background-size:25px;border-color:#fc3;min-height:25px}.mw-parser-output .note-error{background-color:#fee7e6;color:#333;background-image:url("https://upload.wikimedia.org/wikipedia/commons/b/bf/OOjs_UI_icon_notice-destructive.svg");background-size:25px;border-color:#c33;min-height:25px}@media screen{html.skin-theme-clientpref-night .mw-parser-output .note{background-color:transparent;color:inherit}}@media screen and (prefers-color-scheme:dark){html.skin-theme-clientpref-os .mw-parser-output .note{background-color:transparent;color:inherit}}</style><div role="note" class="note note-info note-inline">Please note that we are currently in the process of migrating Airflow to Kubernetes (<a href="https://phabricator.wikimedia.org/T362788" class="extiw" title="phabricator:T362788">task T362788</a>) - See <a href="/wiki/Data_Platform/Systems/Airflow/Kubernetes" title="Data Platform/Systems/Airflow/Kubernetes">Data Platform/Systems/Airflow/Kubernetes</a> for more details. 
Many of the instructions on this page refer to the pre-Kubernetes system and will need to be updated.</div><style data-mw-deduplicate="TemplateStyles:r2241375">.mw-parser-output .tpl-navsidebar{max-width:22em;background:var(--background-color-base,#fff);color:var(--color-base,#202122);border:1px solid var(--border-color-base,#a2a9b1);float:right;clear:right;margin:.5em 0 1em 1em}.mw-parser-output .tpl-navsidebar-floatright{float:right;clear:right;margin:.5em 0 1em 1em}.mw-parser-output .tpl-navsidebar-floatleft{float:left;clear:left;margin:.5em 1em 1em 0}.mw-parser-output .tpl-navsidebar-floatnone{float:none;clear:both;margin:.5em 0}.mw-parser-output .tpl-navsidebar-topimage{margin:0 0 16px 0}.mw-parser-output .tpl-navsidebar-title{margin:8px 16px;border-bottom:3px solid var(--border-color-muted,#eaecf0);font-size:20px;text-align:center}.mw-parser-output .tpl-navsidebar-image{margin:0 0 8px}.mw-parser-output .tpl-navsidebar-content{margin:0 0 16px 0;padding:0 8px}.mw-parser-output .tpl-navsidebar-heading{margin:8px 0;font-weight:bold}.mw-parser-output .tpl-navsidebar-foot{padding:0 8px;margin:0;text-align:right;font-size:smaller}@media not (min-width:720px){.mw-parser-output .tpl-navsidebar{float:none;clear:both;margin:.5em 0;max-width:none}}</style><div role="navigation" class="navigation-not-searchable tpl-navsidebar" style=""><p class="tpl-navsidebar-title"><a href="/wiki/Data_Platform" title="Data Platform">Data Platform</a></p><div class="tpl-navsidebar-contents"><div class="tpl-navsidebar-content"> <div class="mw-inputbox-centered" style=""><form name="searchbox" class="searchbox mw-inputbox-form-inline" action="/wiki/Special:Search"><div class="cdx-text-input"><input class="mw-searchInput searchboxInput cdx-text-input__input" name="search" placeholder="Search Data Platform documentation" size="40" dir="ltr"/></div><input type="hidden" value="incategory:Data_platform" name="searchfilter"/> <input type="submit" name="fulltext" value="Search" 
class="cdx-button"/><input type="hidden" value="Search" name="fulltext"/></form></div> </div><div class="tpl-navsidebar-content"> <p class="tpl-navsidebar-heading"><a href="/wiki/Data_Platform/Discover_data" title="Data Platform/Discover data">Discover data</a></p><p class="mw-empty-elt"> </p><ul><li><a class="external text" href="https://datahub.wikimedia.org/">Explore datasets in DataHub</a></li> <li><a href="/wiki/Data_Platform/Data_Lake" title="Data Platform/Data Lake">Data Lake</a> <ul><li><a href="/wiki/Data_Platform/Data_Lake/Traffic" title="Data Platform/Data Lake/Traffic">Traffic data</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Edits" title="Data Platform/Data Lake/Edits">Edits data</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Content" title="Data Platform/Data Lake/Content">Content data</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Events" title="Data Platform/Data Lake/Events">Events data</a></li></ul></li> <li><a href="/wiki/Data_Platform/AQS" title="Data Platform/AQS">Analytics Query Service (AQS)</a></li></ul> </div><div class="tpl-navsidebar-content"> <p class="tpl-navsidebar-heading"><a href="/wiki/Data_Platform/Analyze_data" title="Data Platform/Analyze data">Access, query, and analyze data</a></p><p class="mw-empty-elt"> </p><ul><li><a href="/wiki/Data_Platform/Data_access" title="Data Platform/Data access">Get access to internal data</a></li> <li>Analytics tools <ul><li><a href="/wiki/Data_Platform/Systems/Jupyter" title="Data Platform/Systems/Jupyter">Jupyter notebooks</a></li> <li><a href="/wiki/Data_Platform/Systems/Superset" title="Data Platform/Systems/Superset">Superset</a></li> <li><a href="/wiki/Data_Platform/Systems/Spark" title="Data Platform/Systems/Spark">Spark</a></li> <li><a href="/wiki/Data_Platform/Systems/Presto" title="Data Platform/Systems/Presto">Presto</a></li></ul></li> <li><a rel="nofollow" class="external text" 
href="https://github.com/wikimedia/wmfdata-python/blob/main/docs/quickstart.ipynb">Quickstart notebook</a></li> <li><a href="/wiki/Data_Platform/Internal_API_requests" title="Data Platform/Internal API requests">Internal API requests</a></li></ul> </div><div class="tpl-navsidebar-content"> <p class="tpl-navsidebar-heading"><a href="/wiki/Data_Platform/Transform_data" title="Data Platform/Transform data">Transform and publish data</a></p><p class="mw-empty-elt"> </p><ul><li><a href="https://www.mediawiki.org/wiki/Data_Platform_Engineering/Intake_Process" class="extiw" title="mw:Data Platform Engineering/Intake Process">Get help or file a request</a></li> <li><a href="/wiki/Data_Platform/Transform_data#Plan_data_lifecyle" title="Data Platform/Transform data">Plan data lifecyle</a></li> <li>Build tables and datasets <ul><li><a href="/wiki/Data_Platform/Dataset_creation" title="Data Platform/Dataset creation">Dataset creation process</a></li> <li><a href="/wiki/Data_Platform/Data_modeling_guidelines" title="Data Platform/Data modeling guidelines"> Data modeling guidelines</a></li> <li><a href="/wiki/Data_Platform/Systems/Airflow/Developer_guide" title="Data Platform/Systems/Airflow/Developer guide">Airflow developer guide</a></li> <li><a href="/wiki/Data_Platform/Systems/Hive" title="Data Platform/Systems/Hive">Hive</a></li> <li><a href="/wiki/Data_Platform/Systems/Iceberg" title="Data Platform/Systems/Iceberg">Iceberg</a></li> <li><a href="/wiki/Data_Platform/Systems/Druid" title="Data Platform/Systems/Druid">Druid</a></li></ul></li> <li>Share data and dashboards <ul><li><a href="https://foundation.wikimedia.org/wiki/Legal:Data_publication_guidelines" class="extiw" title="foundation:Legal:Data publication guidelines"> Data publication guidelines</a></li> <li><a href="/wiki/Data_Platform/Systems/Turnilo" title="Data Platform/Systems/Turnilo">Turnilo</a></li> <li><a href="/wiki/Data_Platform/Systems/Superset" title="Data Platform/Systems/Superset">Superset</a></li> 
<li><a href="/wiki/Data_Platform/Systems/analytics.wikimedia.org" title="Data Platform/Systems/analytics.wikimedia.org"> analytics.wikimedia.org</a></li> <li><a href="/wiki/Data_Platform/Web_publication" title="Data Platform/Web publication"> Web publication guide</a></li> <li><a href="/wiki/Data_Platform/Systems/Dashiki" title="Data Platform/Systems/Dashiki"> Dashiki</a></li></ul></li> <li>Manage published data <ul><li><a href="/wiki/Data_Incident_management" class="mw-redirect" title="Data Incident management"> Data Incident management</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Data_Issues" title="Data Platform/Data Lake/Data Issues"> Data Issue reporting</a></li> <li><a href="https://foundation.wikimedia.org/wiki/Legal:Data_retention_guidelines" class="extiw" title="foundation:Legal:Data retention guidelines">Data Retention Guidelines</a></li> <li><a href="/wiki/Data_Platform/Systems/Event_Data_retention" title="Data Platform/Systems/Event Data retention">Event data retention</a></li> <li><a href="/wiki/Data_Platform/Event_Sanitization" title="Data Platform/Event Sanitization">Event Sanitization</a></li> <li><a href="/wiki/Data_Platform/Dataset_archiving_and_deletion" title="Data Platform/Dataset archiving and deletion">Dataset archiving and deletion</a></li></ul></li></ul> </div><div class="tpl-navsidebar-content"> <p class="tpl-navsidebar-heading">Collect data</p><p class="mw-empty-elt"> </p><ul><li><a href="/wiki/Metrics_Platform" title="Metrics Platform">Metrics platform</a></li> <li><a href="/wiki/Event_Platform/Instrumentation_How_To" title="Event Platform/Instrumentation How To">Instrumentation tutorial</a></li> <li><a href="/wiki/Event_Platform" title="Event Platform">Event Platform</a></li></ul> <hr/> </div><div class="tpl-navsidebar-content"> <p class="tpl-navsidebar-heading">Data Platform infrastructure and operations</p><p class="mw-empty-elt"> </p><ul><li><a href="/wiki/Data_Platform/Systems" title="Data Platform/Systems">Systems 
overview</a></li> <li><a href="/wiki/Category:Data_pipelines" title="Category:Data pipelines"> Data pipelines</a></li> <li>Search <ul><li><a href="/wiki/Search/Technical_interactions" title="Search/Technical interactions"> Using search for new features </a></li> <li><a href="/wiki/Search_Platform/Documentation#Search" title="Search Platform/Documentation"> Search Platform </a></li> <li><a href="/wiki/Wikidata_Query_Service" title="Wikidata Query Service"> Wikidata Query Service (WDQS) </a></li></ul></li> <li>Operations and team processes <ul><li><a href="/wiki/Data_Platform_Engineering/Ops_week" title="Data Platform Engineering/Ops week">Ops week</a></li> <li><a href="/wiki/Data_Platform_Engineering" title="Data Platform Engineering">Team pages on Wikitech</a></li> <li><a href="https://www.mediawiki.org/wiki/Data_Platform_Engineering" class="extiw" title="mw:Data Platform Engineering">Team and project pages on MediaWiki.org</a></li></ul></li></ul> </div></div><p class="tpl-navsidebar-foot">[<span class="noprint plainlinks"><a class="external text" href="https://wikitech.wikimedia.org/w/index.php?title=Template:Navigation_Data_Platform&action=edit"><span title="Edit this template">edit</span></a></span>]</p></div> <p><a rel="nofollow" class="external text" href="https://airflow.apache.org/">Apache Airflow</a> is a workflow job scheduler. Developers declare job workflows using a custom DAG (directed acyclic graph) python API. </p><p>This page documents the Data Engineering managed Airflow instances in the Analytics Cluster. As of November 2024, we are running Airflow 2.10.3 (<a rel="nofollow" class="external text" href="https://airflow.apache.org/docs/apache-airflow/2.10.3/">docs</a>). </p><p>If you wish to develop DAGs with Airflow, you can find more information on the <a href="/wiki/Analytics/Systems/Airflow/Developer_guide" class="mw-redirect" title="Analytics/Systems/Airflow/Developer guide">Airflow Developer guide</a> page. 
</p> <meta property="mw:PageProp/toc"/> <div class="mw-heading mw-heading2 ext-discussiontools-init-section"><h2 id="Airflow_setup_and_conventions" data-mw-thread-id="h-Airflow_setup_and_conventions"><span data-mw-comment-start="" id="h-Airflow_setup_and_conventions"></span>Airflow setup and conventions<span data-mw-comment-end="h-Airflow_setup_and_conventions"></span></h2><!--__DTELLIPSISBUTTON__{"threadItem":{"headingLevel":2,"name":"h-","type":"heading","level":0,"id":"h-Airflow_setup_and_conventions","replies":["h-Authentication-Airflow_setup_and_conventions","h-Metadata_Database-Airflow_setup_and_conventions","h-Airflow_DAGs-Airflow_setup_and_conventions","h-Skein-Airflow_setup_and_conventions"]}}--></div> <p>The Data Engineering team maintains several Airflow instances. Usually, these instances are team specific. Teams have full control over their airflow instance. Data Platform Engineering manages the tooling needed to deploy and run these instances. </p><p>The current production instances are all host-based, meaning that they run on either bare-metal or individual virtual machines. The <a href="/wiki/Data_Platform/Systems/Airflow/Kubernetes" title="Data Platform/Systems/Airflow/Kubernetes">Airflow instances on Kubernetes</a> run as distributed applications on the dse-k8s cluster in eqiad. </p><p>The host-based instances all live within the Analytics Cluster VLAN, and have access to Hadoop and other Analytics Cluster related tools. It is expected that the Airflow instances themselves do not perform real computation tasks; instead they should submit jobs to the Hadoop cluster. Airflow is used for the pipelining and scheduling of these jobs. </p><p>The Kubernetes based instances will also support running jobs on the Hadoop cluster using exactly the same tooling, but will also be able to support different execution models owing to the use of the Kubernetes Executor for Airflow. 
</p> <div class="mw-heading mw-heading3"><h3 id="Authentication" data-mw-thread-id="h-Authentication-Airflow_setup_and_conventions"><span data-mw-comment-start="" id="h-Authentication-Airflow_setup_and_conventions"></span>Authentication<span data-mw-comment-end="h-Authentication-Airflow_setup_and_conventions"></span></h3></div> <div class="mw-heading mw-heading4"><h4 id="Host-based_instances" data-mw-thread-id="h-Host-based_instances-Authentication"><span data-mw-comment-start="" id="h-Host-based_instances-Authentication"></span>Host-based instances<span data-mw-comment-end="h-Host-based_instances-Authentication"></span></h4></div> <p>All of our host-based <a href="/wiki/Data_Engineering/Systems/Airflow/Instances" class="mw-redirect" title="Data Engineering/Systems/Airflow/Instances">Airflow instances</a> are currently accessed via SSH tunnels, so management of DAG runs and tasks requires <a href="/wiki/SRE/Production_access" title="SRE/Production access">production shell access</a> and membership of a specific group. This access is controlled by SRE, which means that our authentication and access control mechanism is external to Airflow itself. For this reason we allow full access rights to any user of the Airflow web interface. 
</p> <figure class="mw-default-size" typeof="mw:File/Thumb"><a href="/wiki/File:DAG_run_notes.png" class="mw-file-description"><img src="//upload.wikimedia.org/wikipedia/labs/thumb/5/54/DAG_run_notes.png/220px-DAG_run_notes.png" decoding="async" width="220" height="87" class="mw-file-element" srcset="//upload.wikimedia.org/wikipedia/labs/thumb/5/54/DAG_run_notes.png/330px-DAG_run_notes.png 1.5x, //upload.wikimedia.org/wikipedia/labs/thumb/5/54/DAG_run_notes.png/440px-DAG_run_notes.png 2x" data-file-width="1917" data-file-height="756"/></a><figcaption>Where to locate DAG run notes</figcaption></figure> <p>However, a recent <a rel="nofollow" class="external text" href="https://github.com/apache/airflow/issues/36206">bug</a> (see also: <a href="https://phabricator.wikimedia.org/T352534" class="extiw" title="phabricator:T352534">task T352534</a>) means that we can <b>no longer add notes to DAG runs unless we are logged in</b>. </p><p>For this reason, we have created a simple user account called <code>admin</code> on each instance and assigned the password <code>admin</code> to it. You can log into this account if you wish to add or edit notes associated with tasks. We expect to be reviewing this configuration in the near future, as we improve the Airflow service and user experience. </p> <div class="mw-heading mw-heading4"><h4 id="Kubernetes_instances" data-mw-thread-id="h-Kubernetes_instances-Authentication"><span data-mw-comment-start="" id="h-Kubernetes_instances-Authentication"></span>Kubernetes instances<span data-mw-comment-end="h-Kubernetes_instances-Authentication"></span></h4></div> <p>The instances running on Kubernetes use an authentication mechanism that is integrated with Airflow and backed by our <a href="/wiki/CAS-SSO" title="CAS-SSO">CAS-SSO</a> system. Users authenticate using their MediaWiki <a href="/wiki/Developer_account" title="Developer account">developer account</a> and an LDAP group mapping determines the level of access permitted. 
Membership of the <code>wmf</code> or <code>nda</code> groups is required for read-only access. Each instance then has a specific LDAP group that maps to the <a rel="nofollow" class="external text" href="https://airflow.apache.org/docs/apache-airflow/stable/security/security_model.html#operations-users">operations users</a> capability. Members of <code>ops</code> are granted <a rel="nofollow" class="external text" href="https://airflow.apache.org/docs/apache-airflow/stable/security/security_model.html#admin-users">admin rights</a> on the instances. </p> <div class="mw-heading mw-heading3"><h3 id="Metadata_Database" data-mw-thread-id="h-Metadata_Database-Airflow_setup_and_conventions"><span data-mw-comment-start="" id="h-Metadata_Database-Airflow_setup_and_conventions"></span>Metadata Database<span data-mw-comment-end="h-Metadata_Database-Airflow_setup_and_conventions"></span></h3></div> <div class="mw-heading mw-heading4"><h4 id="Host-based_instances_2" data-mw-thread-id="h-Host-based_instances_2-Metadata_Database"><span data-mw-comment-start="" id="h-Host-based_instances_2-Metadata_Database"></span>Host-based instances<span data-mw-comment-end="h-Host-based_instances_2-Metadata_Database"></span></h4></div> <p>The current Airflow backend is PostgreSQL. 
The architecture is as follows: </p> <ul><li>1 instance of PostgreSQL on an-db1001</li> <li>1 DB per Airflow instance</li> <li>1 user account per database</li></ul> <p>Some of the reasons are: </p> <ul><li>Prevents unwanted queries across Airflow instance databases, or unwanted access to data (isolation).</li> <li>Easier configuration with 1 database per instance (authorization, backups,...).</li></ul> <div class="mw-heading mw-heading4"><h4 id="Kubernetes_instances_2" data-mw-thread-id="h-Kubernetes_instances_2-Metadata_Database"><span data-mw-comment-start="" id="h-Kubernetes_instances_2-Metadata_Database"></span>Kubernetes instances<span data-mw-comment-end="h-Kubernetes_instances_2-Metadata_Database"></span></h4></div> <p>Each of these instances has its own PostgreSQL cluster that is deployed with the instance by the <a href="/wiki/Data_Platform/Systems/CloudnativePG" title="Data Platform/Systems/CloudnativePG">CloudnativePG</a> operator. There are two PostgreSQL database instances in each cluster, operating in a high-availability mode, with automatic failover. The storage layer for these databases is the <a href="/wiki/Data_Platform/Systems/Ceph" title="Data Platform/Systems/Ceph">Ceph</a> storage cluster operated by the Data Platform Engineering team. </p><p>These <a href="/wiki/Data_Platform/Systems/CloudnativePG/Clusters" title="Data Platform/Systems/CloudnativePG/Clusters">CloudnativePG clusters</a> also include a set of three pgbouncer pods, running as a connection pooler. 
</p> <div class="mw-heading mw-heading3"><h3 id="Airflow_DAGs" data-mw-thread-id="h-Airflow_DAGs-Airflow_setup_and_conventions"><span data-mw-comment-start="" id="h-Airflow_DAGs-Airflow_setup_and_conventions"></span>Airflow DAGs<span data-mw-comment-end="h-Airflow_DAGs-Airflow_setup_and_conventions"></span></h3></div> <p>To develop best practices around Airflow, we use a single shared git repository for Airflow DAGs for all instances: <a class="external text" href="https://gitlab.wikimedia.org/repos/data-engineering/airflow-dags">data-engineering/airflow-dags</a>. Airflow instance (and team) specific DAGs live in subdirectories of this repository, e.g. in <tt>&lt;instance_name&gt;/dags</tt>. </p> <div class="mw-heading mw-heading4"><h4 id="Host-based_DAG_deployment" data-mw-thread-id="h-Host-based_DAG_deployment-Airflow_DAGs"><span data-mw-comment-start="" id="h-Host-based_DAG_deployment-Airflow_DAGs"></span>Host-based DAG deployment<span data-mw-comment-end="h-Host-based_DAG_deployment-Airflow_DAGs"></span></h4></div> <p>Each Airflow instance has its own scap deployment of data-engineering/airflow-dags. See <a href="/wiki/Scap#Other_software_deployments" title="Scap">Scap#Other_software_deployments</a> for instructions on how to use scap to deploy. </p><p>Your airflow instance's airflow-dags scap deployment directory is located at <tt>/srv/deployment/airflow-dags/&lt;instance_name&gt;</tt> on the deployment server as well as on your airflow host. 
To deploy: </p> <div class="mw-highlight mw-highlight-lang-bash mw-content-ltr" dir="ltr"><pre><span></span>ssh<span class="w"> </span>deployment.eqiad.wmnet
<span class="nb">cd</span><span class="w"> </span>/srv/deployment/airflow-dags/&lt;instance_name&gt;
git<span class="w"> </span>pull<span class="w"> </span><span class="c1"># or checkout, do whatever you need to make this git clone ready for deployment</span>
scap<span class="w"> </span>deploy
</pre></div> <div class="mw-heading mw-heading4"><h4 id="Kubernetes_based_DAG_deployment" data-mw-thread-id="h-Kubernetes_based_DAG_deployment-Airflow_DAGs"><span data-mw-comment-start="" id="h-Kubernetes_based_DAG_deployment-Airflow_DAGs"></span>Kubernetes based DAG deployment<span data-mw-comment-end="h-Kubernetes_based_DAG_deployment-Airflow_DAGs"></span></h4></div> <p>We are intending to use a continuous-deployment model for this, although the precise mechanism has yet to be decided. See <a href="https://phabricator.wikimedia.org/T368033" class="extiw" title="phabricator:T368033">task T368033</a> for the current work. </p> <div class="mw-heading mw-heading3"><h3 id="Skein" data-mw-thread-id="h-Skein-Airflow_setup_and_conventions"><span data-mw-comment-start="" id="h-Skein-Airflow_setup_and_conventions"></span>Skein<span data-mw-comment-end="h-Skein-Airflow_setup_and_conventions"></span></h3></div> <p>We run <a rel="nofollow" class="external text" href="https://jcristharif.com/introducing-skein.html">Skein</a> as a way to schedule Python Spark jobs on YARN from Airflow scheduled jobs. Skein is deployed on all airflow hosts. </p><p>In order to have Airflow authenticate against the Skein server, an x509 certificate is automatically generated in the airflow home directory, under <code>.skein/skein.crt</code> that has a validity period of 1 year. 
If that certificate expires though, Airflow won't be able to schedule Spark jobs on YARN and we will face an outage (see <a class="external free" href="https://phabricator.wikimedia.org/T344617">https://phabricator.wikimedia.org/T344617</a>). </p><p>We monitor the expiry date of these certificates <b>(</b><a class="external text" href="https://grafana.wikimedia.org/d/980N6H7Iz/skein-certificate-expiry?orgId=1">https://grafana-rw.wikimedia.org/d/980N6H7Iz/skein-certificate-expiry?orgId=1</a>), and we have a weekly systemd job in charge of renewing the certificate, to make sure that we never face such an outage. </p> <div class="mw-heading mw-heading4"><h4 id="I'm_getting_paged_for_a_Skein_certificate_about_to_expire" data-mw-thread-id="h-I'm_getting_paged_for_a_Skein_certificate_about_to_expire-Skein"><span id="I.27m_getting_paged_for_a_Skein_certificate_about_to_expire"></span><span data-mw-comment-start="" id="h-I'm_getting_paged_for_a_Skein_certificate_about_to_expire-Skein"></span>I'm getting paged for a Skein certificate about to expire<span data-mw-comment-end="h-I'm_getting_paged_for_a_Skein_certificate_about_to_expire-Skein"></span></h4></div><p> ssh onto the host associated with the alert, and go to the Airflow home directory. Hint: the alert will include a <code>cert</code> label whose value is the absolute path of the associated certificate. 
Assuming that label has a value of <code>/srv/airflow-platform_eng/.skein/skein.crt</code>, run</p><div class="mw-highlight mw-highlight-lang-bash mw-content-ltr" dir="ltr"><pre><span></span>sudo<span class="w"> </span>su<span class="w"> </span>-<span class="w"> </span>analytics-platform-eng <span class="nb">export</span><span class="w"> </span><span class="nv">HOME</span><span class="o">=</span>/srv/airflow-platform_eng/ <span class="nb">source</span><span class="w"> </span>/usr/lib/airflow/bin/activate skein<span class="w"> </span>config<span class="w"> </span>gencerts<span class="w"> </span>--force </pre></div><p>You can then check the new expiry date of the certificate</p><div class="mw-highlight mw-highlight-lang-bash mw-content-ltr" dir="ltr"><pre><span></span>openssl<span class="w"> </span>x509<span class="w"> </span>-in<span class="w"> </span>~/.skein/skein.crt<span class="w"> </span>-dates<span class="w"> </span><span class="p">|</span>head<span class="w"> </span>-n<span class="w"> </span><span class="m">2</span> <span class="nv">notBefore</span><span class="o">=</span>Aug<span class="w"> </span><span class="m">21</span><span class="w"> </span><span class="m">15</span>:39:26<span class="w"> </span><span class="m">2023</span><span class="w"> </span>GMT <span class="nv">notAfter</span><span class="o">=</span>Aug<span class="w"> </span><span class="m">20</span><span class="w"> </span><span class="m">15</span>:39:26<span class="w"> </span><span class="m">2024</span><span class="w"> </span>GMT </pre></div><p>(See original phab task for details: <a class="external free" href="https://phabricator.wikimedia.org/T344617#9106681">https://phabricator.wikimedia.org/T344617#9106681</a>) </p><div class="mw-heading mw-heading2 ext-discussiontools-init-section"><h2 id="See_also" data-mw-thread-id="h-See_also"><span data-mw-comment-start="" id="h-See_also"></span>See also<span 
data-mw-comment-end="h-See_also"></span></h2><!--__DTELLIPSISBUTTON__{"threadItem":{"headingLevel":2,"name":"h-","type":"heading","level":0,"id":"h-See_also","replies":[]}}--></div> <ul><li><a rel="nofollow" class="external text" href="https://docs.google.com/document/d/1hp6JYVy3SLRgTx1BYfnNOCPk5VFJeZ4jMpxD8WJKVB0/edit">Shared Airflow - Design Document</a></li> <li><a href="https://phabricator.wikimedia.org/T272973" class="extiw" title="phab:T272973">phab:T272973</a></li> <li><a href="/wiki/Analytics/Systems/Cluster/Workflow_management_tools_study" class="mw-redirect" title="Analytics/Systems/Cluster/Workflow management tools study">Analytics/Systems/Cluster/Workflow_management_tools_study</a></li> <li><a href="https://phabricator.wikimedia.org/tag/airflow/" class="extiw" title="phab:tag/airflow/">Phabricator project</a></li></ul> <div class="mw-heading mw-heading2 ext-discussiontools-init-section"><h2 id="Airflow_Instances" data-mw-thread-id="h-Airflow_Instances"><span data-mw-comment-start="" id="h-Airflow_Instances"></span>Airflow Instances<span data-mw-comment-end="h-Airflow_Instances"></span></h2><!--__DTELLIPSISBUTTON__{"threadItem":{"headingLevel":2,"name":"h-","type":"heading","level":0,"id":"h-Airflow_Instances","replies":[]}}--></div> <p>Kept up to date at: <a href="/wiki/Data_Engineering/Systems/Airflow/Instances#List_of_instances" class="mw-redirect" title="Data Engineering/Systems/Airflow/Instances">Data_Engineering/Systems/Airflow/Instances#List_of_instances</a> </p> <div class="mw-heading mw-heading2 ext-discussiontools-init-section"><h2 id="Airflow_on_Kubernetes" data-mw-thread-id="h-Airflow_on_Kubernetes"><span data-mw-comment-start="" id="h-Airflow_on_Kubernetes"></span>Airflow on Kubernetes<span data-mw-comment-end="h-Airflow_on_Kubernetes"></span></h2><!--__DTELLIPSISBUTTON__{"threadItem":{"headingLevel":2,"name":"h-","type":"heading","level":0,"id":"h-Airflow_on_Kubernetes","replies":[]}}--></div> <p>Kept up to date at <a 
href="/wiki/Data_Platform/Systems/Airflow/Kubernetes" title="Data Platform/Systems/Airflow/Kubernetes">Data Platform/Systems/Airflow/Kubernetes</a> </p> <div class="mw-heading mw-heading2 ext-discussiontools-init-section"><h2 id="Airflow_Upgrades" data-mw-thread-id="h-Airflow_Upgrades"><span data-mw-comment-start="" id="h-Airflow_Upgrades"></span>Airflow Upgrades<span data-mw-comment-end="h-Airflow_Upgrades"></span></h2><!--__DTELLIPSISBUTTON__{"threadItem":{"headingLevel":2,"name":"h-","type":"heading","level":0,"id":"h-Airflow_Upgrades","replies":[]}}--></div> <p>The Airflow upgrade procedure is documented at: <a href="/wiki/Data_Engineering/Systems/Airflow/Upgrading" class="mw-redirect" title="Data Engineering/Systems/Airflow/Upgrading">Data_engineering/Systems/Airflow/Upgrading</a> </p> <div class="mw-heading mw-heading2 ext-discussiontools-init-section"><h2 id="Administration" data-mw-thread-id="h-Administration"><span data-mw-comment-start="" id="h-Administration"></span>Administration<span data-mw-comment-end="h-Administration"></span></h2><!--__DTELLIPSISBUTTON__{"threadItem":{"headingLevel":2,"name":"h-","type":"heading","level":0,"id":"h-Administration","replies":["h-Overview_of_Data_Engineering's_Airflow_deployments-Administration","h-Creating_a_new_Airflow_Instance-Administration"]}}--></div> <div class="mw-heading mw-heading3"><h3 id="Overview_of_Data_Engineering's_Airflow_deployments" data-mw-thread-id="h-Overview_of_Data_Engineering's_Airflow_deployments-Administration"><span id="Overview_of_Data_Engineering.27s_Airflow_deployments"></span><span data-mw-comment-start="" id="h-Overview_of_Data_Engineering's_Airflow_deployments-Administration"></span>Overview of Data Engineering's Airflow deployments<span data-mw-comment-end="h-Overview_of_Data_Engineering's_Airflow_deployments-Administration"></span></h3></div> <p>Data Engineering maintains a debian package for Airflow at <a class="external text" 
href="https://gerrit.wikimedia.org/r/plugins/gitiles/operations/debs/airflow/">operations/debs/airflow/</a>. This debian packaging installs a premade <a rel="nofollow" class="external text" href="https://docs.conda.io/en/latest/">conda</a> environment with all dependencies needed to run Airflow. The debian package installs this conda environment to <tt>/usr/lib/airflow</tt>. </p><p>The <code>airflow::instance</code> Puppet define is used to set up and run Airflow instances. This define can be used multiple times on the same host to declare multiple airflow instances. The instance specific configs are installed in <tt>/srv/airflow-&lt;instance_name&gt;</tt>, and templated systemd units are set up for services <tt>airflow-scheduler@&lt;instance_name&gt;</tt> and <tt>airflow-webserver@&lt;instance_name&gt;</tt>. </p><p>The <code>profile::airflow</code> Puppet class uses the <code>profile::airflow::instances</code> <a rel="nofollow" class="external text" href="https://www.puppet.com/docs/puppet/7/hiera_intro.html">hiera</a> variable to declare <code>airflow::instance</code>s. This allows each <code>airflow::instance</code> to be fully specified via hiera. <code>profile::airflow</code> by default will use Data Engineering conventions as defaults for an <code>airflow::instance</code>. </p><p>These defaults include setting up instance specific <code>scap::target</code>s of the <a class="external text" href="https://gitlab.wikimedia.org/repos/data-engineering/airflow-dags">data-engineering/airflow-dags</a> repository. (There is still some manual setup needed for this, see the instructions below on how to configure this for new instances.) The Airflow instance's <code>dags_folder</code> will be automatically set to one of the instance specific subdirectories in the airflow-dags repository. (You can override this in hiera if you need.) 
</p> <div class="mw-heading mw-heading3"><h3 id="Creating_a_new_Airflow_Instance" data-mw-thread-id="h-Creating_a_new_Airflow_Instance-Administration"><span data-mw-comment-start="" id="h-Creating_a_new_Airflow_Instance-Administration"></span>Creating a new Airflow Instance<span data-mw-comment-end="h-Creating_a_new_Airflow_Instance-Administration"></span></h3></div> <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r2211903"><div role="note" class="note note-warn">The information below is outdated, since any new instances will be deployed to Kubernetes.</div> <p>In this example, we'll be creating a new Airflow instance named 'test'. </p> <div class="mw-heading mw-heading4"><h4 id="Prepare_airflow-dags_for_deployment_to_the_new_instance" data-mw-thread-id="h-Prepare_airflow-dags_for_deployment_to_the_new_instance-Creating_a_new_Airflow_Instance"><span data-mw-comment-start="" id="h-Prepare_airflow-dags_for_deployment_to_the_new_instance-Creating_a_new_Airflow_Instance"></span>Prepare airflow-dags for deployment to the new instance<span data-mw-comment-end="h-Prepare_airflow-dags_for_deployment_to_the_new_instance-Creating_a_new_Airflow_Instance"></span></h4></div> <div class="mw-heading mw-heading5"><h5 id="Create_the_instance_specific_dags_folder" data-mw-thread-id="h-Create_the_instance_specific_dags_folder-Prepare_airflow-dags_for_deployment_to_the_new_instance"><span data-mw-comment-start="" id="h-Create_the_instance_specific_dags_folder-Prepare_airflow-dags_for_deployment_to_the_new_instance"></span>Create the instance specific dags folder<span data-mw-comment-end="h-Create_the_instance_specific_dags_folder-Prepare_airflow-dags_for_deployment_to_the_new_instance"></span></h5></div> <p>By convention, all Airflow team instances use the same DAGs repository: <a class="external text" href="https://gitlab.wikimedia.org/repos/data-engineering/airflow-dags">data-engineering/airflow-dags</a>. 
Instance specific DAGs are located in the <tt>&lt;instance_name&gt;/dags</tt> directory. Unless you override defaults in puppet/hiera, this will be used as airflow's <code>dags_folder</code>. </p><p>Create this directory and commit the changes before proceeding. In our example, this directory would be <tt>test/dags</tt>, since 'test' is our instance name. </p> <div class="mw-heading mw-heading5"><h5 id="Create_the_instance_specific_scap_repository" data-mw-thread-id="h-Create_the_instance_specific_scap_repository-Prepare_airflow-dags_for_deployment_to_the_new_instance"><span data-mw-comment-start="" id="h-Create_the_instance_specific_scap_repository-Prepare_airflow-dags_for_deployment_to_the_new_instance"></span>Create the instance specific scap repository<span data-mw-comment-end="h-Create_the_instance_specific_scap_repository-Prepare_airflow-dags_for_deployment_to_the_new_instance"></span></h5></div> <p><a class="external text" href="https://doc.wikimedia.org/mw-tools-scap/scap3/repo_config.html">Scap requires configuration</a> that is declared for each of its deployments. Because we use the same source DAGs repository for all airflow instances, we can't just add the scap.cfg file to the main airflow-dags repository. Instead, we use separately managed 'scap repositories' in which the deployment configuration is declared. </p><p>Create a new repository in gitlab with the name <tt>data-engineering/airflow-dags-scap-&lt;instance_name&gt;</tt>. For our example, we'll be creating <tt>data-engineering/airflow-dags-scap-test</tt>. 
</p><p>You'll need to create two files in this repository: </p><p>Create <tt>scap/scap.cfg</tt> with the following content: </p> <div class="mw-highlight mw-highlight-lang-text mw-content-ltr" dir="ltr"><pre><span></span>[global] git_repo: data-engineering/airflow-dags ssh_user: test_user # (this user must exist on the airflow host, and it must be in the deploy_airflow.trusted_groups (see below)) dsh_targets: targets </pre></div> <p>And create a <tt>scap/targets</tt> file with the list of hostnames that will be deployed to. Likely this will be only your airflow host. </p> <div class="mw-highlight mw-highlight-lang-text mw-content-ltr" dir="ltr"><pre><span></span>hostname1001.eqiad.wmnet </pre></div> <div class="mw-heading mw-heading5"><h5 id="Create_a_scap_deployment_source" data-mw-thread-id="h-Create_a_scap_deployment_source-Prepare_airflow-dags_for_deployment_to_the_new_instance"><span data-mw-comment-start="" id="h-Create_a_scap_deployment_source-Prepare_airflow-dags_for_deployment_to_the_new_instance"></span>Create a scap deployment source<span data-mw-comment-end="h-Create_a_scap_deployment_source-Prepare_airflow-dags_for_deployment_to_the_new_instance"></span></h5></div> <p><a href="/wiki/Scap" title="Scap">Scap</a> is used to deploy the <a class="external text" href="https://gitlab.wikimedia.org/repos/data-engineering/airflow-dags">data-engineering/airflow-dags</a> repository to airflow instances. Declaration of <code>scap::target</code> will be taken care of for you by <code>profile::airflow</code>, but you will need to declare the <code>scap::source</code> for the deployment server. 
</p><p>Edit <a rel="nofollow" class="external text" href="https://github.com/wikimedia/puppet/blob/production/hieradata/role/common/deployment_server/kubernetes.yaml"><tt>hieradata/role/common/deployment_server/kubernetes.yaml</tt></a> and add a new entry to <code>scap::sources</code>: </p> <div class="mw-highlight mw-highlight-lang-yaml mw-content-ltr" dir="ltr"><pre><span></span><span class="nt">scap::sources</span><span class="p">:</span> <span class="w"> </span><span class="nt">airflow-dags/test</span><span class="p">:</span> <span class="w"> </span><span class="nt">repository</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">data-engineering/airflow-dags</span> <span class="w"> </span><span class="c1"># This is the name of the scap repository we created in the previous step.</span> <span class="w"> </span><span class="nt">scap_repository</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">data-engineering/airflow-dags-scap-test</span> <span class="w"> </span><span class="nt">origin</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">gitlab</span> </pre></div> <p>You'll also need to make sure that real users will be able to deploy. They must be in a posix group that has access to the deployment server, as well as in a group listed in this hiera config: </p> <div class="mw-highlight mw-highlight-lang-yaml mw-content-ltr" dir="ltr"><pre><span></span><span class="w"> </span><span class="c1"># Shared deploy ssh key for Data Engineering maintained</span> <span class="w"> </span><span class="c1"># Airflow instances. 
For now, all admins of Airflow instances</span> <span class="w"> </span><span class="c1"># can deploy any Airflow instance.</span> <span class="w"> </span><span class="nt">deploy_airflow</span><span class="p">:</span> <span class="w"> </span><span class="nt">trusted_groups</span><span class="p">:</span> <span class="w"> </span><span class="p p-Indicator">-</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">analytics-deployers</span> <span class="w"> </span><span class="c1"># ...</span> </pre></div> <p>Merge any changes and run puppet on the deployment server. </p> <div class="mw-heading mw-heading4"><h4 id="Create_the_Airflow_PostgreSQL_Database" data-mw-thread-id="h-Create_the_Airflow_PostgreSQL_Database-Creating_a_new_Airflow_Instance"><span data-mw-comment-start="" id="h-Create_the_Airflow_PostgreSQL_Database-Creating_a_new_Airflow_Instance"></span>Create the Airflow PostgreSQL Database<span data-mw-comment-end="h-Create_the_Airflow_PostgreSQL_Database-Creating_a_new_Airflow_Instance"></span></h4></div> <p>Add a reference to the instance name in <code>puppet/hieradata/role/common/analytics_cluster/postgresql.yaml</code> under the <code>profile::analytics::postgresql::databases</code> key. </p><p>Add the corresponding password in the private repo in the file: <code>/srv/private/hieradata/role/common/analytics_cluster/postgresql.yaml</code> </p> <div class="mw-heading mw-heading4"><h4 id="Configure_the_Airflow_instance_in_Puppet" data-mw-thread-id="h-Configure_the_Airflow_instance_in_Puppet-Creating_a_new_Airflow_Instance"><span data-mw-comment-start="" id="h-Configure_the_Airflow_instance_in_Puppet-Creating_a_new_Airflow_Instance"></span>Configure the Airflow instance in Puppet<span data-mw-comment-end="h-Configure_the_Airflow_instance_in_Puppet-Creating_a_new_Airflow_Instance"></span></h4></div> <p>Add the <code>profile::airflow</code> class to your node's role in Puppet and configure the Airflow instance(s) in your role's hiera. 
</p><p>Let's assume we're adding this instance in a role class <code>role::airflow::test</code>. </p> <div class="mw-highlight mw-highlight-lang-puppet mw-content-ltr" dir="ltr"><pre><span></span><span class="k">class</span><span class="w"> </span><span class="k">role</span><span class="p">::</span><span class="na">airflow</span><span class="p">::</span><span class="na">test</span><span class="w"> </span><span class="p">{</span> <span class="w"> </span><span class="k">include</span><span class="w"> </span><span class="p">::</span><span class="na">profile</span><span class="p">::</span><span class="na">airflow</span> <span class="w"> </span><span class="c"># profile::kerberos::keytabs is needed if your Airflow</span> <span class="w"> </span><span class="c"># instance needs to authenticate with Kerberos.</span> <span class="w"> </span><span class="c"># You'll need to create and configure the keytab for the Airflow instance's</span> <span class="w"> </span><span class="c"># $service_user we'll set below.</span> <span class="w"> </span><span class="k">include</span><span class="w"> </span><span class="p">::</span><span class="na">profile</span><span class="p">::</span><span class="na">kerberos</span><span class="p">::</span><span class="na">keytabs</span> <span class="p">}</span> </pre></div> <p><br/> Then, in <code>hieradata/role/common/airflow/test.yaml</code>: </p> <div class="mw-highlight mw-highlight-lang-yaml mw-content-ltr" dir="ltr"><pre><span></span><span class="c1"># Set up airflow instances.</span> <span class="nt">profile::airflow::instances</span><span class="p">:</span> <span class="w"> </span><span class="c1"># airflow@test instance.</span> <span class="w"> </span><span class="nt">test</span><span class="p">:</span> <span class="w"> </span><span class="c1"># Since we set security: kerberos a keytab must be deployed for the service_user.</span> <span class="w"> </span><span class="nt">service_user</span><span class="p">:</span><span class="w"> 
</span><span class="l l-Scalar l-Scalar-Plain">test_user</span> <span class="w"> </span><span class="nt">service_group</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">test_group</span> <span class="w"> </span><span class="c1"># Set this to true if you want enable alerting for your airflow instance.</span> <span class="w"> </span><span class="nt">monitoring_enabled</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">false</span> <span class="w"> </span><span class="c1"># Configuration for /srv/airflow-test/airflow.cfg</span> <span class="w"> </span><span class="c1"># Any airflow::instance configs can go here. See:</span> <span class="w"> </span><span class="c1"># https://airflow.apache.org/docs/apache-airflow/stable/configurations-ref.html</span> <span class="w"> </span><span class="c1"># NOTE: unless your airflow instance does special things, the defaults</span> <span class="w"> </span><span class="c1"># set in profile::airflow should be sufficient for setting up a</span> <span class="w"> </span><span class="c1"># WMF Data Engineering managed airflow::instance.</span> <span class="w"> </span><span class="c1">#airflow_config:</span> <span class="w"> </span><span class="c1"># core:</span> <span class="c1"># Make sure the keytab for test_user is deployed via profile::kerberos::keytabs</span> <span class="nt">profile::kerberos::keytabs::keytabs_metadata</span><span class="p">:</span> <span class="w"> </span><span class="p p-Indicator">-</span><span class="w"> </span><span class="nt">role</span><span class="p">:</span><span class="w"> </span><span class="s">'test_user'</span> <span class="w"> </span><span class="nt">owner</span><span class="p">:</span><span class="w"> </span><span class="s">'test_user'</span> <span class="w"> </span><span class="nt">group</span><span class="p">:</span><span class="w"> </span><span class="s">'test_group'</span> <span class="w"> </span><span 
class="nt">filename</span><span class="p">:</span><span class="w"> </span><span class="s">'test_user.keytab'</span> </pre></div> <p>See <a href="/wiki/Analytics/Systems/Kerberos#Create_a_keytab_for_a_service" class="mw-redirect" title="Analytics/Systems/Kerberos">Create_a_keytab_for_a_service</a> for instructions on creating keytabs. </p><p>Note that we didn't set <code>db_user</code> or <code>db_password</code>. These are secrets and should be set in the <a href="/wiki/Puppet#Private_puppet" title="Puppet">operations puppet private repository</a> in the hiera variable <code>profile::airflow::instances_secrets</code>. So, in puppet private in the <code>hieradata/role/common/airflow/test.yaml</code> file: </p> <div class="mw-highlight mw-highlight-lang-yaml mw-content-ltr" dir="ltr"><pre><span></span><span class="c1"># Set up airflow instances.</span> <span class="nt">profile::airflow::instances_secrets</span><span class="p">:</span> <span class="w"> </span><span class="c1"># airflow@test instance.</span> <span class="w"> </span><span class="nt">test</span><span class="p">:</span> <span class="w"> </span><span class="nt">db_user</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">airflow_test</span> <span class="w"> </span><span class="nt">db_password</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">password_here</span> </pre></div> <p><code>profile::airflow::instances_secrets</code> will be merged with <code>profile::airflow::instances</code> by the <code>profile::airflow</code> class, and the parameters to <code>airflow::instance</code> will be available for use in the <code>sql_alchemy_conn</code> as an ERb template. 
</p><p>Once this is merged and applied, the node with the <code>role::airflow::test</code> will run the systemd services <code>airflow-scheduler@test</code>, <code>airflow-webserver@test</code>, <code>airflow-kerberos@test</code>, as well as some 'control' systemd services <code>airflow@test</code> and <code>airflow</code> that can be used to manage the Airflow test instance. </p><p>Create the airflow tables by running </p> <pre> sudo -u test_user airflow-test db upgrade </pre> <p>The airflow services were probably already started by the earlier puppet run. Restart them now that the airflow tables are created properly. </p> <pre> sudo systemctl restart airflow@test.service </pre> <div class="mw-heading mw-heading4"><h4 id="Add_service_user_to_the_Yarn_production_queue" data-mw-thread-id="h-Add_service_user_to_the_Yarn_production_queue-Creating_a_new_Airflow_Instance"><span data-mw-comment-start="" id="h-Add_service_user_to_the_Yarn_production_queue-Creating_a_new_Airflow_Instance"></span>Add service user to the Yarn production queue<span data-mw-comment-end="h-Add_service_user_to_the_Yarn_production_queue-Creating_a_new_Airflow_Instance"></span></h4></div><p> Since all Yarn applications (Spark job, Skein apps, etc.) are submitted by the service user running the Airflow instance, we need to grant this user permissions in one of Yarn's queues defined in <a rel="nofollow" class="external text" href="https://github.com/wikimedia/puppet/blob/production/modules/profile/manifests/analytics/cluster/hadoop/yarn_capacity_scheduler.pp">yarn_capacity_scheduler.pp</a>. All Airflow instance users should be allowed to run on the "<a rel="nofollow" class="external text" href="https://github.com/wikimedia/puppet/blob/production/modules/profile/manifests/analytics/cluster/hadoop/yarn_capacity_scheduler.pp#L125">production</a>" queue. 
Example adding "test_user" below:</p><div class="mw-highlight mw-highlight-lang-puppet mw-content-ltr" dir="ltr"><pre><span></span><span class="err">...</span> <span class="w"> </span><span class="c"># this allows test_user to submit applications to the 'production' queue</span> <span class="w"> </span><span class="s">'yarn.scheduler.capacity.root.production.acl_submit_applications'</span><span class="w"> </span><span class="o">=></span><span class="w"> </span><span class="s">'test_user,existingUser1,existingUser2'</span><span class="p">,</span> <span class="err">...</span> <span class="w"> </span><span class="c"># this redirects applications submitted by test_user to the 'production' queue if no queue was specified.</span> <span class="w"> </span><span class="s">'yarn.scheduler.capacity.queue-mappings'</span><span class="w"> </span><span class="o">=></span><span class="w"> </span><span class="s">'u:test_user:production,u:existingUser1:production,u:existingUser2:production'</span><span class="p">,</span> <span class="err">...</span> </pre></div> <div class="mw-heading mw-heading2 ext-discussiontools-init-section"><h2 id="Incident_reports_&_known_issues" data-mw-thread-id="h-Incident_reports_&_known_issues"><span id="Incident_reports_.26_known_issues"></span><span data-mw-comment-start="" id="h-Incident_reports_&_known_issues"></span>Incident reports & known issues<span data-mw-comment-end="h-Incident_reports_&_known_issues"></span></h2><!--__DTELLIPSISBUTTON__{"threadItem":{"headingLevel":2,"name":"h-","type":"heading","level":0,"id":"h-Incident_reports_&_known_issues","replies":[]}}--></div> <p>Add incident reports and known issues in the following table. Please add a short description of the issue and a link to a more detailed one: either a wiki page or a Phabricator task. 
Thanks!&nbsp;:] </p> <table class="wikitable"> <tbody><tr> <th>Date </th> <th>Incident / Issue description </th> <th>link </th></tr> <tr> <td>2022-07-26 </td> <td>This is an example incident description. </td> <td>example.link </td></tr></tbody></table> <!-- NewPP limit report Parsed by mw‐web.eqiad.main‐bf56d8766‐dwx89 Cached time: 20241114115056 Cache expiry: 2592000 Reduced expiry: false Complications: [show‐toc] DiscussionTools time usage: 0.022 seconds CPU time usage: 0.092 seconds Real time usage: 0.110 seconds Preprocessor visited node count: 492/1000000 Post‐expand include size: 13940/2097152 bytes Template argument size: 7289/2097152 bytes Highest expansion depth: 11/100 Expensive parser function count: 11/500 Unstrip recursion depth: 0/20 Unstrip post‐expand size: 15294/5000000 bytes --> <!-- Transclusion expansion time report (%,ms,calls,template) 100.00% 43.578 1 -total 48.00% 20.918 1 Template:Navigation_Data_Platform 44.89% 19.561 2 Template:Note 38.75% 16.886 1 Template:Navigation_sidebar 9.93% 4.329 3 Template:Phabricator/en --> <!-- Saved in parser cache with key labswiki:pcache:447901:|#|:idhash:canonical and timestamp 20241114115056 and revision id 2244312. 
Rendering was triggered because: diff-page --> </div><!--esi <esi:include src="/esitest-fa8a495983347898/content" /> --><noscript><img src="https://login.wikimedia.org/wiki/Special:CentralAutoLogin/start?type=1x1" alt="" width="1" height="1" style="border: none; position: absolute;"></noscript> <div class="printfooter" data-nosnippet="">Retrieved from "<a dir="ltr" href="https://wikitech.wikimedia.org/w/index.php?title=Data_Platform/Systems/Airflow&oldid=2244312">https://wikitech.wikimedia.org/w/index.php?title=Data_Platform/Systems/Airflow&oldid=2244312</a>"</div></div> <div id="catlinks" class="catlinks" data-mw="interface"><div id="mw-normal-catlinks" class="mw-normal-catlinks"><a href="/wiki/Special:Categories" title="Special:Categories">Categories</a>: <ul><li><a href="/wiki/Category:Data_platform" title="Category:Data platform">Data platform</a></li><li><a href="/wiki/Category:Data_platform_systems" title="Category:Data platform systems">Data platform systems</a></li></ul></div></div> </div> </main> </div> <div class="mw-footer-container"> <footer id="footer" class="mw-footer" > <ul id="footer-info"> <li id="footer-info-lastmod"> This page was last edited on 14 November 2024, at 10:53.</li> <li id="footer-info-copyright">Text is available under the <a rel="nofollow" class="external text" href="https://creativecommons.org/licenses/by-sa/4.0/deed.en">Creative Commons Attribution-ShareAlike License</a>; additional terms may apply. 
See <a class="external text" href="https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Terms_of_Use">Terms of Use</a> for details.</li> </ul> <ul id="footer-places"> <li id="footer-places-privacy"><a href="https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Privacy_policy">Privacy policy</a></li> <li id="footer-places-about"><a href="/wiki/Main_Page">About Wikitech</a></li> <li id="footer-places-disclaimers"><a href="https://foundation.wikimedia.org/wiki/General_disclaimer">Disclaimers</a></li> <li id="footer-places-wm-codeofconduct"><a href="https://www.mediawiki.org/wiki/Special:MyLanguage/Code_of_Conduct">Code of Conduct</a></li> <li id="footer-places-developers"><a href="https://developer.wikimedia.org">Developers</a></li> <li id="footer-places-statslink"><a href="https://stats.wikimedia.org/#/wikitech.wikimedia.org">Statistics</a></li> <li id="footer-places-cookiestatement"><a href="https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Cookie_statement">Cookie statement</a></li> <li id="footer-places-mobileview"><a href="//wikitech.wikimedia.org/w/index.php?title=Data_Platform/Systems/Airflow&mobileaction=toggle_view_mobile" class="noprint stopMobileRedirectToggle">Mobile view</a></li> </ul> <ul id="footer-icons" class="noprint"> <li id="footer-copyrightico"><a href="https://wikimediafoundation.org/" class="cdx-button cdx-button--fake-button cdx-button--size-large cdx-button--fake-button--enabled"><img src="/static/images/footer/wikimedia-button.svg" width="84" height="29" alt="Wikimedia Foundation" loading="lazy"></a></li> <li id="footer-poweredbyico"><a href="https://www.mediawiki.org/" class="cdx-button cdx-button--fake-button cdx-button--size-large cdx-button--fake-button--enabled"><img src="/w/resources/assets/poweredby_mediawiki.svg" alt="Powered by MediaWiki" width="88" height="31" loading="lazy"></a></li> </ul> </footer> </div> </div> </div> <div class="vector-settings" id="p-dock-bottom"> <ul></ul> 
</div><script>(RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgHostname":"mw-web.codfw.canary-74567ccdbd-t7h28","wgBackendResponseTime":150,"wgDiscussionToolsPageThreads":[{"headingLevel":2,"name":"h-","type":"heading","level":0,"id":"h-Airflow_setup_and_conventions","replies":[{"headingLevel":3,"name":"h-","type":"heading","level":0,"id":"h-Authentication-Airflow_setup_and_conventions","replies":[{"headingLevel":4,"name":"h-","type":"heading","level":0,"id":"h-Host-based_instances-Authentication","replies":[]},{"headingLevel":4,"name":"h-","type":"heading","level":0,"id":"h-Kubernetes_instances-Authentication","replies":[]}]},{"headingLevel":3,"name":"h-","type":"heading","level":0,"id":"h-Metadata_Database-Airflow_setup_and_conventions","replies":[{"headingLevel":4,"name":"h-","type":"heading","level":0,"id":"h-Host-based_instances_2-Metadata_Database","replies":[]},{"headingLevel":4,"name":"h-","type":"heading","level":0,"id":"h-Kubernetes_instances_2-Metadata_Database","replies":[]}]},{"headingLevel":3,"name":"h-","type":"heading","level":0,"id":"h-Airflow_DAGs-Airflow_setup_and_conventions","replies":[{"headingLevel":4,"name":"h-","type":"heading","level":0,"id":"h-Host-based_DAG_deployment-Airflow_DAGs","replies":[]},{"headingLevel":4,"name":"h-","type":"heading","level":0,"id":"h-Kubernetes_based_DAG_deployment-Airflow_DAGs","replies":[]}]},{"headingLevel":3,"name":"h-","type":"heading","level":0,"id":"h-Skein-Airflow_setup_and_conventions","replies":[{"headingLevel":4,"name":"h-","type":"heading","level":0,"id":"h-I'm_getting_paged_for_a_Skein_certificate_about_to_expire-Skein","replies":[]}]}]},{"headingLevel":2,"name":"h-","type":"heading","level":0,"id":"h-See_also","replies":[]},{"headingLevel":2,"name":"h-","type":"heading","level":0,"id":"h-Airflow_Instances","replies":[]},{"headingLevel":2,"name":"h-","type":"heading","level":0,"id":"h-Airflow_on_Kubernetes","replies":[]},{"headingLevel":2,"name":"h-","type":"heading","level":0,"id":"h-Airflow_U
pgrades","replies":[]},{"headingLevel":2,"name":"h-","type":"heading","level":0,"id":"h-Administration","replies":[{"headingLevel":3,"name":"h-","type":"heading","level":0,"id":"h-Overview_of_Data_Engineering's_Airflow_deployments-Administration","replies":[]},{"headingLevel":3,"name":"h-","type":"heading","level":0,"id":"h-Creating_a_new_Airflow_Instance-Administration","replies":[{"headingLevel":4,"name":"h-","type":"heading","level":0,"id":"h-Prepare_airflow-dags_for_deployment_to_the_new_instance-Creating_a_new_Airflow_Instance","replies":[{"headingLevel":5,"name":"h-","type":"heading","level":0,"id":"h-Create_the_instance_specific_dags_folder-Prepare_airflow-dags_for_deployment_to_the_new_instance","replies":[]},{"headingLevel":5,"name":"h-","type":"heading","level":0,"id":"h-Create_the_instance_specific_scap_repository-Prepare_airflow-dags_for_deployment_to_the_new_instance","replies":[]},{"headingLevel":5,"name":"h-","type":"heading","level":0,"id":"h-Create_a_scap_deployment_source-Prepare_airflow-dags_for_deployment_to_the_new_instance","replies":[]}]},{"headingLevel":4,"name":"h-","type":"heading","level":0,"id":"h-Create_the_Airflow_PostgreSQL_Database-Creating_a_new_Airflow_Instance","replies":[]},{"headingLevel":4,"name":"h-","type":"heading","level":0,"id":"h-Configure_the_Airflow_instance_in_Puppet-Creating_a_new_Airflow_Instance","replies":[]},{"headingLevel":4,"name":"h-","type":"heading","level":0,"id":"h-Add_service_user_to_the_Yarn_production_queue-Creating_a_new_Airflow_Instance","replies":[]}]}]},{"headingLevel":2,"name":"h-","type":"heading","level":0,"id":"h-Incident_reports_\u0026_known_issues","replies":[]}],"wgPageParseReport":{"discussiontools":{"limitreport-timeusage":"0.022"},"limitreport":{"cputime":"0.092","walltime":"0.110","ppvisitednodes":{"value":492,"limit":1000000},"postexpandincludesize":{"value":13940,"limit":2097152},"templateargumentsize":{"value":7289,"limit":2097152},"expansiondepth":{"value":11,"limit":100},"expensivefunc
tioncount":{"value":11,"limit":500},"unstrip-depth":{"value":0,"limit":20},"unstrip-size":{"value":15294,"limit":5000000},"timingprofile":["100.00% 43.578 1 -total"," 48.00% 20.918 1 Template:Navigation_Data_Platform"," 44.89% 19.561 2 Template:Note"," 38.75% 16.886 1 Template:Navigation_sidebar"," 9.93% 4.329 3 Template:Phabricator/en"]},"cachereport":{"origin":"mw-web.eqiad.main-bf56d8766-dwx89","timestamp":"20241114115056","ttl":2592000,"transientcontent":false}}});});</script> </body> </html>