CINXE.COM

Data Platform/Transform data - Wikitech

<!DOCTYPE html> <html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-disabled skin-theme-clientpref-day vector-toc-not-available" lang="en" dir="ltr"> <head> <meta charset="UTF-8"> <title>Data Platform/Transform data - Wikitech</title> <script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-disabled skin-theme-clientpref-day vector-toc-not-available";var cookie=document.cookie.match(/(?:^|; )labswikimwclientpreferences=([^;]+)/);if(cookie){cookie[1].split('%2C').forEach(function(pref){className=className.replace(new RegExp('(^| )'+pref.replace(/-clientpref-\w+$|[^\w-]+/g,'')+'-clientpref-\\w+( |$)'),'$1'+pref+'$2');});}document.documentElement.className=className;}());RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat" 
:"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"0bcdf43f-7f8f-4a73-80b3-48022a482958","wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Data_Platform/Transform_data","wgTitle":"Data Platform/Transform data","wgCurRevisionId":2239812,"wgRevisionId":2239812,"wgArticleId":454548,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Pages with FIXME on them","Landing page","Data platform"],"wgPageViewLanguage":"en","wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgRelevantPageName":"Data_Platform/Transform_data","wgRelevantArticleId":454548,"wgIsProbablyEditable":false,"wgRelevantPageIsProbablyEditable":false,"wgRestrictionEdit":[],"wgRestrictionMove":[],"wgNoticeProject":"wikitech","wgCiteReferencePreviewsActive":true,"wgMediaViewerOnClick":true, "wgMediaViewerEnabledByDefault":true,"wgVisualEditor":{"pageLanguageCode":"en","pageLanguageDir":"ltr","pageVariantFallbacks":"en"},"wgMFDisplayWikibaseDescriptions":{"search":true,"watchlist":true,"tagline":false,"nearby":true},"wgWMESchemaEditAttemptStepOversample":false,"wgWMEPageLength":10000,"wgCentralAuthMobileDomain":false,"wgEditSubmitButtonLabelPublish":true,"wgDiscussionToolsFeaturesEnabled":{"replytool":true,"newtopictool":true,"sourcemodetoolbar":true,"topicsubscription":false,"autotopicsub":false,"visualenhancements":false,"visualenhancements_reply":false,"visualenhancements_pageframe":false},"wgDiscussionToolsFallbackEditMode":"visual","wgULSPosition":"personal","wgULSisCompactLinksEnabled":false,"wgVector2022LanguageInHeader":true,"wgULSisLanguageSelectorEmpty":false,"wgCheckUserClientHintsHeadersJsApi":["brands","architecture","bitness","fullVersionList","mobile","model","platform","platformVersion"],"wgSiteNoticeId":"2.0"};RLSTATE={"ext.globalCssJs.user.styles":"ready" 
,"site.styles":"ready","user.styles":"ready","ext.globalCssJs.user":"ready","user":"ready","user.options":"loading","ext.inputBox.styles":"ready","ext.discussionTools.init.styles":"ready","oojs-ui-core.styles":"ready","oojs-ui.styles.indicators":"ready","mediawiki.widgets.styles":"ready","oojs-ui-core.icons":"ready","skins.vector.search.codex.styles":"ready","skins.vector.styles":"ready","skins.vector.icons":"ready","jquery.makeCollapsible.styles":"ready","ext.wikimediamessages.styles":"ready","ext.visualEditor.desktopArticleTarget.noscript":"ready","ext.uls.pt":"ready","ext.dismissableSiteNotice.styles":"ready"};RLPAGEMODULES=["site","mediawiki.page.ready","jquery.makeCollapsible","skins.vector.js","ext.centralNotice.geoIP","ext.centralNotice.startUp","ext.gadget.site","ext.urlShortener.toolbar","ext.centralauth.centralautologin","mmv.bootstrap","ext.visualEditor.desktopArticleTarget.init","ext.visualEditor.targetLoader","ext.echo.centralauth","ext.discussionTools.init", "ext.eventLogging","ext.wikimediaEvents","ext.uls.interface","ext.checkUser.clientHints","ext.dismissableSiteNotice"];</script> <script>(RLQ=window.RLQ||[]).push(function(){mw.loader.impl(function(){return["user.options@12s5i",function($,jQuery,require,module){mw.user.tokens.set({"patrolToken":"+\\","watchToken":"+\\","csrfToken":"+\\"}); }];});});</script> <link rel="stylesheet" href="/w/load.php?lang=en&amp;modules=ext.discussionTools.init.styles%7Cext.dismissableSiteNotice.styles%7Cext.inputBox.styles%7Cext.uls.pt%7Cext.visualEditor.desktopArticleTarget.noscript%7Cext.wikimediamessages.styles%7Cjquery.makeCollapsible.styles%7Cmediawiki.widgets.styles%7Coojs-ui-core.icons%2Cstyles%7Coojs-ui.styles.indicators%7Cskins.vector.icons%2Cstyles%7Cskins.vector.search.codex.styles&amp;only=styles&amp;skin=vector-2022"> <script async="" src="/w/load.php?lang=en&amp;modules=startup&amp;only=scripts&amp;raw=1&amp;skin=vector-2022"></script> <meta name="ResourceLoaderDynamicStyles" content=""> <link 
rel="stylesheet" href="/w/load.php?lang=en&amp;modules=site.styles&amp;only=styles&amp;skin=vector-2022"> <meta name="generator" content="MediaWiki 1.44.0-wmf.5"> <meta name="referrer" content="origin"> <meta name="referrer" content="origin-when-cross-origin"> <meta name="robots" content="max-image-preview:standard"> <meta name="format-detection" content="telephone=no"> <meta name="viewport" content="width=1120"> <meta property="og:title" content="Data Platform/Transform data - Wikitech"> <meta property="og:type" content="website"> <link rel="preconnect" href="//upload.wikimedia.org"> <link rel="icon" href="/static/favicon/wikitech.ico"> <link rel="search" type="application/opensearchdescription+xml" href="/w/rest.php/v1/search" title="Wikitech (en)"> <link rel="EditURI" type="application/rsd+xml" href="//wikitech.wikimedia.org/w/api.php?action=rsd"> <link rel="canonical" href="https://wikitech.wikimedia.org/wiki/Data_Platform/Transform_data"> <link rel="license" href="https://creativecommons.org/licenses/by-sa/4.0/"> <link rel="alternate" type="application/atom+xml" title="Wikitech Atom feed" href="/w/index.php?title=Special:RecentChanges&amp;feed=atom"> <link rel="dns-prefetch" href="//meta.wikimedia.org" /> <link rel="dns-prefetch" href="//login.wikimedia.org"> </head> <body class="ext-discussiontools-replytool-enabled ext-discussiontools-newtopictool-enabled ext-discussiontools-sourcemodetoolbar-enabled skin--responsive skin-vector skin-vector-search-vue mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-subject page-Data_Platform_Transform_data rootpage-Data_Platform skin-vector-2022 action-view"><a class="mw-jump-link" href="#bodyContent">Jump to content</a> <div class="vector-header-container"> <header class="vector-header mw-header"> <div class="vector-header-start"> <nav class="vector-main-menu-landmark" aria-label="Site"> <div id="vector-main-menu-dropdown" class="vector-dropdown vector-main-menu-dropdown vector-button-flush-left 
vector-button-flush-right" > <input type="checkbox" id="vector-main-menu-dropdown-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-main-menu-dropdown" class="vector-dropdown-checkbox " aria-label="Main menu" > <label id="vector-main-menu-dropdown-label" for="vector-main-menu-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only " aria-hidden="true" ><span class="vector-icon mw-ui-icon-menu mw-ui-icon-wikimedia-menu"></span> <span class="vector-dropdown-label-text">Main menu</span> </label> <div class="vector-dropdown-content"> <div id="vector-main-menu-unpinned-container" class="vector-unpinned-container"> <div id="vector-main-menu" class="vector-main-menu vector-pinnable-element"> <div class="vector-pinnable-header vector-main-menu-pinnable-header vector-pinnable-header-unpinned" data-feature-name="main-menu-pinned" data-pinnable-element-id="vector-main-menu" data-pinned-container-id="vector-main-menu-pinned-container" data-unpinned-container-id="vector-main-menu-unpinned-container" > <div class="vector-pinnable-header-label">Main menu</div> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-pin-button" data-event-name="pinnable-header.vector-main-menu.pin">move to sidebar</button> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" data-event-name="pinnable-header.vector-main-menu.unpin">hide</button> </div> <div id="p-navigation" class="vector-menu mw-portlet mw-portlet-navigation" > <div class="vector-menu-heading"> Navigation </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="n-mainpage-description" class="mw-list-item"><a href="/wiki/Main_Page" title="Visit the main page [z]" accesskey="z"><span>Main page</span></a></li><li id="n-recentchanges" class="mw-list-item"><a href="/wiki/Special:RecentChanges" title="A list of recent 
changes in the wiki [r]" accesskey="r"><span>Recent changes</span></a></li><li id="n-Server-admin-log:-Prod" class="mw-list-item"><a href="/wiki/Server_Admin_Log"><span>Server admin log: Prod</span></a></li><li id="n-Admin-log:-RelEng" class="mw-list-item"><a href="/wiki/Release_Engineering/SAL"><span>Admin log: RelEng</span></a></li><li id="n-Incident-status" class="mw-list-item"><a href="/wiki/Incident_status"><span>Incident status</span></a></li><li id="n-Deployments" class="mw-list-item"><a href="/wiki/Deployments"><span>Deployments</span></a></li><li id="n-SRE-Team-Help" class="mw-list-item"><a href="/wiki/SRE/SRE_Team_requests"><span>SRE Team Help</span></a></li> </ul> </div> </div> <div id="p-Cloud_VPS_&amp;_Toolforge" class="vector-menu mw-portlet mw-portlet-Cloud_VPS_Toolforge" > <div class="vector-menu-heading"> Cloud VPS &amp; Toolforge </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="n-Cloud-VPS-portal" class="mw-list-item"><a href="/wiki/Portal:Cloud_VPS"><span>Cloud VPS portal</span></a></li><li id="n-Toolforge-portal" class="mw-list-item"><a href="/wiki/Portal:Toolforge"><span>Toolforge portal</span></a></li><li id="n-Request-VPS-project" class="mw-list-item"><a href="https://phabricator.wikimedia.org/project/view/2875/"><span>Request VPS project</span></a></li><li id="n-Admin-log:-Cloud-VPS" class="mw-list-item"><a href="/wiki/Cloud_VPS_Server_Admin_Log"><span>Admin log: Cloud VPS</span></a></li> </ul> </div> </div> </div> </div> </div> </div> </nav> <a href="/wiki/Main_Page" class="mw-logo"> <img class="mw-logo-icon" src="/static/images/icons/wikitech.svg" alt="" aria-hidden="true" height="50" width="50"> <span class="mw-logo-container skin-invert"> <img class="mw-logo-wordmark" alt="Wikitech" src="/static/images/mobile/copyright/wikitech-wordmark.svg" style="width: 8.75em; height: 1.6875em;"> </span> </a> </div> <div class="vector-header-end"> <div id="p-search" role="search" class="vector-search-box-vue 
vector-search-box-collapses vector-search-box-show-thumbnail vector-search-box-auto-expand-width vector-search-box"> <a href="/wiki/Special:Search" class="cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only search-toggle" title="Search Wikitech [f]" accesskey="f"><span class="vector-icon mw-ui-icon-search mw-ui-icon-wikimedia-search"></span> <span>Search</span> </a> <div class="vector-typeahead-search-container"> <div class="cdx-typeahead-search cdx-typeahead-search--show-thumbnail cdx-typeahead-search--auto-expand-width"> <form action="/w/index.php" id="searchform" class="cdx-search-input cdx-search-input--has-end-button"> <div id="simpleSearch" class="cdx-search-input__input-wrapper" data-search-loc="header-moved"> <div class="cdx-text-input cdx-text-input--has-start-icon"> <input class="cdx-text-input__input" type="search" name="search" placeholder="Search Wikitech" aria-label="Search Wikitech" autocapitalize="sentences" title="Search Wikitech [f]" accesskey="f" id="searchInput" > <span class="cdx-text-input__icon cdx-text-input__start-icon"></span> </div> <input type="hidden" name="title" value="Special:Search"> </div> <button class="cdx-button cdx-search-input__end-button">Search</button> </form> </div> </div> </div> <nav class="vector-user-links vector-user-links-wide" aria-label="Personal tools"> <div class="vector-user-links-main"> <div id="p-vector-user-menu-preferences" class="vector-menu mw-portlet" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="ca-uls" class="mw-list-item active user-links-collapsible-item"><a data-mw="interface" href="#" class="uls-trigger cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet"><span class="vector-icon mw-ui-icon-wikimedia-language mw-ui-icon-wikimedia-wikimedia-language"></span> <span>English</span></a> </li> </ul> </div> </div> <div id="p-vector-user-menu-userpage" class="vector-menu 
mw-portlet emptyPortlet" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> </ul> </div> </div> <nav class="vector-appearance-landmark" aria-label="Appearance"> <div id="vector-appearance-dropdown" class="vector-dropdown " title="Change the appearance of the page&#039;s font size, width, and color" > <input type="checkbox" id="vector-appearance-dropdown-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-appearance-dropdown" class="vector-dropdown-checkbox " aria-label="Appearance" > <label id="vector-appearance-dropdown-label" for="vector-appearance-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only " aria-hidden="true" ><span class="vector-icon mw-ui-icon-appearance mw-ui-icon-wikimedia-appearance"></span> <span class="vector-dropdown-label-text">Appearance</span> </label> <div class="vector-dropdown-content"> <div id="vector-appearance-unpinned-container" class="vector-unpinned-container"> </div> </div> </div> </nav> <div id="p-vector-user-menu-notifications" class="vector-menu mw-portlet emptyPortlet" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> </ul> </div> </div> <div id="p-vector-user-menu-overflow" class="vector-menu mw-portlet" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="pt-sitesupport-2" class="user-links-collapsible-item mw-list-item user-links-collapsible-item"><a data-mw="interface" href="https://donate.wikimedia.org/?wmf_source=donate&amp;wmf_medium=sidebar&amp;wmf_campaign=wikitech.wikimedia.org&amp;uselang=en" class=""><span>Donate</span></a> </li> <li id="pt-login-2" class="user-links-collapsible-item mw-list-item user-links-collapsible-item"><a data-mw="interface" href="/w/index.php?title=Special:UserLogin&amp;returnto=Data+Platform%2FTransform+data" title="You are encouraged to log in; however, it is not mandatory [o]" 
accesskey="o" class=""><span>Log in</span></a> </li> </ul> </div> </div> </div> <div id="vector-user-links-dropdown" class="vector-dropdown vector-user-menu vector-button-flush-right vector-user-menu-logged-out user-links-collapsible-item" title="More options" > <input type="checkbox" id="vector-user-links-dropdown-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-user-links-dropdown" class="vector-dropdown-checkbox " aria-label="Personal tools" > <label id="vector-user-links-dropdown-label" for="vector-user-links-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only " aria-hidden="true" ><span class="vector-icon mw-ui-icon-ellipsis mw-ui-icon-wikimedia-ellipsis"></span> <span class="vector-dropdown-label-text">Personal tools</span> </label> <div class="vector-dropdown-content"> <div id="p-personal" class="vector-menu mw-portlet mw-portlet-personal user-links-collapsible-item" title="User menu" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="pt-sitesupport" class="user-links-collapsible-item mw-list-item"><a href="https://donate.wikimedia.org/?wmf_source=donate&amp;wmf_medium=sidebar&amp;wmf_campaign=wikitech.wikimedia.org&amp;uselang=en"><span>Donate</span></a></li><li id="pt-login" class="user-links-collapsible-item mw-list-item"><a href="/w/index.php?title=Special:UserLogin&amp;returnto=Data+Platform%2FTransform+data" title="You are encouraged to log in; however, it is not mandatory [o]" accesskey="o"><span class="vector-icon mw-ui-icon-logIn mw-ui-icon-wikimedia-logIn"></span> <span>Log in</span></a></li> </ul> </div> </div> </div> </div> </nav> </div> </header> </div> <div class="mw-page-container"> <div class="mw-page-container-inner"> <div class="vector-sitenotice-container"> <div id="siteNotice"><div id="mw-dismissablenotice-anonplace"></div><script>(function(){var 
node=document.getElementById("mw-dismissablenotice-anonplace");if(node){node.outerHTML="\u003Cdiv class=\"mw-dismissable-notice\"\u003E\u003Cdiv class=\"mw-dismissable-notice-close\"\u003E[\u003Ca tabindex=\"0\" role=\"button\"\u003Edismiss\u003C/a\u003E]\u003C/div\u003E\u003Cdiv class=\"mw-dismissable-notice-body\"\u003E\u003C!-- CentralNotice --\u003E\u003Cdiv id=\"localNotice\" data-nosnippet=\"\"\u003E\u003Cdiv class=\"sitenotice\" lang=\"en\" dir=\"ltr\"\u003E\u003Ctable style=\"width: 75%; background-color: var(--background-color-warning-subtle, #fdf2d5); border: var(--border-subtle, 1px solid #987027); color: var(--color-base, #202122); border-radius: 10px; padding: 5px; margin: 0 auto;\"\u003E\n\u003Ctbody\u003E\u003Ctr\u003E\n\u003Ctd style=\"width:40px; height:40px; text-align:center; vertical-align:middle; padding: 2px;\"\u003E\u003Cspan typeof=\"mw:File\"\u003E\u003Ca href=\"/wiki/File:OOjs_UI_icon_alert-warning.svg\" class=\"mw-file-description\"\u003E\u003Cimg src=\"//upload.wikimedia.org/wikipedia/commons/thumb/3/3b/OOjs_UI_icon_alert-warning.svg/30px-OOjs_UI_icon_alert-warning.svg.png\" decoding=\"async\" width=\"30\" height=\"30\" class=\"mw-file-element\" srcset=\"//upload.wikimedia.org/wikipedia/commons/thumb/3/3b/OOjs_UI_icon_alert-warning.svg/45px-OOjs_UI_icon_alert-warning.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/3/3b/OOjs_UI_icon_alert-warning.svg/60px-OOjs_UI_icon_alert-warning.svg.png 2x\" data-file-width=\"20\" data-file-height=\"20\" /\u003E\u003C/a\u003E\u003C/span\u003E\n\u003C/td\u003E\n\u003Ctd style=\"text-align:center; vertical-align: middle; padding: 4px; max-height: 60px;\"\u003E\u003Cb\u003EWe are migrating Wikitech to \u003Ca href=\"/wiki/Wikitech/SUL-migration\" title=\"Wikitech/SUL-migration\"\u003ESUL\u003C/a\u003E!\u003C/b\u003E\n\u003Cp\u003E\u003Cb\u003EAction may be required for your \u003Ca href=\"/wiki/Wikitech/SUL-migration#What_You_Should_Do\" title=\"Wikitech/SUL-migration\"\u003E 
account\u003C/a\u003E!\u003C/b\u003E\n\u003C/p\u003E\u003Cp\u003E\u003Cb\u003ETrouble logging in? Please visit \u003Ca href=\"https://phabricator.wikimedia.org/T376267\" class=\"extiw\" title=\"phab:T376267\"\u003ET376267\u003C/a\u003E\u003C/b\u003E\n\u003C/p\u003E\n\u003C/td\u003E\u003C/tr\u003E\u003C/tbody\u003E\u003C/table\u003E\u003C/div\u003E\u003C/div\u003E\u003C/div\u003E\u003C/div\u003E";}}());</script></div> </div> <div class="vector-column-start"> <div class="vector-main-menu-container"> <div id="mw-navigation"> <nav id="mw-panel" class="vector-main-menu-landmark" aria-label="Site"> <div id="vector-main-menu-pinned-container" class="vector-pinned-container"> </div> </nav> </div> </div> </div> <div class="mw-content-container"> <main id="content" class="mw-body"> <header class="mw-body-header vector-page-titlebar"> <h1 id="firstHeading" class="firstHeading mw-first-heading"><span class="mw-page-title-main">Data Platform/Transform data</span></h1> </header> <div class="vector-page-toolbar"> <div class="vector-page-toolbar-container"> <div id="left-navigation"> <nav aria-label="Namespaces"> <div id="p-associated-pages" class="vector-menu vector-menu-tabs mw-portlet mw-portlet-associated-pages" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="ca-nstab-main" class="selected vector-tab-noicon mw-list-item"><a href="/wiki/Data_Platform/Transform_data" title="View the content page [c]" accesskey="c"><span>Page</span></a></li><li id="ca-talk" class="new vector-tab-noicon mw-list-item"><a href="/w/index.php?title=Talk:Data_Platform/Transform_data&amp;action=edit&amp;redlink=1" rel="discussion" class="new" title="Discussion about the content page (page does not exist) [t]" accesskey="t"><span>Discussion</span></a></li> </ul> </div> </div> <div id="vector-variants-dropdown" class="vector-dropdown emptyPortlet" > <input type="checkbox" id="vector-variants-dropdown-checkbox" role="button" aria-haspopup="true" 
data-event-name="ui.dropdown-vector-variants-dropdown" class="vector-dropdown-checkbox " aria-label="Change language variant" > <label id="vector-variants-dropdown-label" for="vector-variants-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet" aria-hidden="true" ><span class="vector-dropdown-label-text">English</span> </label> <div class="vector-dropdown-content"> <div id="p-variants" class="vector-menu mw-portlet mw-portlet-variants emptyPortlet" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> </ul> </div> </div> </div> </div> </nav> </div> <div id="right-navigation" class="vector-collapsible"> <nav aria-label="Views"> <div id="p-views" class="vector-menu vector-menu-tabs mw-portlet mw-portlet-views" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="ca-view" class="selected vector-tab-noicon mw-list-item"><a href="/wiki/Data_Platform/Transform_data"><span>Read</span></a></li><li id="ca-viewsource" class="vector-tab-noicon mw-list-item"><a href="/w/index.php?title=Data_Platform/Transform_data&amp;action=edit" title="This page is protected.&#10;You can view its source [e]" accesskey="e"><span>View source</span></a></li><li id="ca-history" class="vector-tab-noicon mw-list-item"><a href="/w/index.php?title=Data_Platform/Transform_data&amp;action=history" title="Past revisions of this page [h]" accesskey="h"><span>View history</span></a></li> </ul> </div> </div> </nav> <nav class="vector-page-tools-landmark" aria-label="Page tools"> <div id="vector-page-tools-dropdown" class="vector-dropdown vector-page-tools-dropdown" > <input type="checkbox" id="vector-page-tools-dropdown-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-page-tools-dropdown" class="vector-dropdown-checkbox " aria-label="Tools" > <label id="vector-page-tools-dropdown-label" for="vector-page-tools-dropdown-checkbox" 
class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet" aria-hidden="true" ><span class="vector-dropdown-label-text">Tools</span> </label> <div class="vector-dropdown-content"> <div id="vector-page-tools-unpinned-container" class="vector-unpinned-container"> <div id="vector-page-tools" class="vector-page-tools vector-pinnable-element"> <div class="vector-pinnable-header vector-page-tools-pinnable-header vector-pinnable-header-unpinned" data-feature-name="page-tools-pinned" data-pinnable-element-id="vector-page-tools" data-pinned-container-id="vector-page-tools-pinned-container" data-unpinned-container-id="vector-page-tools-unpinned-container" > <div class="vector-pinnable-header-label">Tools</div> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-pin-button" data-event-name="pinnable-header.vector-page-tools.pin">move to sidebar</button> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" data-event-name="pinnable-header.vector-page-tools.unpin">hide</button> </div> <div id="p-cactions" class="vector-menu mw-portlet mw-portlet-cactions emptyPortlet vector-has-collapsible-items" title="More options" > <div class="vector-menu-heading"> Actions </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="ca-more-view" class="selected vector-more-collapsible-item mw-list-item"><a href="/wiki/Data_Platform/Transform_data"><span>Read</span></a></li><li id="ca-more-viewsource" class="vector-more-collapsible-item mw-list-item"><a href="/w/index.php?title=Data_Platform/Transform_data&amp;action=edit"><span>View source</span></a></li><li id="ca-more-history" class="vector-more-collapsible-item mw-list-item"><a href="/w/index.php?title=Data_Platform/Transform_data&amp;action=history"><span>View history</span></a></li> </ul> </div> </div> <div id="p-tb" class="vector-menu mw-portlet mw-portlet-tb" > <div 
class="vector-menu-heading"> General </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="t-whatlinkshere" class="mw-list-item"><a href="/wiki/Special:WhatLinksHere/Data_Platform/Transform_data" title="A list of all wiki pages that link here [j]" accesskey="j"><span>What links here</span></a></li><li id="t-recentchangeslinked" class="mw-list-item"><a href="/wiki/Special:RecentChangesLinked/Data_Platform/Transform_data" rel="nofollow" title="Recent changes in pages linked from this page [k]" accesskey="k"><span>Related changes</span></a></li><li id="t-specialpages" class="mw-list-item"><a href="/wiki/Special:SpecialPages" title="A list of all special pages [q]" accesskey="q"><span>Special pages</span></a></li><li id="t-permalink" class="mw-list-item"><a href="/w/index.php?title=Data_Platform/Transform_data&amp;oldid=2239812" title="Permanent link to this revision of this page"><span>Permanent link</span></a></li><li id="t-info" class="mw-list-item"><a href="/w/index.php?title=Data_Platform/Transform_data&amp;action=info" title="More information about this page"><span>Page information</span></a></li><li id="t-cite" class="mw-list-item"><a href="/w/index.php?title=Special:CiteThisPage&amp;page=Data_Platform%2FTransform_data&amp;id=2239812&amp;wpFormIdentifier=titleform" title="Information on how to cite this page"><span>Cite this page</span></a></li><li id="t-urlshortener" class="mw-list-item"><a href="/w/index.php?title=Special:UrlShortener&amp;url=https%3A%2F%2Fwikitech.wikimedia.org%2Fwiki%2FData_Platform%2FTransform_data"><span>Get shortened URL</span></a></li><li id="t-urlshortener-qrcode" class="mw-list-item"><a href="/w/index.php?title=Special:QrCode&amp;url=https%3A%2F%2Fwikitech.wikimedia.org%2Fwiki%2FData_Platform%2FTransform_data"><span>Download QR code</span></a></li> </ul> </div> </div> <div id="p-coll-print_export" class="vector-menu mw-portlet mw-portlet-coll-print_export" > <div class="vector-menu-heading"> 
Print/export </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="coll-create_a_book" class="mw-list-item"><a href="/w/index.php?title=Special:Book&amp;bookcmd=book_creator&amp;referer=Data+Platform%2FTransform+data"><span>Create a book</span></a></li><li id="coll-download-as-rl" class="mw-list-item"><a href="/w/index.php?title=Special:DownloadAsPdf&amp;page=Data_Platform%2FTransform_data&amp;action=show-download-screen"><span>Download as PDF</span></a></li><li id="t-print" class="mw-list-item"><a href="/w/index.php?title=Data_Platform/Transform_data&amp;printable=yes" title="Printable version of this page [p]" accesskey="p"><span>Printable version</span></a></li> </ul> </div> </div> </div> </div> </div> </div> </nav> </div> </div> </div> <div class="vector-column-end"> <div class="vector-sticky-pinned-container"> <nav class="vector-page-tools-landmark" aria-label="Page tools"> <div id="vector-page-tools-pinned-container" class="vector-pinned-container"> </div> </nav> <nav class="vector-appearance-landmark" aria-label="Appearance"> <div id="vector-appearance-pinned-container" class="vector-pinned-container"> <div id="vector-appearance" class="vector-appearance vector-pinnable-element"> <div class="vector-pinnable-header vector-appearance-pinnable-header vector-pinnable-header-pinned" data-feature-name="appearance-pinned" data-pinnable-element-id="vector-appearance" data-pinned-container-id="vector-appearance-pinned-container" data-unpinned-container-id="vector-appearance-unpinned-container" > <div class="vector-pinnable-header-label">Appearance</div> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-pin-button" data-event-name="pinnable-header.vector-appearance.pin">move to sidebar</button> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" data-event-name="pinnable-header.vector-appearance.unpin">hide</button> </div> </div> </div> </nav> </div> </div> <div 
id="bodyContent" class="vector-body" aria-labelledby="firstHeading" data-mw-ve-target-container> <div class="vector-body-before-content"> <div class="mw-indicators"> </div> <div id="siteSub" class="noprint">From Wikitech</div> </div> <div id="contentSub"><div id="mw-content-subtitle"><div class="subpages">&lt; <bdi dir="ltr"><a href="/wiki/Data_Platform" title="Data Platform">Data Platform</a></bdi></div></div></div> <div id="mw-content-text" class="mw-body-content"><div class="mw-content-ltr mw-parser-output" lang="en" dir="ltr"><style data-mw-deduplicate="TemplateStyles:r2232773">.mw-parser-output .tpl-navsidebar{max-width:22em;background:var(--background-color-base,#fff);color:var(--color-base,#202122);border:1px solid var(--border-color-base,#a2a9b1);float:right;clear:right;margin:.5em 0 1em 1em}.mw-parser-output .tpl-navsidebar-floatright{float:right;clear:right;margin:.5em 0 1em 1em}.mw-parser-output .tpl-navsidebar-floatleft{float:left;clear:left;margin:.5em 1em 1em 0}.mw-parser-output .tpl-navsidebar-floatnone{float:none;clear:both;margin:.5em 0}.mw-parser-output .tpl-navsidebar-topimage{margin:0 0 16px 0}.mw-parser-output .tpl-navsidebar-title{margin:8px 16px;border-bottom:3px solid var(--border-color-muted,#eaecf0);font-size:20px;text-align:center}.mw-parser-output .tpl-navsidebar-image{margin:0 0 8px}.mw-parser-output .tpl-navsidebar-content{margin:0 0 16px 0;padding:0 8px}.mw-parser-output .tpl-navsidebar-heading{margin:8px 0;font-weight:bold}.mw-parser-output .tpl-navsidebar-foot{padding:0 8px;margin:0;text-align:right;font-size:smaller}@media not (min-width:720px){.mw-parser-output .tpl-navsidebar{float:none;clear:both;margin:.5em 0;max-width:none}}</style><div role="navigation" class="navigation-not-searchable tpl-navsidebar" style=""><p class="tpl-navsidebar-title"><a href="/wiki/Data_Platform" title="Data Platform">Data Platform</a></p><div class="tpl-navsidebar-contents"><div class="tpl-navsidebar-content"> <div class="mw-inputbox-centered" 
style=""><form name="searchbox" class="searchbox mw-inputbox-form-inline" action="/wiki/Special:Search"><div class="cdx-text-input"><input class="mw-searchInput searchboxInput cdx-text-input__input" name="search" placeholder="Search Data Platform documentation" size="40" dir="ltr"/></div><input type="hidden" value="incategory:Data_platform" name="searchfilter"/> <input type="submit" name="fulltext" value="Search" class="cdx-button"/><input type="hidden" value="Search" name="fulltext"/></form></div> </div><div class="tpl-navsidebar-content"> <p class="tpl-navsidebar-heading"><a href="/wiki/Data_Platform/Discover_data" title="Data Platform/Discover data">Discover data</a></p><p class="mw-empty-elt"> </p><ul><li><a class="external text" href="https://datahub.wikimedia.org/">Explore datasets in DataHub</a></li> <li><a href="/wiki/Data_Platform/Data_Lake" title="Data Platform/Data Lake">Data Lake</a> <ul><li><a href="/wiki/Data_Platform/Data_Lake/Traffic" title="Data Platform/Data Lake/Traffic">Traffic data</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Edits" title="Data Platform/Data Lake/Edits">Edits data</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Content" title="Data Platform/Data Lake/Content">Content data</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Events" title="Data Platform/Data Lake/Events">Events data</a></li></ul></li> <li><a href="/wiki/Data_Platform/AQS" title="Data Platform/AQS">Analytics Query Service (AQS)</a></li></ul> </div><div class="tpl-navsidebar-content"> <p class="tpl-navsidebar-heading"><a href="/wiki/Data_Platform/Analyze_data" title="Data Platform/Analyze data">Access, query, and analyze data</a></p><p class="mw-empty-elt"> </p><ul><li><a href="/wiki/Data_Platform/Data_access" title="Data Platform/Data access">Get access to internal data</a></li> <li>Analytics tools <ul><li><a href="/wiki/Data_Platform/Systems/Jupyter" title="Data Platform/Systems/Jupyter">Jupyter notebooks</a></li> <li><a 
href="/wiki/Data_Platform/Systems/Superset" title="Data Platform/Systems/Superset">Superset</a></li> <li><a href="/wiki/Data_Platform/Systems/Spark" title="Data Platform/Systems/Spark">Spark</a></li> <li><a href="/wiki/Data_Platform/Systems/Presto" title="Data Platform/Systems/Presto">Presto</a></li></ul></li> <li><a rel="nofollow" class="external text" href="https://github.com/wikimedia/wmfdata-python/blob/main/docs/quickstart.ipynb">Quickstart notebook</a></li> <li><a href="/wiki/Data_Platform/Internal_API_requests" title="Data Platform/Internal API requests">Internal API requests</a></li></ul> </div><div class="tpl-navsidebar-content"> <p class="tpl-navsidebar-heading"><a class="mw-selflink selflink">Transform and publish data</a></p><p class="mw-empty-elt"> </p><ul><li><a href="https://www.mediawiki.org/wiki/Data_Platform_Engineering/Intake_Process" class="extiw" title="mw:Data Platform Engineering/Intake Process">Get help or file a request</a></li> <li><a class="mw-selflink-fragment" href="#Plan_data_lifecyle">Plan data lifecycle</a></li> <li>Build tables and datasets <ul><li><a href="/wiki/Data_Platform/Dataset_creation" title="Data Platform/Dataset creation">Dataset creation process</a></li> <li><a href="/wiki/Data_Platform/Data_modeling_guidelines" title="Data Platform/Data modeling guidelines"> Data modeling guidelines</a></li> <li><a href="/wiki/Data_Platform/Systems/Airflow/Developer_guide" title="Data Platform/Systems/Airflow/Developer guide">Airflow developer guide</a></li> <li><a href="/wiki/Data_Platform/Systems/Hive" title="Data Platform/Systems/Hive">Hive</a></li> <li><a href="/wiki/Data_Platform/Systems/Iceberg" title="Data Platform/Systems/Iceberg">Iceberg</a></li> <li><a href="/wiki/Data_Platform/Systems/Druid" title="Data Platform/Systems/Druid">Druid</a></li></ul></li> <li>Share data and dashboards <ul><li><a href="https://foundation.wikimedia.org/wiki/Legal:Data_publication_guidelines" class="extiw" title="foundation:Legal:Data publication 
guidelines"> Data publication guidelines</a></li> <li><a href="/wiki/Data_Platform/Systems/Turnilo" title="Data Platform/Systems/Turnilo">Turnilo</a></li> <li><a href="/wiki/Data_Platform/Systems/Superset" title="Data Platform/Systems/Superset">Superset</a></li> <li><a href="/wiki/Data_Platform/Systems/analytics.wikimedia.org" title="Data Platform/Systems/analytics.wikimedia.org"> analytics.wikimedia.org</a></li> <li><a href="/wiki/Data_Platform/Web_publication" title="Data Platform/Web publication"> Web publication guide</a></li> <li><a href="/wiki/Data_Platform/Systems/Dashiki" title="Data Platform/Systems/Dashiki"> Dashiki</a></li></ul></li> <li>Manage published data <ul><li><a href="/wiki/Data_Incident_management" class="mw-redirect" title="Data Incident management"> Data Incident management</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Data_Issues" title="Data Platform/Data Lake/Data Issues"> Data Issue reporting</a></li> <li><a href="https://foundation.wikimedia.org/wiki/Legal:Data_retention_guidelines" class="extiw" title="foundation:Legal:Data retention guidelines">Data Retention Guidelines</a></li> <li><a href="/wiki/Data_Platform/Systems/Event_Data_retention" title="Data Platform/Systems/Event Data retention">Event data retention</a></li> <li><a href="/wiki/Data_Platform/Event_Sanitization" title="Data Platform/Event Sanitization">Event Sanitization</a></li> <li><a href="/wiki/Data_Platform/Dataset_archiving_and_deletion" title="Data Platform/Dataset archiving and deletion">Dataset archiving and deletion</a></li></ul></li></ul> </div><div class="tpl-navsidebar-content"> <p class="tpl-navsidebar-heading">Collect data</p><p class="mw-empty-elt"> </p><ul><li><a href="/wiki/Metrics_Platform" title="Metrics Platform">Metrics platform</a></li> <li><a href="/wiki/Event_Platform/Instrumentation_How_To" title="Event Platform/Instrumentation How To">Instrumentation tutorial</a></li> <li><a href="/wiki/Event_Platform" title="Event Platform">Event 
Platform</a></li></ul> <hr/> </div><div class="tpl-navsidebar-content"> <p class="tpl-navsidebar-heading">Data Platform infrastructure and operations</p><p class="mw-empty-elt"> </p><ul><li><a href="/wiki/Data_Platform/Systems" title="Data Platform/Systems">Systems overview</a></li> <li><a href="/wiki/Category:Data_pipelines" title="Category:Data pipelines"> Data pipelines</a></li> <li>Search <ul><li><a href="/wiki/Search/Technical_interactions" title="Search/Technical interactions"> Using search for new features </a></li> <li><a href="/wiki/Search_Platform/Documentation#Search" title="Search Platform/Documentation"> Search Platform </a></li> <li><a href="/wiki/Wikidata_Query_Service" title="Wikidata Query Service"> Wikidata Query Service (WDQS) </a></li></ul></li> <li>Operations and team processes <ul><li><a href="/wiki/Data_Platform_Engineering/Ops_week" title="Data Platform Engineering/Ops week">Ops week</a></li> <li><a href="/wiki/Data_Platform_Engineering" title="Data Platform Engineering">Team pages on Wikitech</a></li> <li><a href="https://www.mediawiki.org/wiki/Data_Platform_Engineering" class="extiw" title="mw:Data Platform Engineering">Team and project pages on MediaWiki.org</a></li></ul></li></ul> </div></div><p class="tpl-navsidebar-foot">[<span class="noprint plainlinks"><a class="external text" href="https://wikitech.wikimedia.org/w/index.php?title=Template:Navigation_Data_Platform&amp;action=edit"><span title="Edit this template">edit</span></a></span>]</p></div> <p><br/> This page describes the process and internal tools for creating datasets and reports based on private/internal data sources. For info about publicly-accessible resources and data, see <a href="https://meta.wikimedia.org/wiki/Research:Data" class="extiw" title="meta:Research:Data">meta:Research:Data</a>. 
</p> <div class="mw-heading mw-heading2 ext-discussiontools-init-section"><h2 id="Before_you_start" data-mw-thread-id="h-Before_you_start"><span data-mw-comment-start="" id="h-Before_you_start"></span>Before you start<span data-mw-comment-end="h-Before_you_start"></span></h2><!--__DTELLIPSISBUTTON__{"threadItem":{"headingLevel":2,"name":"h-","type":"heading","level":0,"id":"h-Before_you_start","replies":[]}}--></div> <p>This page assumes you have already <a href="/wiki/Data_Platform/Discover_data" title="Data Platform/Discover data"> identified datasets relevant for your analysis</a>, and that you can <a href="/wiki/Data_Platform/Analyze_data" title="Data Platform/Analyze data"> access and query that data using internal analysis tools</a>. </p><p>Before you create a new table or dataset, check the <a href="/wiki/Data_Platform/Discover_data" title="Data Platform/Discover data">existing data sources</a> and <a class="external text" href="https://datahub.wikimedia.org/">datasets in DataHub</a> to see if the data you need is already there. If not, is there a similar table that could be updated to meet your needs? 
</p> <style data-mw-deduplicate="TemplateStyles:r2232765">.mw-parser-output .ambox{border:1px solid #a2a9b1;border-left:10px solid #36c;background-color:#fbfbfb;box-sizing:border-box}.mw-parser-output .ambox+link+.ambox,.mw-parser-output .ambox+link+style+.ambox,.mw-parser-output .ambox+link+link+.ambox,.mw-parser-output .ambox+.mw-empty-elt+link+.ambox,.mw-parser-output .ambox+.mw-empty-elt+link+style+.ambox,.mw-parser-output .ambox+.mw-empty-elt+link+link+.ambox{margin-top:-1px}html body.mediawiki .mw-parser-output .ambox.mbox-small-left{margin:4px 1em 4px 0;overflow:hidden;width:238px;border-collapse:collapse;font-size:88%;line-height:1.25em}.mw-parser-output .ambox-speedy{border-left:10px solid #b32424;background-color:#fee7e6}.mw-parser-output .ambox-delete{border-left:10px solid #b32424}.mw-parser-output .ambox-content{border-left:10px solid #f28500}.mw-parser-output .ambox-style{border-left:10px solid #fc3}.mw-parser-output .ambox-move{border-left:10px solid #9932cc}.mw-parser-output .ambox-protection{border-left:10px solid #a2a9b1}.mw-parser-output .ambox .mbox-text{border:none;padding:0.25em 0.5em;width:100%}.mw-parser-output .ambox .mbox-image{border:none;padding:2px 0 2px 0.5em;text-align:center}.mw-parser-output .ambox .mbox-imageright{border:none;padding:2px 0.5em 2px 0;text-align:center}.mw-parser-output .ambox .mbox-empty-cell{border:none;padding:0;width:1px}.mw-parser-output .ambox .mbox-image-div{width:52px}html.client-js body.skin-minerva .mw-parser-output .mbox-text-span{margin-left:23px!important}@media(min-width:720px){.mw-parser-output .ambox{margin:0 10%}}</style><table class="box-Notice plainlinks metadata ambox ambox-notice" role="presentation"><tbody><tr><td class="mbox-image"><div class="mbox-image-div"><span typeof="mw:File"><span><img alt="" src="//upload.wikimedia.org/wikipedia/commons/thumb/1/1d/Information_icon4.svg/40px-Information_icon4.svg.png" decoding="async" width="40" height="40" class="mw-file-element" 
srcset="//upload.wikimedia.org/wikipedia/commons/thumb/1/1d/Information_icon4.svg/60px-Information_icon4.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/1/1d/Information_icon4.svg/80px-Information_icon4.svg.png 2x" data-file-width="620" data-file-height="620"/></span></span></div></td><td class="mbox-text"><div class="mbox-text-span">To define a new instrument, generate new product metrics, or run experiments, use the <a href="/wiki/Metrics_Platform" title="Metrics Platform">Metrics Platform documentation</a>.</div></td></tr></tbody></table> <p><span id="new-data"></span> </p> <div class="mw-heading mw-heading2 ext-discussiontools-init-section"><h2 id="Plan_data_lifecyle" data-mw-thread-id="h-Plan_data_lifecyle"><span data-mw-comment-start="" id="h-Plan_data_lifecyle"></span>Plan data lifecycle<span data-mw-comment-end="h-Plan_data_lifecyle"></span></h2><!--__DTELLIPSISBUTTON__{"threadItem":{"headingLevel":2,"name":"h-","type":"heading","level":0,"id":"h-Plan_data_lifecyle","replies":["h-Get_approval_for_new_data_collection-Plan_data_lifecyle"]}}--></div> <style data-mw-deduplicate="TemplateStyles:r2211903">.mw-parser-output .note{background-position:left 7px top 50%;padding:0.5em 0.5em 0.5em 40px;margin:0.5em 0;overflow:hidden;background-color:#f8f9fa;color:#333;background-repeat:no-repeat;border:1px solid #ddd}.mw-parser-output .note-inline{display:inline-block;vertical-align:middle}.mw-parser-output .note-info{background-color:#eaf3ff;color:#333;background-image:url("https://upload.wikimedia.org/wikipedia/commons/e/ec/OOjs_UI_icon_information-progressive.svg");background-size:25px;border-color:#a3caff;padding-left:40px;min-height:25px}.mw-parser-output .note-reminder{background-color:#fff9ea;color:#333;background-image:url("https://upload.wikimedia.org/wikipedia/commons/a/a8/OOjs_UI_icon_lightbulb-yellow.svg");background-size:25px;border-color:#fc3;min-height:25px}.mw-parser-output 
.note-warn{background-color:#fff9ea;color:#333;background-image:url("https://upload.wikimedia.org/wikipedia/commons/3/3b/OOjs_UI_icon_alert-warning.svg");background-size:25px;border-color:#fc3;min-height:25px}.mw-parser-output .note-error{background-color:#fee7e6;color:#333;background-image:url("https://upload.wikimedia.org/wikipedia/commons/b/bf/OOjs_UI_icon_notice-destructive.svg");background-size:25px;border-color:#c33;min-height:25px}@media screen{html.skin-theme-clientpref-night .mw-parser-output .note{background-color:transparent;color:inherit}}@media screen and (prefers-color-scheme:dark){html.skin-theme-clientpref-os .mw-parser-output .note{background-color:transparent;color:inherit}}</style><div role="note" class="note note-info"><b>FIXME:</b> Update this section when data lifecycle documentation is more complete</div> <ul><li><a href="/wiki/Data_Platform/Dataset_creation" title="Data Platform/Dataset creation"> Dataset creation process</a></li> <li><a href="/wiki/Data_Platform/Data_lifecycle_management" title="Data Platform/Data lifecycle management">Data lifecycle management process</a></li></ul> <div class="mw-heading mw-heading3"><h3 id="Get_approval_for_new_data_collection" data-mw-thread-id="h-Get_approval_for_new_data_collection-Plan_data_lifecyle"><span data-mw-comment-start="" id="h-Get_approval_for_new_data_collection-Plan_data_lifecyle"></span>Get approval for new data collection<span data-mw-comment-end="h-Get_approval_for_new_data_collection-Plan_data_lifecyle"></span></h3></div> <p>If you intend to collect a new type of data or design a new instrument for experimentation or product analysis, follow these data collection policies and procedures to submit and get approval for your data collection activity: </p> <ul><li><a rel="nofollow" class="external text" href="https://docs.google.com/document/d/14CfJ_iO_icwUSXfJIdjyOJN-j1ZTdcrl79ulR-AonYo/edit#">Data Collection Guidelines</a> (draft, currently internal only, succeeds <a 
href="https://www.mediawiki.org/wiki/Wikimedia_Product/Better_use_of_data/DACI" class="extiw" title="mw:Wikimedia Product/Better use of data/DACI">Instrumentation DACI</a>; will eventually be posted to Foundation wiki)</li> <li>WMF staff should use the <a class="external text" href="https://office.wikimedia.org/wiki/Legal,_Safety_%26_Security_Service_Center">Legal, Safety and Security Service Center (L3SC)</a> to <a rel="nofollow" class="external text" href="https://form.asana.com/?k=3s2GnIIOHcAw2lLjpawLVQ&amp;d=3758245663860">submit a request</a> to have data collection plans reviewed and approved.</li> <li>(draft) <a href="https://meta.wikimedia.org/wiki/User:MPopov_(WMF)/Sandbox/Measurement_plans_and_instrumentation_specifications" class="extiw" title="meta:User:MPopov (WMF)/Sandbox/Measurement plans and instrumentation specifications">guide on measurement plans and instrumentation specifications</a> and <a rel="nofollow" class="external text" href="https://docs.google.com/spreadsheets/d/1WFVvphtACS-_EKfpe67mVzOtbx4RFesRvWOT1obGvTY/edit#gid=1635536298">Instrumentation process and spec template</a> (Google sheet)</li></ul> <div class="mw-heading mw-heading2 ext-discussiontools-init-section"><h2 id="Model_and_document_your_data" data-mw-thread-id="h-Model_and_document_your_data"><span data-mw-comment-start="" id="h-Model_and_document_your_data"></span>Model and document your data<span data-mw-comment-end="h-Model_and_document_your_data"></span></h2><!--__DTELLIPSISBUTTON__{"threadItem":{"headingLevel":2,"name":"h-","type":"heading","level":0,"id":"h-Model_and_document_your_data","replies":[]}}--></div> <style data-mw-deduplicate="TemplateStyles:r2126319">.mw-parser-output .tpl-contentgrid{margin:1em 0;display:grid}.mw-parser-output .tpl-contentgrid .mw-tpl-colorbox{margin:0}</style> <div class="tpl-contentgrid" style="grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); grid-gap: 2em; grid-auto-rows: minmax(100px, auto); width: auto;"><div 
class="mw-tpl-colorbox" style=""><style data-mw-deduplicate="TemplateStyles:r2199608">.mw-parser-output .mw-tpl-colorbox{box-sizing:border-box;margin:0.5em 0.5em 1em 0.5em;border-radius:2px;overflow:hidden;background:var(--background-color-base,#fff);color:var(--color-base,#202122);border:1px solid var(--border-color-base,#a2a9b1);box-shadow:0 2px 2px rgba(0,0,0,0.2)}.mw-parser-output .mw-tpl-colorbox-title{background:var(--background-color-interactive,#eaecf0);color:var(--color-emphasized,#000000);display:flex;gap:0.5rem;padding-top:0.4rem;padding-bottom:0.4rem}.mw-parser-output .mw-tpl-colorbox-title--linked:hover{background:var(--background-color-progressive-subtle,#eaf3ff)}.mw-parser-output .mw-tpl-colorbox-title>*:first-child{padding-left:1rem}.mw-parser-output .mw-tpl-colorbox-title>*:last-child{padding-right:1rem}.mw-parser-output .mw-tpl-colorbox-title>strong,.mw-parser-output .mw-tpl-colorbox-title>a{flex-grow:1}.mw-parser-output .mw-tpl-colorbox-title-icon{opacity:0.8}.mw-parser-output .mw-tpl-colorbox-title-corner{float:right;font-size:0.7em}.mw-parser-output .mw-tpl-colorbox-content{padding:0.5rem 1rem}.mw-parser-output .mw-tpl-colorbox-content::after{content:"";display:block;clear:both}</style><div class="mw-tpl-colorbox-title" style=""><strong>Data modeling</strong> </div><div class="mw-tpl-colorbox-content"> <p>Follow the process defined in the <a href="/wiki/Data_Platform/Data_modeling_guidelines" title="Data Platform/Data modeling guidelines"> Data modeling guidelines</a> to define your schema, connect with data stewards and technical stewards, and determine who will build the dataset. </p><p>If you're defining a new instrument to collect data, follow the <a href="/wiki/Metrics_Platform#Workflows" title="Metrics Platform">Metrics Platform workflow guides</a>. 
</p> </div> </div> <div class="mw-tpl-colorbox" style=""><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r2199608"><div class="mw-tpl-colorbox-title" style=""><strong>Data documentation</strong> </div><div class="mw-tpl-colorbox-content"> <p>Follow the documentation guidelines for the type of data you're producing or collecting: </p> <ul><li><a href="/wiki/Metrics_Platform/How_to/Create_First_Metrics_Platform_Instrument#Instrument_documentation" class="mw-redirect" title="Metrics Platform/How to/Create First Metrics Platform Instrument">Instrument documentation</a></li> <li><a href="/wiki/Data_Platform/Systems/DataHub/Data_Catalog_Documentation_Guide" title="Data Platform/Systems/DataHub/Data Catalog Documentation Guide">Data catalog documentation guide</a></li> <li>TODO: more comprehensive dataset documentation guidelines and requirements <a href="https://phabricator.wikimedia.org/T349103" class="extiw" title="phab:T349103">phab:T349103</a></li></ul> <p>To find existing dataset documentation, see <a href="/wiki/Data_Platform/Discover_data" title="Data Platform/Discover data">Discover data</a>. 
</p> </div> </div></div> <div class="mw-heading mw-heading2 ext-discussiontools-init-section"><h2 id="Build_your_table_or_dataset" data-mw-thread-id="h-Build_your_table_or_dataset"><span data-mw-comment-start="" id="h-Build_your_table_or_dataset"></span>Build your table or dataset<span data-mw-comment-end="h-Build_your_table_or_dataset"></span></h2><!--__DTELLIPSISBUTTON__{"threadItem":{"headingLevel":2,"name":"h-","type":"heading","level":0,"id":"h-Build_your_table_or_dataset","replies":["h-Table_formats_and_storage-Build_your_table_or_dataset"]}}--></div> <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r2126319"> <div class="tpl-contentgrid" style="grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); grid-gap: 2em; grid-auto-rows: minmax(100px, auto); width: auto;"><div class="mw-tpl-colorbox" style=""><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r2199608"><div class="mw-tpl-colorbox-title" style=""><strong>Batch transforms</strong> </div><div class="mw-tpl-colorbox-content"> <p>Use <a href="/wiki/Data_Platform/Systems/Airflow" title="Data Platform/Systems/Airflow">Airflow</a> to run jobs and schedule batch workflows that generate new Data Lake tables, metrics, or other transformations based on internal data sources. 
</p> <ul><li>Developer guide: <a href="/wiki/Data_Platform/Systems/Airflow/Developer_guide" title="Data Platform/Systems/Airflow/Developer guide">Create Airflow DAGs and queries</a></li> <li>Tutorial: <a href="/wiki/Data_Platform/Systems/Airflow/Developer_guide/Python_Job_Repos" title="Data Platform/Systems/Airflow/Developer guide/Python Job Repos"> Python job repository</a></li> <li>References: <ul><li><a href="/wiki/Data_Platform/Systems/Spark" title="Data Platform/Systems/Spark">Spark</a></li> <li><a href="/wiki/Data_Platform/Systems/Hive/Queries" title="Data Platform/Systems/Hive/Queries"> Hive queries and troubleshooting</a> (support for Hive querying is <a href="/wiki/Data_Platform/Systems/Cluster/Iceberg" class="mw-redirect" title="Data Platform/Systems/Cluster/Iceberg">being phased out</a>)</li></ul></li></ul> </div> </div> <div class="mw-tpl-colorbox" style=""><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r2199608"><div class="mw-tpl-colorbox-title" style=""><strong>Event data, instrumentation and experiments</strong> </div><div class="mw-tpl-colorbox-content"> <p>To produce and consume instrumentation data from WMF products, use the <a href="/wiki/Metrics_Platform" title="Metrics Platform">Metrics Platform</a>. It provides standard product metrics schemas and client libraries for data collection using <a href="/wiki/Event_Platform" title="Event Platform">Event Platform</a>. 
</p><p>If your data collection plans are approved, get started instrumenting your event data collection: </p> <ul><li>See the <a href="/wiki/Event_Platform/Instrumentation_How_To" title="Event Platform/Instrumentation How To">Event instrumentation tutorial</a> and the <a href="/wiki/Metrics_Platform#Workflows" title="Metrics Platform">Metrics Platform workflow guides</a> for how to write and test your instrumentation code locally.</li></ul> </div> </div></div> <table class="mw-collapsible mw-collapsed" style="margin: 2px 0; background: #FFF; color: #000; width: 100%; border: 1px solid #AAA; padding: 0; border-spacing: 0; border-collapse: collapse;"> <tbody><tr> <th style="padding: 2px;"><div style="background:#CCF;padding:2px 0.5em;font-weight:bold;text-align:center;"> Advanced topics for data engineers </div> </th></tr> <tr> <td style="padding: 2px; background:transparent;font-weight:normal;text-align:left;"> <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r2126319"> <div class="tpl-contentgrid" style="grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); grid-gap: 2em; grid-auto-rows: minmax(100px, auto); width: auto;"><div class="mw-tpl-colorbox" style=""><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r2199608"><div class="mw-tpl-colorbox-title" style=""><strong>Data pipelines and stream processing</strong> </div><div class="mw-tpl-colorbox-content"> <ul><li><a href="/wiki/Event_Platform/Stream_Processing/Flink" title="Event Platform/Stream Processing/Flink">Flink</a> (stream processing)</li> <li><a href="/wiki/MediaWiki_Event_Enrichment" title="MediaWiki Event Enrichment">MediaWiki Event Enrichment</a></li> <li><a href="/wiki/Data_Platform/Systems/Cluster" title="Data Platform/Systems/Cluster">Data Lake pipelines</a></li> <li>TODO: what else should be linked here?</li></ul> </div> </div> <div class="mw-tpl-colorbox" style=""><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r2199608"><div 
class="mw-tpl-colorbox-title" style=""><strong>Event Platform schemas</strong> </div><div class="mw-tpl-colorbox-content"> <p>Schemas define the structure of event data. They enable the Event Platform to validate data, and ensure that consumers can rely upon and integrate with it. </p> <ul><li><a href="/wiki/Event_Platform/Schemas" title="Event Platform/Schemas">Create and materialize event schemas</a></li> <li><a href="/wiki/Event_Platform/Schemas/Guidelines" title="Event Platform/Schemas/Guidelines">Schema guidelines</a></li> <li><a rel="nofollow" class="external text" href="https://github.com/wikimedia/jsonschema-tools">jsonschema-tools</a> library</li></ul> </div> </div> <div class="mw-tpl-colorbox" style=""><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r2199608"><div class="mw-tpl-colorbox-title mw-tpl-colorbox-title--linked" style=""><a href="/wiki/Event_Platform/Stream_Configuration" title="Event Platform/Stream Configuration"><strong>Stream configuration and deployment</strong></a> </div><div class="mw-tpl-colorbox-content"> <ul><li><a href="/wiki/Event_Platform/Stream_Configuration" title="Event Platform/Stream Configuration"> Event Platform: Stream configuration guide</a></li> <li><a href="/wiki/Metrics_Platform/Stream_configuration" title="Metrics Platform/Stream configuration">Metrics Platform: Creating a stream configuration</a></li> <li><a href="/wiki/Event_Platform/Instrumentation_How_To#Deployment" title="Event Platform/Instrumentation How To"> Stream deployment</a></li> <li><a href="/wiki/Event_Platform/Event_Utilities" title="Event Platform/Event Utilities">Event utilities</a>: code libraries interacting with stream config, schemas and producing events to Kafka</li></ul> </div> </div></div> </td></tr></tbody></table> <div class="mw-heading mw-heading3"><h3 id="Table_formats_and_storage" data-mw-thread-id="h-Table_formats_and_storage-Build_your_table_or_dataset"><span data-mw-comment-start="" 
id="h-Table_formats_and_storage-Build_your_table_or_dataset"></span>Table formats and storage<span data-mw-comment-end="h-Table_formats_and_storage-Build_your_table_or_dataset"></span></h3></div> <p>You can store data in private namespaces in Hive or Iceberg, but product data should be in Iceberg (for exceptions, <a href="https://www.mediawiki.org/wiki/Data_Platform_Engineering/Intake_Process" class="extiw" title="mw:Data Platform Engineering/Intake Process">contact the team</a>). </p> <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r2126319"> <div class="tpl-contentgrid" style="grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); grid-gap: 2em; grid-auto-rows: minmax(100px, auto); width: auto;"><div class="mw-tpl-colorbox" style=""><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r2199608"><div class="mw-tpl-colorbox-title mw-tpl-colorbox-title--linked" style=""><a href="/wiki/Data_Platform/Systems/Iceberg" title="Data Platform/Systems/Iceberg"><strong>Iceberg</strong></a> </div><div class="mw-tpl-colorbox-content"> <p><a href="/wiki/Data_Platform/Systems/Iceberg" title="Data Platform/Systems/Iceberg">Iceberg</a> is the successor to <a href="/wiki/Data_Platform/Systems/Hive" title="Data Platform/Systems/Hive">Hive</a>. Both Hive and Iceberg table formats can store data using a variety of underlying file formats; WMF normally uses Parquet. </p> </div> </div> <div class="mw-tpl-colorbox" style=""><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r2199608"><div class="mw-tpl-colorbox-title mw-tpl-colorbox-title--linked" style=""><a href="/wiki/Data_Platform/Systems/Hive" title="Data Platform/Systems/Hive"><strong>Hive</strong></a> </div><div class="mw-tpl-colorbox-content"> <p><a href="/wiki/Data_Platform/Systems/Hive" title="Data Platform/Systems/Hive">Hive</a> is a data storage framework that enables you to use SQL to work with various file formats stored in HDFS. 
The "Hive metastore" is a centralized repository for metadata about these data files stored in the Data Lake, and all three SQL query engines WMF uses (Presto, Spark SQL, and Hive) rely on it. </p> </div> </div> <div class="mw-tpl-colorbox" style=""><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r2199608"><div class="mw-tpl-colorbox-title mw-tpl-colorbox-title--linked" style=""><a href="/wiki/Data_Platform/Systems/Druid" title="Data Platform/Systems/Druid"><strong>Druid</strong></a> </div><div class="mw-tpl-colorbox-content"> <p>Some Data Lake datasets are available in <a href="/wiki/Data_Platform/Systems/Druid" title="Data Platform/Systems/Druid">Druid</a>, which is separate from Hive and HDFS, and allows quick exploration and dashboarding of those datasets in <a href="/wiki/Data_Platform/Systems/Turnilo" title="Data Platform/Systems/Turnilo">Turnilo</a> and <a href="/wiki/Data_Platform/Systems/Superset" title="Data Platform/Systems/Superset">Superset</a>. </p> <ul><li><a href="/wiki/Data_Platform/Systems/Hive_to_Druid_Ingestion_Pipeline" title="Data Platform/Systems/Hive to Druid Ingestion Pipeline"> Hive to Druid ingestion</a></li> <li><a href="https://www.mediawiki.org/wiki/Wikimedia_Product/Data_dictionary#Druid_Data_Tables_in_Superset/Turnilo" class="extiw" title="mw:Wikimedia Product/Data dictionary"> Druid data tables in Superset/Turnilo</a></li></ul> </div> </div></div> <table class="mw-collapsible mw-collapsed" style="margin: 2px 0; background: #FFF; color: #000; width: 100%; border: 1px solid #AAA; padding: 0; border-spacing: 0; border-collapse: collapse;"> <tbody><tr> <th style="padding: 2px;"><div style="background:#CCF;padding:2px 0.5em;font-weight:bold;text-align:center;"> Advanced topics for data engineers </div> </th></tr> <tr> <td style="padding: 2px; background:transparent;font-weight:normal;text-align:left;"> <p>Cassandra: </p> <ul><li>The AQS <a href="/wiki/Cassandra/Clusters#Generated_Data_Platform_(neé_AQS)" 
title="Cassandra/Clusters">Cassandra cluster</a> stores <a href="/wiki/Data_Platform/Systems/AQS" title="Data Platform/Systems/AQS">Analytics Query Service (AQS)</a> datasets and <a href="https://www.mediawiki.org/wiki/Platform_Engineering_Team/Data_Value_Stream#What_is_generated_data?" class="extiw" title="mw:Platform Engineering Team/Data Value Stream">generated datasets</a>, along with <a href="/wiki/Image-suggestion" title="Image-suggestion">Image suggestions</a> data.</li> <li>TODO: Other references?</li></ul> </td></tr></tbody></table> <p><span id="dashboards"></span> </p> <div class="mw-heading mw-heading2 ext-discussiontools-init-section"><h2 id="Share_data_and_dashboards" data-mw-thread-id="h-Share_data_and_dashboards"><span data-mw-comment-start="" id="h-Share_data_and_dashboards"></span>Share data and dashboards<span data-mw-comment-end="h-Share_data_and_dashboards"></span></h2><!--__DTELLIPSISBUTTON__{"threadItem":{"headingLevel":2,"name":"h-","type":"heading","level":0,"id":"h-Share_data_and_dashboards","replies":["h-Before_you_publish_any_data-Share_data_and_dashboards","h-Share_queries_and_visualizations-Share_data_and_dashboards","h-Tools_and_platforms_for_publishing_data_externally-Share_data_and_dashboards"]}}--></div> <div class="mw-heading mw-heading3"><h3 id="Before_you_publish_any_data" data-mw-thread-id="h-Before_you_publish_any_data-Share_data_and_dashboards"><span data-mw-comment-start="" id="h-Before_you_publish_any_data-Share_data_and_dashboards"></span>Before you publish any data<span data-mw-comment-end="h-Before_you_publish_any_data-Share_data_and_dashboards"></span></h3></div> <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r2126319"> <div class="tpl-contentgrid" style="grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); grid-gap: 2em; grid-auto-rows: minmax(100px, auto); width: auto;"><div class="mw-tpl-colorbox" style=""><link rel="mw-deduplicated-inline-style" 
href="mw-data:TemplateStyles:r2199608"><div class="mw-tpl-colorbox-title" style=""><strong>Learn how to apply the Data Publication guidelines</strong> </div><div class="mw-tpl-colorbox-content"> <ul><li><a href="https://foundation.wikimedia.org/wiki/Legal:Data_publication_guidelines" class="extiw" title="foundation:Legal:Data publication guidelines"> Data Publication guidelines</a></li> <li>How to use data publication guidelines to evaluate risk and make publication decisions: <a rel="nofollow" class="external text" href="https://drive.google.com/file/d/1KTsy7Wm6e4bAPLX6CZeKmW336M3vC8KW/view?usp=sharing">GDrive</a>, <a rel="nofollow" class="external text" href="https://www.youtube.com/watch?v=7bSK6wy7QBI">YouTube</a></li></ul> </div> </div> <div class="mw-tpl-colorbox" style=""><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r2199608"><div class="mw-tpl-colorbox-title" style=""><strong>Follow policies and procedures</strong> </div><div class="mw-tpl-colorbox-content"> <p>Policies: </p> <ul><li><a href="https://foundation.wikimedia.org/wiki/Policy:Privacy_policy" class="extiw" title="foundation:Policy:Privacy policy">WMF Privacy Policy</a></li> <li><a href="https://foundation.wikimedia.org/wiki/Legal:Country_and_Territory_Protection_List" class="extiw" title="foundation:Legal:Country and Territory Protection List">Country and Territory Protection List</a> (accessible via:</li></ul> <p><code>canonical_data.countries</code> in the Data Lake)(<a rel="nofollow" class="external text" href="https://github.com/wikimedia-research/canonical-data/blob/master/country/README.md">source docs</a>) </p><p>Procedures: </p> <ul><li><a rel="nofollow" class="external text" href="https://docs.google.com/forms/d/e/1FAIpQLSds6m1puVoWHUoeYOq-4IoVg81aqrdkuQjWX8BZTrTjdBh5Fg/viewform">Data publication log</a> (Google form)</li> <li><a href="/wiki/Data_releases" title="Data releases"> Formal open data release process</a></li> <li><a 
href="https://www.mediawiki.org/wiki/Product_Analytics/Dashboarding_Guidelines" class="extiw" title="mw:Product Analytics/Dashboarding Guidelines"> Dashboarding guidelines</a></li> <li><a href="https://www.mediawiki.org/wiki/Product_Analytics/Reporting_Guidelines" class="extiw" title="mw:Product Analytics/Reporting Guidelines"> Reporting guidelines</a></li></ul> </div> </div></div> <div class="mw-heading mw-heading3"><h3 id="Share_queries_and_visualizations" data-mw-thread-id="h-Share_queries_and_visualizations-Share_data_and_dashboards"><span data-mw-comment-start="" id="h-Share_queries_and_visualizations-Share_data_and_dashboards"></span>Share queries and visualizations<span data-mw-comment-end="h-Share_queries_and_visualizations-Share_data_and_dashboards"></span></h3></div> <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r2126319"> <div class="tpl-contentgrid" style="grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); grid-gap: 2em; grid-auto-rows: minmax(100px, auto); width: auto;"><div class="mw-tpl-colorbox" style=""><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r2199608"><div class="mw-tpl-colorbox-title" style=""><strong>GitHub, GitLab, and Jupyter notebooks</strong> </div><div class="mw-tpl-colorbox-content"> <ul><li><a href="/wiki/Data_Platform/Systems/Jupyter#Sharing_Notebooks" title="Data Platform/Systems/Jupyter">Publishing Jupyter notebooks on GitHub or GitLab</a></li> <li>Example Quarto publication: <a rel="nofollow" class="external text" href="https://kcvelaga.quarto.pub/cx-deletion-rate-variables-2024/">https://kcvelaga.quarto.pub/cx-deletion-rate-variables-2024/</a></li></ul> </div> </div> <div class="mw-tpl-colorbox" style=""><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r2199608"><div class="mw-tpl-colorbox-title" style=""><strong>Turnilo</strong> </div><div class="mw-tpl-colorbox-content"> <p>Turnilo is a web interface that provides self-service access to data stored 
in <a href="/wiki/Data_Platform/Systems/Druid" title="Data Platform/Systems/Druid">Druid</a>. In Turnilo, users who don't have full access to WMF private data can explore aggregate metrics without writing queries. However, Turnilo has some technical limitations that make it less accurate and precise than Superset. </p> <ul><li>To access Turnilo, you need a Developer account and <a href="/wiki/Data_Platform/Data_access#LDAP_access" title="Data Platform/Data access"> <code>wmf</code> or <code>nda</code> LDAP access</a>.</li> <li><a href="https://www.mediawiki.org/wiki/Wikimedia_Product/Data_dictionary#Druid_Data_Tables_in_Superset/Turnilo" class="extiw" title="mw:Wikimedia Product/Data dictionary"> Druid data tables in Superset/Turnilo</a></li> <li><a href="/wiki/Data_Platform/Systems/Turnilo" title="Data Platform/Systems/Turnilo">Turnilo documentation</a></li></ul> <p>Go to Turnilo: <a class="external text" href="https://turnilo.wikimedia.org/">turnilo.wikimedia.org</a> </p> </div> </div> <div class="mw-tpl-colorbox" style=""><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r2199608"><div class="mw-tpl-colorbox-title mw-tpl-colorbox-title--linked" style=""><a href="/wiki/Data_Platform/Systems/Superset" title="Data Platform/Systems/Superset"><strong>Superset</strong></a> </div><div class="mw-tpl-colorbox-content"> <p><a href="/wiki/Data_Platform/Systems/Superset" title="Data Platform/Systems/Superset">Superset</a> is a web interface for data visualization and exploration. Like Turnilo, it provides access to <a href="/wiki/Data_Platform/Systems/Druid" title="Data Platform/Systems/Druid">Druid</a> tables, but it also has access to data in <a href="/wiki/Data_Platform/Systems/Hive" title="Data Platform/Systems/Hive">Hive</a> (and elsewhere) via <a href="/wiki/Data_Platform/Systems/Presto" title="Data Platform/Systems/Presto">Presto</a>, and it offers more advanced slicing-and-dicing options. 
</p> <ul><li>Examples: <a href="https://www.mediawiki.org/wiki/Product_Analytics/Data_Products" class="extiw" title="mw:Product Analytics/Data Products">List of product analytics Superset dashboards</a></li> <li><a class="external text" href="https://superset.wikimedia.org/druiddatasourcemodelview/list/">List of Druid datasources in Superset</a></li> <li><a href="/wiki/Data_Platform/Systems/Superset" title="Data Platform/Systems/Superset">User docs</a></li> <li><a href="/wiki/Data_Platform/Systems/Superset/Administration" title="Data Platform/Systems/Superset/Administration"> Admin docs</a></li></ul> <p><a class="external text" href="https://superset.wikimedia.org/superset/welcome/">Go to Superset</a> </p> </div> </div></div> <p><span id="publishing-externally"></span> </p> <div class="mw-heading mw-heading3"><h3 id="Tools_and_platforms_for_publishing_data_externally" data-mw-thread-id="h-Tools_and_platforms_for_publishing_data_externally-Share_data_and_dashboards"><span data-mw-comment-start="" id="h-Tools_and_platforms_for_publishing_data_externally-Share_data_and_dashboards"></span>Tools and platforms for publishing data externally<span data-mw-comment-end="h-Tools_and_platforms_for_publishing_data_externally-Share_data_and_dashboards"></span></h3></div> <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r2126319"> <div class="tpl-contentgrid" style="grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); grid-gap: 2em; grid-auto-rows: minmax(100px, auto); width: auto;"><div class="mw-tpl-colorbox" style=""><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r2199608"><div class="mw-tpl-colorbox-title" style=""><strong>analytics.wikimedia.org</strong> </div><div class="mw-tpl-colorbox-content"> <p><a class="external text" href="https://analytics.wikimedia.org">analytics.wikimedia.org</a> is a static site that serves WMF analytics dashboards and data downloads. 
</p> <ul><li><a href="/wiki/Data_Platform/Systems/analytics.wikimedia.org" title="Data Platform/Systems/analytics.wikimedia.org"> Site documentation</a></li> <li><a href="/wiki/Data_Platform/Web_publication" title="Data Platform/Web publication"> Web publication</a>: Process for publishing ad-hoc, low-risk datasets, notebooks, or other research products on the site</li></ul> </div> </div> <div class="mw-tpl-colorbox" style=""><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r2199608"><div class="mw-tpl-colorbox-title mw-tpl-colorbox-title--linked" style=""><a href="/wiki/Data_Platform/Systems/Dashiki" title="Data Platform/Systems/Dashiki"><strong>Dashiki</strong></a> </div><div class="mw-tpl-colorbox-content"> <p>Dashiki is a dashboarding tool that lets users declare dashboards by using configuration pages on a wiki. </p> <ul><li><a href="/wiki/Data_Platform_Engineering/Dashboard_tutorial" class="mw-redirect" title="Data Platform Engineering/Dashboard tutorial">Dashiki dashboard tutorial</a></li> <li>Example dashboards: <ul><li><a class="external text" href="https://analytics.wikimedia.org/dashboards/vital-signs/#projects=eswiki,itwiki,enwiki,jawiki,dewiki,ruwiki,frwiki/metrics=Pageviews">Pageviews</a> (public)</li> <li><a class="external text" href="https://analytics.wikimedia.org/dashboards/browsers/#all-sites-by-os">Browser statistics</a> (public)</li></ul></li></ul> </div> </div></div> <div class="mw-heading mw-heading2 ext-discussiontools-init-section"><h2 id="Manage_published_data" data-mw-thread-id="h-Manage_published_data"><span data-mw-comment-start="" id="h-Manage_published_data"></span>Manage published data<span data-mw-comment-end="h-Manage_published_data"></span></h2><!--__DTELLIPSISBUTTON__{"threadItem":{"headingLevel":2,"name":"h-","type":"heading","level":0,"id":"h-Manage_published_data","replies":["h-Maintainance_and_monitoring-Manage_published_data","h-Retention_and_deletion-Manage_published_data"]}}--></div> <div 
class="mw-heading mw-heading3"><h3 id="Maintainance_and_monitoring" data-mw-thread-id="h-Maintainance_and_monitoring-Manage_published_data"><span data-mw-comment-start="" id="h-Maintainance_and_monitoring-Manage_published_data"></span>Maintenance and monitoring<span data-mw-comment-end="h-Maintainance_and_monitoring-Manage_published_data"></span></h3></div> <ul><li><a href="/wiki/Data_Incident_management" class="mw-redirect" title="Data Incident management"> Data Incident management</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Data_Issues" title="Data Platform/Data Lake/Data Issues"> Data Issue reporting</a></li></ul> <p>TODO: are there dashboards where people can check the status of canonical data pipeline generation runs on which their datasets depend? </p> <div class="mw-heading mw-heading3"><h3 id="Retention_and_deletion" data-mw-thread-id="h-Retention_and_deletion-Manage_published_data"><span data-mw-comment-start="" id="h-Retention_and_deletion-Manage_published_data"></span>Retention and deletion<span data-mw-comment-end="h-Retention_and_deletion-Manage_published_data"></span></h3></div> <ul><li><a href="https://foundation.wikimedia.org/wiki/Legal:Data_retention_guidelines" class="extiw" title="foundation:Legal:Data retention guidelines">Data Retention Guidelines</a></li> <li><a href="/wiki/Data_Platform/Systems/Event_Data_retention" title="Data Platform/Systems/Event Data retention">Event data retention</a>: data retention practices for events, and privacy best practices for creating or modifying event schemas</li> <li><a href="/wiki/Data_Platform/Event_Sanitization" title="Data Platform/Event Sanitization">Event Sanitization</a>: processes used with Event Platform data for retaining event data in Hive beyond the standard 90-day retention period.</li> <li><a href="/wiki/Data_Platform/Dataset_archiving_and_deletion" title="Data Platform/Dataset archiving and deletion">Dataset archiving and deletion</a></li></ul> <!-- NewPP limit report Parsed by 
mw‐api‐ext.codfw.main‐85c544bfcc‐62wnl Cached time: 20241029190902 Cache expiry: 2592000 Reduced expiry: false Complications: [no‐toc] DiscussionTools time usage: 0.018 seconds CPU time usage: 0.141 seconds Real time usage: 0.163 seconds Preprocessor visited node count: 1168/1000000 Post‐expand include size: 43955/2097152 bytes Template argument size: 29524/2097152 bytes Highest expansion depth: 9/100 Expensive parser function count: 0/500 Unstrip recursion depth: 0/20 Unstrip post‐expand size: 27555/5000000 bytes Lua time usage: 0.023/10.000 seconds Lua memory usage: 635939/52428800 bytes --> <!-- Transclusion expansion time report (%,ms,calls,template) 100.00% 102.805 1 -total 39.94% 41.064 1 Template:Notice 38.06% 39.131 1 Template:Mbox 21.35% 21.954 7 Template:ContentGrid 18.94% 19.469 1 Template:Navigation_Data_Platform 17.02% 17.497 1 Template:Navigation_sidebar 16.00% 16.446 17 Template:Colored_box 12.01% 12.349 1 Template:Todo 9.74% 10.011 1 Template:Note 3.42% 3.521 2 Template:Hidden --> <!-- Saved in parser cache with key labswiki:pcache:idhash:454548-0!canonical and timestamp 20241029190912 and revision id 2239812. 
Rendering was triggered because: edit-page --> </div><!--esi <esi:include src="/esitest-fa8a495983347898/content" /> --><noscript><img src="https://login.wikimedia.org/wiki/Special:CentralAutoLogin/start?type=1x1" alt="" width="1" height="1" style="border: none; position: absolute;"></noscript> <div class="printfooter" data-nosnippet="">Retrieved from "<a dir="ltr" href="https://wikitech.wikimedia.org/w/index.php?title=Data_Platform/Transform_data&amp;oldid=2239812">https://wikitech.wikimedia.org/w/index.php?title=Data_Platform/Transform_data&amp;oldid=2239812</a>"</div></div> <div id="catlinks" class="catlinks" data-mw="interface"><div id="mw-normal-catlinks" class="mw-normal-catlinks"><a href="/wiki/Special:Categories" title="Special:Categories">Categories</a>: <ul><li><a href="/wiki/Category:Pages_with_FIXME_on_them" title="Category:Pages with FIXME on them">Pages with FIXME on them</a></li><li><a href="/wiki/Category:Landing_page" title="Category:Landing page">Landing page</a></li><li><a href="/wiki/Category:Data_platform" title="Category:Data platform">Data platform</a></li></ul></div></div> </div> </main> </div> <div class="mw-footer-container"> <footer id="footer" class="mw-footer" > <ul id="footer-info"> <li id="footer-info-lastmod"> This page was last edited on 29 October 2024, at 19:09.</li> <li id="footer-info-copyright">Text is available under the <a rel="nofollow" class="external text" href="https://creativecommons.org/licenses/by-sa/4.0/deed.en">Creative Commons Attribution-ShareAlike License</a>; additional terms may apply. 
See <a class="external text" href="https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Terms_of_Use">Terms of Use</a> for details.</li> </ul> <ul id="footer-places"> <li id="footer-places-privacy"><a href="https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Privacy_policy">Privacy policy</a></li> <li id="footer-places-about"><a href="/wiki/Main_Page">About Wikitech</a></li> <li id="footer-places-disclaimers"><a href="https://foundation.wikimedia.org/wiki/General_disclaimer">Disclaimers</a></li> <li id="footer-places-wm-codeofconduct"><a href="https://www.mediawiki.org/wiki/Special:MyLanguage/Code_of_Conduct">Code of Conduct</a></li> <li id="footer-places-developers"><a href="https://developer.wikimedia.org">Developers</a></li> <li id="footer-places-statslink"><a href="https://stats.wikimedia.org/#/wikitech.wikimedia.org">Statistics</a></li> <li id="footer-places-cookiestatement"><a href="https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Cookie_statement">Cookie statement</a></li> <li id="footer-places-mobileview"><a href="//wikitech.wikimedia.org/w/index.php?title=Data_Platform/Transform_data&amp;mobileaction=toggle_view_mobile" class="noprint stopMobileRedirectToggle">Mobile view</a></li> </ul> <ul id="footer-icons" class="noprint"> <li id="footer-copyrightico"><a href="https://wikimediafoundation.org/" class="cdx-button cdx-button--fake-button cdx-button--size-large cdx-button--fake-button--enabled"><img src="/static/images/footer/wikimedia-button.svg" width="84" height="29" alt="Wikimedia Foundation" loading="lazy"></a></li> <li id="footer-poweredbyico"><a href="https://www.mediawiki.org/" class="cdx-button cdx-button--fake-button cdx-button--size-large cdx-button--fake-button--enabled"><img src="/w/resources/assets/poweredby_mediawiki.svg" alt="Powered by MediaWiki" width="88" height="31" loading="lazy"></a></li> </ul> </footer> </div> </div> </div> <div class="vector-settings" id="p-dock-bottom"> <ul></ul> 
</div><script>(RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgHostname":"mw-web.codfw.main-74cc59cb9d-l5qsq","wgBackendResponseTime":105,"wgDiscussionToolsPageThreads":[{"headingLevel":2,"name":"h-","type":"heading","level":0,"id":"h-Before_you_start","replies":[]},{"headingLevel":2,"name":"h-","type":"heading","level":0,"id":"h-Plan_data_lifecyle","replies":[{"headingLevel":3,"name":"h-","type":"heading","level":0,"id":"h-Get_approval_for_new_data_collection-Plan_data_lifecyle","replies":[]}]},{"headingLevel":2,"name":"h-","type":"heading","level":0,"id":"h-Model_and_document_your_data","replies":[]},{"headingLevel":2,"name":"h-","type":"heading","level":0,"id":"h-Build_your_table_or_dataset","replies":[{"headingLevel":3,"name":"h-","type":"heading","level":0,"id":"h-Table_formats_and_storage-Build_your_table_or_dataset","replies":[]}]},{"headingLevel":2,"name":"h-","type":"heading","level":0,"id":"h-Share_data_and_dashboards","replies":[{"headingLevel":3,"name":"h-","type":"heading","level":0,"id":"h-Before_you_publish_any_data-Share_data_and_dashboards","replies":[]},{"headingLevel":3,"name":"h-","type":"heading","level":0,"id":"h-Share_queries_and_visualizations-Share_data_and_dashboards","replies":[]},{"headingLevel":3,"name":"h-","type":"heading","level":0,"id":"h-Tools_and_platforms_for_publishing_data_externally-Share_data_and_dashboards","replies":[]}]},{"headingLevel":2,"name":"h-","type":"heading","level":0,"id":"h-Manage_published_data","replies":[{"headingLevel":3,"name":"h-","type":"heading","level":0,"id":"h-Maintainance_and_monitoring-Manage_published_data","replies":[]},{"headingLevel":3,"name":"h-","type":"heading","level":0,"id":"h-Retention_and_deletion-Manage_published_data","replies":[]}]}],"wgPageParseReport":{"discussiontools":{"limitreport-timeusage":"0.018"},"limitreport":{"cputime":"0.141","walltime":"0.163","ppvisitednodes":{"value":1168,"limit":1000000},"postexpandincludesize":{"value":43955,"limit":2097152},"templateargumentsize"
:{"value":29524,"limit":2097152},"expansiondepth":{"value":9,"limit":100},"expensivefunctioncount":{"value":0,"limit":500},"unstrip-depth":{"value":0,"limit":20},"unstrip-size":{"value":27555,"limit":5000000},"timingprofile":["100.00% 102.805 1 -total"," 39.94% 41.064 1 Template:Notice"," 38.06% 39.131 1 Template:Mbox"," 21.35% 21.954 7 Template:ContentGrid"," 18.94% 19.469 1 Template:Navigation_Data_Platform"," 17.02% 17.497 1 Template:Navigation_sidebar"," 16.00% 16.446 17 Template:Colored_box"," 12.01% 12.349 1 Template:Todo"," 9.74% 10.011 1 Template:Note"," 3.42% 3.521 2 Template:Hidden"]},"scribunto":{"limitreport-timeusage":{"value":"0.023","limit":"10.000"},"limitreport-memusage":{"value":635939,"limit":52428800}},"cachereport":{"origin":"mw-api-ext.codfw.main-85c544bfcc-62wnl","timestamp":"20241029190902","ttl":2592000,"transientcontent":false}}});});</script> </body> </html>

Pages: 1 2 3 4 5 6 7 8 9 10