CINXE.COM

Data Platform/Data Lake - Wikitech

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-disabled skin-theme-clientpref-day vector-toc-available" lang="en" dir="ltr">
<head>
<meta charset="UTF-8">
<title>Data Platform/Data Lake - Wikitech</title>
<script>
/* Inline bootstrap.
   1. The IIFE starts from the "client-js" class list, swaps in every
      *-clientpref-* value found in the labswikimwclientpreferences cookie
      (when that cookie exists), and assigns the result to the root element.
   2. RLCONF, RLSTATE and RLPAGEMODULES are then assigned.
   Line breaks below sit only between statements. Every string literal must
   stay on a single line: a raw line break inside a quoted JavaScript string
   is a syntax error and would stop this whole script from running. */
(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-disabled skin-theme-clientpref-day vector-toc-available";var cookie=document.cookie.match(/(?:^|; )labswikimwclientpreferences=([^;]+)/);if(cookie){cookie[1].split('%2C').forEach(function(pref){className=className.replace(new RegExp('(^| )'+pref.replace(/-clientpref-\w+$|[^\w-]+/g,'')+'-clientpref-\\w+( |$)'),'$1'+pref+'$2');});}document.documentElement.className=className;}());
RLCONF={"wgBreakFrames":true,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat": "dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"27fbc258-5631-47cb-b5ce-75d01cfc12c3","wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Data_Platform/Data_Lake","wgTitle":"Data Platform/Data Lake","wgCurRevisionId":2241501,"wgRevisionId":2241501,"wgArticleId":440607,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Query engines","Analytics cluster","Data platform"],"wgPageViewLanguage":"en","wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgRelevantPageName":"Data_Platform/Data_Lake","wgRelevantArticleId":440607,"wgIsProbablyEditable":false,"wgRelevantPageIsProbablyEditable":false,"wgRestrictionEdit":[],"wgRestrictionMove":[],"wgNoticeProject":"wikitech","wgCiteReferencePreviewsActive":true,"wgMediaViewerOnClick":true,"wgMediaViewerEnabledByDefault":true, "wgVisualEditor":{"pageLanguageCode":"en","pageLanguageDir":"ltr","pageVariantFallbacks":"en"},"wgMFDisplayWikibaseDescriptions":{"search":true,"watchlist":true,"tagline":false,"nearby":true},"wgWMESchemaEditAttemptStepOversample":false,"wgWMEPageLength":9000,"wgCentralAuthMobileDomain":false,"wgEditSubmitButtonLabelPublish":true,"wgDiscussionToolsFeaturesEnabled":{"replytool":true,"newtopictool":true,"sourcemodetoolbar":true,"topicsubscription":false,"autotopicsub":false,"visualenhancements":false,"visualenhancements_reply":false,"visualenhancements_pageframe":false},"wgDiscussionToolsFallbackEditMode":"visual","wgULSPosition":"personal","wgULSisCompactLinksEnabled":false,"wgVector2022LanguageInHeader":true,"wgULSisLanguageSelectorEmpty":false,"wgCheckUserClientHintsHeadersJsApi":["brands","architecture","bitness","fullVersionList","mobile","model","platform","platformVersion"],"wgSiteNoticeId":"2.0"};
RLSTATE={"ext.globalCssJs.user.styles":"ready","site.styles":"ready","user.styles": "ready","ext.globalCssJs.user":"ready","user":"ready","user.options":"loading","ext.inputBox.styles":"ready","ext.pygments":"ready","mediawiki.special":"ready","oojs-ui-core.styles":"ready","oojs-ui.styles.indicators":"ready","mediawiki.widgets.styles":"ready","oojs-ui-core.icons":"ready","mediawiki.htmlform.ooui.styles":"ready","ext.discussionTools.init.styles":"ready","skins.vector.search.codex.styles":"ready","skins.vector.styles":"ready","skins.vector.icons":"ready","ext.wikimediamessages.styles":"ready","ext.visualEditor.desktopArticleTarget.noscript":"ready","ext.uls.pt":"ready","ext.dismissableSiteNotice.styles":"ready"};
RLPAGEMODULES=["ext.pygments.view","site","mediawiki.page.ready","mediawiki.toc","skins.vector.js","ext.centralNotice.geoIP","ext.centralNotice.startUp","ext.gadget.site","ext.urlShortener.toolbar","ext.centralauth.centralautologin","ext.visualEditor.desktopArticleTarget.init","ext.visualEditor.targetLoader","ext.echo.centralauth","ext.discussionTools.init", "ext.eventLogging","ext.wikimediaEvents","ext.uls.interface","ext.checkUser.clientHints","ext.dismissableSiteNotice"];
</script>
<script>(RLQ=window.RLQ||[]).push(function(){mw.loader.impl(function(){return["user.options@12s5i",function($,jQuery,require,module){mw.user.tokens.set({"patrolToken":"+\\","watchToken":"+\\","csrfToken":"+\\"}); }];});});</script>
<link rel="stylesheet" href="/w/load.php?lang=en&amp;modules=ext.discussionTools.init.styles%7Cext.dismissableSiteNotice.styles%7Cext.inputBox.styles%7Cext.pygments%7Cext.uls.pt%7Cext.visualEditor.desktopArticleTarget.noscript%7Cext.wikimediamessages.styles%7Cmediawiki.htmlform.ooui.styles%7Cmediawiki.special%7Cmediawiki.widgets.styles%7Coojs-ui-core.icons%2Cstyles%7Coojs-ui.styles.indicators%7Cskins.vector.icons%2Cstyles%7Cskins.vector.search.codex.styles&amp;only=styles&amp;skin=vector-2022">
<script async="" src="/w/load.php?lang=en&amp;modules=startup&amp;only=scripts&amp;raw=1&amp;skin=vector-2022"></script>
<meta
name="ResourceLoaderDynamicStyles" content=""> <link rel="stylesheet" href="/w/load.php?lang=en&amp;modules=site.styles&amp;only=styles&amp;skin=vector-2022"> <meta name="generator" content="MediaWiki 1.44.0-wmf.4"> <meta name="referrer" content="origin"> <meta name="referrer" content="origin-when-cross-origin"> <meta name="robots" content="max-image-preview:standard"> <meta name="format-detection" content="telephone=no"> <meta name="viewport" content="width=1120"> <meta property="og:title" content="Data Platform/Data Lake - Wikitech"> <meta property="og:type" content="website"> <link rel="icon" href="/static/favicon/wikitech.ico"> <link rel="search" type="application/opensearchdescription+xml" href="/w/rest.php/v1/search" title="Wikitech (en)"> <link rel="EditURI" type="application/rsd+xml" href="//wikitech.wikimedia.org/w/api.php?action=rsd"> <link rel="canonical" href="https://wikitech.wikimedia.org/wiki/Data_Platform/Data_Lake"> <link rel="license" href="https://creativecommons.org/licenses/by-sa/4.0/"> <link rel="alternate" type="application/atom+xml" title="Wikitech Atom feed" href="/w/index.php?title=Special:RecentChanges&amp;feed=atom"> <link rel="dns-prefetch" href="//meta.wikimedia.org" /> <link rel="dns-prefetch" href="//login.wikimedia.org"> </head> <body class="ext-discussiontools-replytool-enabled ext-discussiontools-newtopictool-enabled ext-discussiontools-sourcemodetoolbar-enabled skin--responsive skin-vector skin-vector-search-vue mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-subject page-Data_Platform_Data_Lake rootpage-Data_Platform skin-vector-2022 action-view"><a class="mw-jump-link" href="#bodyContent">Jump to content</a> <div class="vector-header-container"> <header class="vector-header mw-header"> <div class="vector-header-start"> <nav class="vector-main-menu-landmark" aria-label="Site"> <div id="vector-main-menu-dropdown" class="vector-dropdown vector-main-menu-dropdown vector-button-flush-left vector-button-flush-right" > <input 
type="checkbox" id="vector-main-menu-dropdown-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-main-menu-dropdown" class="vector-dropdown-checkbox " aria-label="Main menu" > <label id="vector-main-menu-dropdown-label" for="vector-main-menu-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only " aria-hidden="true" ><span class="vector-icon mw-ui-icon-menu mw-ui-icon-wikimedia-menu"></span> <span class="vector-dropdown-label-text">Main menu</span> </label> <div class="vector-dropdown-content"> <div id="vector-main-menu-unpinned-container" class="vector-unpinned-container"> <div id="vector-main-menu" class="vector-main-menu vector-pinnable-element"> <div class="vector-pinnable-header vector-main-menu-pinnable-header vector-pinnable-header-unpinned" data-feature-name="main-menu-pinned" data-pinnable-element-id="vector-main-menu" data-pinned-container-id="vector-main-menu-pinned-container" data-unpinned-container-id="vector-main-menu-unpinned-container" > <div class="vector-pinnable-header-label">Main menu</div> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-pin-button" data-event-name="pinnable-header.vector-main-menu.pin">move to sidebar</button> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" data-event-name="pinnable-header.vector-main-menu.unpin">hide</button> </div> <div id="p-navigation" class="vector-menu mw-portlet mw-portlet-navigation" > <div class="vector-menu-heading"> Navigation </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="n-mainpage-description" class="mw-list-item"><a href="/wiki/Main_Page" title="Visit the main page [z]" accesskey="z"><span>Main page</span></a></li><li id="n-recentchanges" class="mw-list-item"><a href="/wiki/Special:RecentChanges" title="A list of recent changes in the wiki [r]" 
accesskey="r"><span>Recent changes</span></a></li><li id="n-Server-admin-log:-Prod" class="mw-list-item"><a href="/wiki/Server_Admin_Log"><span>Server admin log: Prod</span></a></li><li id="n-Admin-log:-RelEng" class="mw-list-item"><a href="/wiki/Release_Engineering/SAL"><span>Admin log: RelEng</span></a></li><li id="n-Incident-status" class="mw-list-item"><a href="/wiki/Incident_status"><span>Incident status</span></a></li><li id="n-Deployments" class="mw-list-item"><a href="/wiki/Deployments"><span>Deployments</span></a></li><li id="n-SRE-Team-Help" class="mw-list-item"><a href="/wiki/SRE/SRE_Team_requests"><span>SRE Team Help</span></a></li> </ul> </div> </div> <div id="p-Cloud_VPS_&amp;_Toolforge" class="vector-menu mw-portlet mw-portlet-Cloud_VPS_Toolforge" > <div class="vector-menu-heading"> Cloud VPS &amp; Toolforge </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="n-Cloud-VPS-portal" class="mw-list-item"><a href="/wiki/Portal:Cloud_VPS"><span>Cloud VPS portal</span></a></li><li id="n-Toolforge-portal" class="mw-list-item"><a href="/wiki/Portal:Toolforge"><span>Toolforge portal</span></a></li><li id="n-Request-VPS-project" class="mw-list-item"><a href="https://phabricator.wikimedia.org/project/view/2875/"><span>Request VPS project</span></a></li><li id="n-Admin-log:-Cloud-VPS" class="mw-list-item"><a href="/wiki/Cloud_VPS_Server_Admin_Log"><span>Admin log: Cloud VPS</span></a></li> </ul> </div> </div> </div> </div> </div> </div> </nav> <a href="/wiki/Main_Page" class="mw-logo"> <img class="mw-logo-icon" src="/static/images/icons/wikitech.svg" alt="" aria-hidden="true" height="50" width="50"> <span class="mw-logo-container skin-invert"> <img class="mw-logo-wordmark" alt="Wikitech" src="/static/images/mobile/copyright/wikitech-wordmark.svg" style="width: 8.75em; height: 1.6875em;"> </span> </a> </div> <div class="vector-header-end"> <div id="p-search" role="search" class="vector-search-box-vue vector-search-box-collapses 
vector-search-box-show-thumbnail vector-search-box-auto-expand-width vector-search-box"> <a href="/wiki/Special:Search" class="cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only search-toggle" title="Search Wikitech [f]" accesskey="f"><span class="vector-icon mw-ui-icon-search mw-ui-icon-wikimedia-search"></span> <span>Search</span> </a> <div class="vector-typeahead-search-container"> <div class="cdx-typeahead-search cdx-typeahead-search--show-thumbnail cdx-typeahead-search--auto-expand-width"> <form action="/w/index.php" id="searchform" class="cdx-search-input cdx-search-input--has-end-button"> <div id="simpleSearch" class="cdx-search-input__input-wrapper" data-search-loc="header-moved"> <div class="cdx-text-input cdx-text-input--has-start-icon"> <input class="cdx-text-input__input" type="search" name="search" placeholder="Search Wikitech" aria-label="Search Wikitech" autocapitalize="sentences" title="Search Wikitech [f]" accesskey="f" id="searchInput" > <span class="cdx-text-input__icon cdx-text-input__start-icon"></span> </div> <input type="hidden" name="title" value="Special:Search"> </div> <button class="cdx-button cdx-search-input__end-button">Search</button> </form> </div> </div> </div> <nav class="vector-user-links vector-user-links-wide" aria-label="Personal tools"> <div class="vector-user-links-main"> <div id="p-vector-user-menu-preferences" class="vector-menu mw-portlet" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="ca-uls" class="mw-list-item active user-links-collapsible-item"><a data-mw="interface" href="#" class="uls-trigger cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet"><span class="vector-icon mw-ui-icon-wikimedia-language mw-ui-icon-wikimedia-wikimedia-language"></span> <span>English</span></a> </li> </ul> </div> </div> <div id="p-vector-user-menu-userpage" class="vector-menu mw-portlet emptyPortlet" > 
<div class="vector-menu-content"> <ul class="vector-menu-content-list"> </ul> </div> </div> <nav class="vector-appearance-landmark" aria-label="Appearance"> <div id="vector-appearance-dropdown" class="vector-dropdown " title="Change the appearance of the page&#039;s font size, width, and color" > <input type="checkbox" id="vector-appearance-dropdown-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-appearance-dropdown" class="vector-dropdown-checkbox " aria-label="Appearance" > <label id="vector-appearance-dropdown-label" for="vector-appearance-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only " aria-hidden="true" ><span class="vector-icon mw-ui-icon-appearance mw-ui-icon-wikimedia-appearance"></span> <span class="vector-dropdown-label-text">Appearance</span> </label> <div class="vector-dropdown-content"> <div id="vector-appearance-unpinned-container" class="vector-unpinned-container"> </div> </div> </div> </nav> <div id="p-vector-user-menu-notifications" class="vector-menu mw-portlet emptyPortlet" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> </ul> </div> </div> <div id="p-vector-user-menu-overflow" class="vector-menu mw-portlet" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="pt-sitesupport-2" class="user-links-collapsible-item mw-list-item user-links-collapsible-item"><a data-mw="interface" href="https://donate.wikimedia.org/?utm_source=donate&amp;utm_medium=sidebar&amp;utm_campaign=spontaneous&amp;uselang=en" class=""><span>Donate</span></a> </li> <li id="pt-login-2" class="user-links-collapsible-item mw-list-item user-links-collapsible-item"><a data-mw="interface" href="/w/index.php?title=Special:UserLogin&amp;returnto=Data+Platform%2FData+Lake" title="You are encouraged to log in; however, it is not mandatory [o]" accesskey="o" class=""><span>Log 
in</span></a> </li> </ul> </div> </div> </div> <div id="vector-user-links-dropdown" class="vector-dropdown vector-user-menu vector-button-flush-right vector-user-menu-logged-out user-links-collapsible-item" title="More options" > <input type="checkbox" id="vector-user-links-dropdown-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-user-links-dropdown" class="vector-dropdown-checkbox " aria-label="Personal tools" > <label id="vector-user-links-dropdown-label" for="vector-user-links-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only " aria-hidden="true" ><span class="vector-icon mw-ui-icon-ellipsis mw-ui-icon-wikimedia-ellipsis"></span> <span class="vector-dropdown-label-text">Personal tools</span> </label> <div class="vector-dropdown-content"> <div id="p-personal" class="vector-menu mw-portlet mw-portlet-personal user-links-collapsible-item" title="User menu" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="pt-sitesupport" class="user-links-collapsible-item mw-list-item"><a href="https://donate.wikimedia.org/?utm_source=donate&amp;utm_medium=sidebar&amp;utm_campaign=spontaneous&amp;uselang=en"><span>Donate</span></a></li><li id="pt-login" class="user-links-collapsible-item mw-list-item"><a href="/w/index.php?title=Special:UserLogin&amp;returnto=Data+Platform%2FData+Lake" title="You are encouraged to log in; however, it is not mandatory [o]" accesskey="o"><span class="vector-icon mw-ui-icon-logIn mw-ui-icon-wikimedia-logIn"></span> <span>Log in</span></a></li> </ul> </div> </div> </div> </div> </nav> </div> </header> </div> <div class="mw-page-container"> <div class="mw-page-container-inner"> <div class="vector-sitenotice-container"> <div id="siteNotice"><div id="mw-dismissablenotice-anonplace"></div><script>(function(){var 
node=document.getElementById("mw-dismissablenotice-anonplace");
/* If the placeholder element exists, replace it (via outerHTML) with the
   dismissable site-notice markup. The markup string is deliberately kept on
   ONE line: a raw line break inside a double-quoted JavaScript string
   literal is a syntax error and would keep this script from running. */
if(node){node.outerHTML="\u003Cdiv class=\"mw-dismissable-notice\"\u003E\u003Cdiv class=\"mw-dismissable-notice-close\"\u003E[\u003Ca tabindex=\"0\" role=\"button\"\u003Edismiss\u003C/a\u003E]\u003C/div\u003E\u003Cdiv class=\"mw-dismissable-notice-body\"\u003E\u003C!-- CentralNotice --\u003E\u003Cdiv id=\"localNotice\" data-nosnippet=\"\"\u003E\u003Cdiv class=\"sitenotice\" lang=\"en\" dir=\"ltr\"\u003E\u003Ctable style=\"width: 75%; background-color: var(--background-color-warning-subtle, #fdf2d5); border: var(--border-subtle, 1px solid #987027); color: var(--color-base, #202122); border-radius: 10px; padding: 5px; margin: 0 auto;\"\u003E\n\u003Ctbody\u003E\u003Ctr\u003E\n\u003Ctd style=\"width:40px; height:40px; text-align:center; vertical-align:middle; padding: 2px;\"\u003E\u003Cspan typeof=\"mw:File\"\u003E\u003Ca href=\"/wiki/File:OOjs_UI_icon_alert-warning.svg\" class=\"mw-file-description\"\u003E\u003Cimg src=\"//upload.wikimedia.org/wikipedia/commons/thumb/3/3b/OOjs_UI_icon_alert-warning.svg/30px-OOjs_UI_icon_alert-warning.svg.png\" decoding=\"async\" width=\"30\" height=\"30\" class=\"mw-file-element\" srcset=\"//upload.wikimedia.org/wikipedia/commons/thumb/3/3b/OOjs_UI_icon_alert-warning.svg/45px-OOjs_UI_icon_alert-warning.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/3/3b/OOjs_UI_icon_alert-warning.svg/60px-OOjs_UI_icon_alert-warning.svg.png 2x\" data-file-width=\"20\" data-file-height=\"20\" /\u003E\u003C/a\u003E\u003C/span\u003E\n\u003C/td\u003E\n\u003Ctd style=\"text-align:center; vertical-align: middle; padding: 4px; max-height: 60px;\"\u003E\u003Cb\u003EWe are migrating Wikitech to \u003Ca href=\"/wiki/Wikitech/SUL-migration\" title=\"Wikitech/SUL-migration\"\u003ESUL\u003C/a\u003E!\u003C/b\u003E\n\u003Cp\u003E\u003Cb\u003EAction may be required for your \u003Ca href=\"/wiki/Wikitech/SUL-migration#What_You_Should_Do\" title=\"Wikitech/SUL-migration\"\u003E account\u003C/a\u003E!\u003C/b\u003E\n\u003C/p\u003E\u003Cp\u003E\u003Cb\u003ETrouble logging in? Please visit \u003Ca href=\"https://phabricator.wikimedia.org/T376267\" class=\"extiw\" title=\"phab:T376267\"\u003ET376267\u003C/a\u003E\u003C/b\u003E\n\u003C/p\u003E\n\u003C/td\u003E\u003C/tr\u003E\u003C/tbody\u003E\u003C/table\u003E\u003C/div\u003E\u003C/div\u003E\u003C/div\u003E\u003C/div\u003E";}}());</script></div> </div> <div class="vector-column-start"> <div class="vector-main-menu-container"> <div id="mw-navigation"> <nav id="mw-panel" class="vector-main-menu-landmark" aria-label="Site"> <div id="vector-main-menu-pinned-container" class="vector-pinned-container"> </div> </nav> </div> </div> <div class="vector-sticky-pinned-container"> <nav id="mw-panel-toc" aria-label="Contents" data-event-name="ui.sidebar-toc" class="mw-table-of-contents-container vector-toc-landmark"> <div id="vector-toc-pinned-container" class="vector-pinned-container"> <div id="vector-toc" class="vector-toc vector-pinnable-element"> <div class="vector-pinnable-header vector-toc-pinnable-header vector-pinnable-header-pinned" data-feature-name="toc-pinned" data-pinnable-element-id="vector-toc" > <h2 class="vector-pinnable-header-label">Contents</h2> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-pin-button" data-event-name="pinnable-header.vector-toc.pin">move to sidebar</button> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" data-event-name="pinnable-header.vector-toc.unpin">hide</button> </div> <ul class="vector-toc-contents" id="mw-panel-toc-list"> <li id="toc-mw-content-text" class="vector-toc-list-item vector-toc-level-1"> <a href="#" class="vector-toc-link"> <div class="vector-toc-text">Beginning</div> </a> </li> <li id="toc-Data_available" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Data_available"> <div class="vector-toc-text"> <span
class="vector-toc-numb">1</span> <span>Data available</span> </div> </a> <ul id="toc-Data_available-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Access" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Access"> <div class="vector-toc-text"> <span class="vector-toc-numb">2</span> <span>Access</span> </div> </a> <button aria-controls="toc-Access-sublist" class="cdx-button cdx-button--weight-quiet cdx-button--icon-only vector-toc-toggle"> <span class="vector-icon mw-ui-icon-wikimedia-expand"></span> <span>Toggle Access subsection</span> </button> <ul id="toc-Access-sublist" class="vector-toc-list"> <li id="toc-Syntax_differences_between_the_SQL_engines" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Syntax_differences_between_the_SQL_engines"> <div class="vector-toc-text"> <span class="vector-toc-numb">2.1</span> <span>Syntax differences between the SQL engines</span> </div> </a> <ul id="toc-Syntax_differences_between_the_SQL_engines-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Integer_division_in_Presto" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Integer_division_in_Presto"> <div class="vector-toc-text"> <span class="vector-toc-numb">2.2</span> <span>Integer division in Presto</span> </div> </a> <ul id="toc-Integer_division_in_Presto-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> <li id="toc-Table_and_file_formats" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Table_and_file_formats"> <div class="vector-toc-text"> <span class="vector-toc-numb">3</span> <span>Table and file formats</span> </div> </a> <ul id="toc-Table_and_file_formats-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Technical_architecture" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" 
href="#Technical_architecture"> <div class="vector-toc-text"> <span class="vector-toc-numb">4</span> <span>Technical architecture</span> </div> </a> <ul id="toc-Technical_architecture-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-All_Subpages_of_Data_Platform/Data_Lake" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#All_Subpages_of_Data_Platform/Data_Lake"> <div class="vector-toc-text"> <span class="vector-toc-numb">5</span> <span>All Subpages of Data Platform/Data Lake</span> </div> </a> <ul id="toc-All_Subpages_of_Data_Platform/Data_Lake-sublist" class="vector-toc-list"> </ul> </li> </ul> </div> </div> </nav> </div> </div> <div class="mw-content-container"> <main id="content" class="mw-body"> <header class="mw-body-header vector-page-titlebar"> <nav aria-label="Contents" class="vector-toc-landmark"> <div id="vector-page-titlebar-toc" class="vector-dropdown vector-page-titlebar-toc vector-button-flush-left" > <input type="checkbox" id="vector-page-titlebar-toc-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-page-titlebar-toc" class="vector-dropdown-checkbox " aria-label="Toggle the table of contents" > <label id="vector-page-titlebar-toc-label" for="vector-page-titlebar-toc-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only " aria-hidden="true" ><span class="vector-icon mw-ui-icon-listBullet mw-ui-icon-wikimedia-listBullet"></span> <span class="vector-dropdown-label-text">Toggle the table of contents</span> </label> <div class="vector-dropdown-content"> <div id="vector-page-titlebar-toc-unpinned-container" class="vector-unpinned-container"> </div> </div> </div> </nav> <h1 id="firstHeading" class="firstHeading mw-first-heading"><span class="mw-page-title-main">Data Platform/Data Lake</span></h1> </header> <div class="vector-page-toolbar"> <div 
class="vector-page-toolbar-container"> <div id="left-navigation"> <nav aria-label="Namespaces"> <div id="p-associated-pages" class="vector-menu vector-menu-tabs mw-portlet mw-portlet-associated-pages" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="ca-nstab-main" class="selected vector-tab-noicon mw-list-item"><a href="/wiki/Data_Platform/Data_Lake" title="View the content page [c]" accesskey="c"><span>Page</span></a></li><li id="ca-talk" class="new vector-tab-noicon mw-list-item"><a href="/w/index.php?title=Talk:Data_Platform/Data_Lake&amp;action=edit&amp;redlink=1" rel="discussion" class="new" title="Discussion about the content page (page does not exist) [t]" accesskey="t"><span>Discussion</span></a></li> </ul> </div> </div> <div id="vector-variants-dropdown" class="vector-dropdown emptyPortlet" > <input type="checkbox" id="vector-variants-dropdown-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-variants-dropdown" class="vector-dropdown-checkbox " aria-label="Change language variant" > <label id="vector-variants-dropdown-label" for="vector-variants-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet" aria-hidden="true" ><span class="vector-dropdown-label-text">English</span> </label> <div class="vector-dropdown-content"> <div id="p-variants" class="vector-menu mw-portlet mw-portlet-variants emptyPortlet" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> </ul> </div> </div> </div> </div> </nav> </div> <div id="right-navigation" class="vector-collapsible"> <nav aria-label="Views"> <div id="p-views" class="vector-menu vector-menu-tabs mw-portlet mw-portlet-views" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="ca-view" class="selected vector-tab-noicon mw-list-item"><a href="/wiki/Data_Platform/Data_Lake"><span>Read</span></a></li><li id="ca-viewsource" 
class="vector-tab-noicon mw-list-item"><a href="/w/index.php?title=Data_Platform/Data_Lake&amp;action=edit" title="This page is protected.&#10;You can view its source [e]" accesskey="e"><span>View source</span></a></li><li id="ca-history" class="vector-tab-noicon mw-list-item"><a href="/w/index.php?title=Data_Platform/Data_Lake&amp;action=history" title="Past revisions of this page [h]" accesskey="h"><span>View history</span></a></li> </ul> </div> </div> </nav> <nav class="vector-page-tools-landmark" aria-label="Page tools"> <div id="vector-page-tools-dropdown" class="vector-dropdown vector-page-tools-dropdown" > <input type="checkbox" id="vector-page-tools-dropdown-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-page-tools-dropdown" class="vector-dropdown-checkbox " aria-label="Tools" > <label id="vector-page-tools-dropdown-label" for="vector-page-tools-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet" aria-hidden="true" ><span class="vector-dropdown-label-text">Tools</span> </label> <div class="vector-dropdown-content"> <div id="vector-page-tools-unpinned-container" class="vector-unpinned-container"> <div id="vector-page-tools" class="vector-page-tools vector-pinnable-element"> <div class="vector-pinnable-header vector-page-tools-pinnable-header vector-pinnable-header-unpinned" data-feature-name="page-tools-pinned" data-pinnable-element-id="vector-page-tools" data-pinned-container-id="vector-page-tools-pinned-container" data-unpinned-container-id="vector-page-tools-unpinned-container" > <div class="vector-pinnable-header-label">Tools</div> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-pin-button" data-event-name="pinnable-header.vector-page-tools.pin">move to sidebar</button> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" 
data-event-name="pinnable-header.vector-page-tools.unpin">hide</button> </div> <div id="p-cactions" class="vector-menu mw-portlet mw-portlet-cactions emptyPortlet vector-has-collapsible-items" title="More options" > <div class="vector-menu-heading"> Actions </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="ca-more-view" class="selected vector-more-collapsible-item mw-list-item"><a href="/wiki/Data_Platform/Data_Lake"><span>Read</span></a></li><li id="ca-more-viewsource" class="vector-more-collapsible-item mw-list-item"><a href="/w/index.php?title=Data_Platform/Data_Lake&amp;action=edit"><span>View source</span></a></li><li id="ca-more-history" class="vector-more-collapsible-item mw-list-item"><a href="/w/index.php?title=Data_Platform/Data_Lake&amp;action=history"><span>View history</span></a></li> </ul> </div> </div> <div id="p-tb" class="vector-menu mw-portlet mw-portlet-tb" > <div class="vector-menu-heading"> General </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="t-whatlinkshere" class="mw-list-item"><a href="/wiki/Special:WhatLinksHere/Data_Platform/Data_Lake" title="A list of all wiki pages that link here [j]" accesskey="j"><span>What links here</span></a></li><li id="t-recentchangeslinked" class="mw-list-item"><a href="/wiki/Special:RecentChangesLinked/Data_Platform/Data_Lake" rel="nofollow" title="Recent changes in pages linked from this page [k]" accesskey="k"><span>Related changes</span></a></li><li id="t-specialpages" class="mw-list-item"><a href="/wiki/Special:SpecialPages" title="A list of all special pages [q]" accesskey="q"><span>Special pages</span></a></li><li id="t-permalink" class="mw-list-item"><a href="/w/index.php?title=Data_Platform/Data_Lake&amp;oldid=2241501" title="Permanent link to this revision of this page"><span>Permanent link</span></a></li><li id="t-info" class="mw-list-item"><a href="/w/index.php?title=Data_Platform/Data_Lake&amp;action=info" title="More 
information about this page"><span>Page information</span></a></li><li id="t-cite" class="mw-list-item"><a href="/w/index.php?title=Special:CiteThisPage&amp;page=Data_Platform%2FData_Lake&amp;id=2241501&amp;wpFormIdentifier=titleform" title="Information on how to cite this page"><span>Cite this page</span></a></li><li id="t-urlshortener" class="mw-list-item"><a href="/w/index.php?title=Special:UrlShortener&amp;url=https%3A%2F%2Fwikitech.wikimedia.org%2Fwiki%2FData_Platform%2FData_Lake"><span>Get shortened URL</span></a></li><li id="t-urlshortener-qrcode" class="mw-list-item"><a href="/w/index.php?title=Special:QrCode&amp;url=https%3A%2F%2Fwikitech.wikimedia.org%2Fwiki%2FData_Platform%2FData_Lake"><span>Download QR code</span></a></li> </ul> </div> </div> <div id="p-coll-print_export" class="vector-menu mw-portlet mw-portlet-coll-print_export" > <div class="vector-menu-heading"> Print/export </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="coll-create_a_book" class="mw-list-item"><a href="/w/index.php?title=Special:Book&amp;bookcmd=book_creator&amp;referer=Data+Platform%2FData+Lake"><span>Create a book</span></a></li><li id="coll-download-as-rl" class="mw-list-item"><a href="/w/index.php?title=Special:DownloadAsPdf&amp;page=Data_Platform%2FData_Lake&amp;action=show-download-screen"><span>Download as PDF</span></a></li><li id="t-print" class="mw-list-item"><a href="/w/index.php?title=Data_Platform/Data_Lake&amp;printable=yes" title="Printable version of this page [p]" accesskey="p"><span>Printable version</span></a></li> </ul> </div> </div> </div> </div> </div> </div> </nav> </div> </div> </div> <div class="vector-column-end"> <div class="vector-sticky-pinned-container"> <nav class="vector-page-tools-landmark" aria-label="Page tools"> <div id="vector-page-tools-pinned-container" class="vector-pinned-container"> </div> </nav> <nav class="vector-appearance-landmark" aria-label="Appearance"> <div 
id="vector-appearance-pinned-container" class="vector-pinned-container"> <div id="vector-appearance" class="vector-appearance vector-pinnable-element"> <div class="vector-pinnable-header vector-appearance-pinnable-header vector-pinnable-header-pinned" data-feature-name="appearance-pinned" data-pinnable-element-id="vector-appearance" data-pinned-container-id="vector-appearance-pinned-container" data-unpinned-container-id="vector-appearance-unpinned-container" > <div class="vector-pinnable-header-label">Appearance</div> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-pin-button" data-event-name="pinnable-header.vector-appearance.pin">move to sidebar</button> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" data-event-name="pinnable-header.vector-appearance.unpin">hide</button> </div> </div> </div> </nav> </div> </div> <div id="bodyContent" class="vector-body" aria-labelledby="firstHeading" data-mw-ve-target-container> <div class="vector-body-before-content"> <div class="mw-indicators"> </div> <div id="siteSub" class="noprint">From Wikitech</div> </div> <div id="contentSub"><div id="mw-content-subtitle"><div class="subpages">&lt; <bdi dir="ltr"><a href="/wiki/Data_Platform" title="Data Platform">Data Platform</a></bdi></div></div></div> <div id="mw-content-text" class="mw-body-content"><div class="mw-content-ltr mw-parser-output" lang="en" dir="ltr"><style data-mw-deduplicate="TemplateStyles:r2241375">.mw-parser-output .tpl-navsidebar{max-width:22em;background:var(--background-color-base,#fff);color:var(--color-base,#202122);border:1px solid var(--border-color-base,#a2a9b1);float:right;clear:right;margin:.5em 0 1em 1em}.mw-parser-output .tpl-navsidebar-floatright{float:right;clear:right;margin:.5em 0 1em 1em}.mw-parser-output .tpl-navsidebar-floatleft{float:left;clear:left;margin:.5em 1em 1em 0}.mw-parser-output .tpl-navsidebar-floatnone{float:none;clear:both;margin:.5em 0}.mw-parser-output 
.tpl-navsidebar-topimage{margin:0 0 16px 0}.mw-parser-output .tpl-navsidebar-title{margin:8px 16px;border-bottom:3px solid var(--border-color-muted,#eaecf0);font-size:20px;text-align:center}.mw-parser-output .tpl-navsidebar-image{margin:0 0 8px}.mw-parser-output .tpl-navsidebar-content{margin:0 0 16px 0;padding:0 8px}.mw-parser-output .tpl-navsidebar-heading{margin:8px 0;font-weight:bold}.mw-parser-output .tpl-navsidebar-foot{padding:0 8px;margin:0;text-align:right;font-size:smaller}@media not (min-width:720px){.mw-parser-output .tpl-navsidebar{float:none;clear:both;margin:.5em 0;max-width:none}}</style><div role="navigation" class="navigation-not-searchable tpl-navsidebar" style=""><p class="tpl-navsidebar-title"><a href="/wiki/Data_Platform" title="Data Platform">Data Platform</a></p><div class="tpl-navsidebar-contents"><div class="tpl-navsidebar-content"> <div class="mw-inputbox-centered" style=""><form name="searchbox" class="searchbox mw-inputbox-form-inline" action="/wiki/Special:Search"><div class="cdx-text-input"><input class="mw-searchInput searchboxInput cdx-text-input__input" name="search" placeholder="Search Data Platform documentation" size="40" dir="ltr"/></div><input type="hidden" value="incategory:Data_platform" name="searchfilter"/> <input type="submit" name="fulltext" value="Search" class="cdx-button"/><input type="hidden" value="Search" name="fulltext"/></form></div> </div><div class="tpl-navsidebar-content"> <p class="tpl-navsidebar-heading"><a href="/wiki/Data_Platform/Discover_data" title="Data Platform/Discover data">Discover data</a></p><p class="mw-empty-elt"> </p><ul><li><a class="external text" href="https://datahub.wikimedia.org/">Explore datasets in DataHub</a></li> <li><a class="mw-selflink selflink">Data Lake</a> <ul><li><a href="/wiki/Data_Platform/Data_Lake/Traffic" title="Data Platform/Data Lake/Traffic">Traffic data</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Edits" title="Data Platform/Data Lake/Edits">Edits data</a></li> 
<li><a href="/wiki/Data_Platform/Data_Lake/Content" title="Data Platform/Data Lake/Content">Content data</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Events" title="Data Platform/Data Lake/Events">Events data</a></li></ul></li> <li><a href="/wiki/Data_Platform/AQS" title="Data Platform/AQS">Analytics Query Service (AQS)</a></li></ul> </div><div class="tpl-navsidebar-content"> <p class="tpl-navsidebar-heading"><a href="/wiki/Data_Platform/Analyze_data" title="Data Platform/Analyze data">Access, query, and analyze data</a></p><p class="mw-empty-elt"> </p><ul><li><a href="/wiki/Data_Platform/Data_access" title="Data Platform/Data access">Get access to internal data</a></li> <li>Analytics tools <ul><li><a href="/wiki/Data_Platform/Systems/Jupyter" title="Data Platform/Systems/Jupyter">Jupyter notebooks</a></li> <li><a href="/wiki/Data_Platform/Systems/Superset" title="Data Platform/Systems/Superset">Superset</a></li> <li><a href="/wiki/Data_Platform/Systems/Spark" title="Data Platform/Systems/Spark">Spark</a></li> <li><a href="/wiki/Data_Platform/Systems/Presto" title="Data Platform/Systems/Presto">Presto</a></li></ul></li> <li><a rel="nofollow" class="external text" href="https://github.com/wikimedia/wmfdata-python/blob/main/docs/quickstart.ipynb">Quickstart notebook</a></li> <li><a href="/wiki/Data_Platform/Internal_API_requests" title="Data Platform/Internal API requests">Internal API requests</a></li></ul> </div><div class="tpl-navsidebar-content"> <p class="tpl-navsidebar-heading"><a href="/wiki/Data_Platform/Transform_data" title="Data Platform/Transform data">Transform and publish data</a></p><p class="mw-empty-elt"> </p><ul><li><a href="https://www.mediawiki.org/wiki/Data_Platform_Engineering/Intake_Process" class="extiw" title="mw:Data Platform Engineering/Intake Process">Get help or file a request</a></li> <li><a href="/wiki/Data_Platform/Transform_data#Plan_data_lifecyle" title="Data Platform/Transform data">Plan data lifecycle</a></li> <li>Build 
tables and datasets <ul><li><a href="/wiki/Data_Platform/Dataset_creation" title="Data Platform/Dataset creation">Dataset creation process</a></li> <li><a href="/wiki/Data_Platform/Data_modeling_guidelines" title="Data Platform/Data modeling guidelines"> Data modeling guidelines</a></li> <li><a href="/wiki/Data_Platform/Systems/Airflow/Developer_guide" title="Data Platform/Systems/Airflow/Developer guide">Airflow developer guide</a></li> <li><a href="/wiki/Data_Platform/Systems/Hive" title="Data Platform/Systems/Hive">Hive</a></li> <li><a href="/wiki/Data_Platform/Systems/Iceberg" title="Data Platform/Systems/Iceberg">Iceberg</a></li> <li><a href="/wiki/Data_Platform/Systems/Druid" title="Data Platform/Systems/Druid">Druid</a></li></ul></li> <li>Share data and dashboards <ul><li><a href="https://foundation.wikimedia.org/wiki/Legal:Data_publication_guidelines" class="extiw" title="foundation:Legal:Data publication guidelines"> Data publication guidelines</a></li> <li><a href="/wiki/Data_Platform/Systems/Turnilo" title="Data Platform/Systems/Turnilo">Turnilo</a></li> <li><a href="/wiki/Data_Platform/Systems/Superset" title="Data Platform/Systems/Superset">Superset</a></li> <li><a href="/wiki/Data_Platform/Systems/analytics.wikimedia.org" title="Data Platform/Systems/analytics.wikimedia.org"> analytics.wikimedia.org</a></li> <li><a href="/wiki/Data_Platform/Web_publication" title="Data Platform/Web publication"> Web publication guide</a></li> <li><a href="/wiki/Data_Platform/Systems/Dashiki" title="Data Platform/Systems/Dashiki"> Dashiki</a></li></ul></li> <li>Manage published data <ul><li><a href="/wiki/Data_Incident_management" class="mw-redirect" title="Data Incident management"> Data Incident management</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Data_Issues" title="Data Platform/Data Lake/Data Issues"> Data Issue reporting</a></li> <li><a href="https://foundation.wikimedia.org/wiki/Legal:Data_retention_guidelines" class="extiw" 
title="foundation:Legal:Data retention guidelines">Data Retention Guidelines</a></li> <li><a href="/wiki/Data_Platform/Systems/Event_Data_retention" title="Data Platform/Systems/Event Data retention">Event data retention</a></li> <li><a href="/wiki/Data_Platform/Event_Sanitization" title="Data Platform/Event Sanitization">Event Sanitization</a></li> <li><a href="/wiki/Data_Platform/Dataset_archiving_and_deletion" title="Data Platform/Dataset archiving and deletion">Dataset archiving and deletion</a></li></ul></li></ul> </div><div class="tpl-navsidebar-content"> <p class="tpl-navsidebar-heading">Collect data</p><p class="mw-empty-elt"> </p><ul><li><a href="/wiki/Metrics_Platform" title="Metrics Platform">Metrics platform</a></li> <li><a href="/wiki/Event_Platform/Instrumentation_How_To" title="Event Platform/Instrumentation How To">Instrumentation tutorial</a></li> <li><a href="/wiki/Event_Platform" title="Event Platform">Event Platform</a></li></ul> <hr/> </div><div class="tpl-navsidebar-content"> <p class="tpl-navsidebar-heading">Data Platform infrastructure and operations</p><p class="mw-empty-elt"> </p><ul><li><a href="/wiki/Data_Platform/Systems" title="Data Platform/Systems">Systems overview</a></li> <li><a href="/wiki/Category:Data_pipelines" title="Category:Data pipelines"> Data pipelines</a></li> <li>Search <ul><li><a href="/wiki/Search/Technical_interactions" title="Search/Technical interactions"> Using search for new features </a></li> <li><a href="/wiki/Search_Platform/Documentation#Search" title="Search Platform/Documentation"> Search Platform </a></li> <li><a href="/wiki/Wikidata_Query_Service" title="Wikidata Query Service"> Wikidata Query Service (WDQS) </a></li></ul></li> <li>Operations and team processes <ul><li><a href="/wiki/Data_Platform_Engineering/Ops_week" title="Data Platform Engineering/Ops week">Ops week</a></li> <li><a href="/wiki/Data_Platform_Engineering" title="Data Platform Engineering">Team pages on Wikitech</a></li> <li><a 
href="https://www.mediawiki.org/wiki/Data_Platform_Engineering" class="extiw" title="mw:Data Platform Engineering">Team and project pages on MediaWiki.org</a></li></ul></li></ul> </div></div><p class="tpl-navsidebar-foot">[<span class="noprint plainlinks"><a class="external text" href="https://wikitech.wikimedia.org/w/index.php?title=Template:Navigation_Data_Platform&amp;action=edit"><span title="Edit this template">edit</span></a></span>]</p></div> <p>The <b>Analytics Data Lake</b> (ADL), or the <b>Data Lake</b> for short, is a large, analytics-oriented repository of data about Wikimedia projects (in industry terms, a <a href="https://en.wikipedia.org/wiki/data_lake" class="extiw" title="w:data lake">data lake</a>). </p> <meta property="mw:PageProp/toc"/> <div class="mw-heading mw-heading2 ext-discussiontools-init-section"><h2 id="Data_available" data-mw-thread-id="h-Data_available"><span data-mw-comment-start="" id="h-Data_available"></span>Data available<span data-mw-comment-end="h-Data_available"></span></h2><!--__DTELLIPSISBUTTON__{"threadItem":{"headingLevel":2,"name":"h-","type":"heading","level":0,"id":"h-Data_available","replies":[]}}--></div> <style data-mw-deduplicate="TemplateStyles:r2211903">.mw-parser-output .note{background-position:left 7px top 50%;padding:0.5em 0.5em 0.5em 40px;margin:0.5em 0;overflow:hidden;background-color:#f8f9fa;color:#333;background-repeat:no-repeat;border:1px solid #ddd}.mw-parser-output .note-inline{display:inline-block;vertical-align:middle}.mw-parser-output .note-info{background-color:#eaf3ff;color:#333;background-image:url("https://upload.wikimedia.org/wikipedia/commons/e/ec/OOjs_UI_icon_information-progressive.svg");background-size:25px;border-color:#a3caff;padding-left:40px;min-height:25px}.mw-parser-output 
.note-reminder{background-color:#fff9ea;color:#333;background-image:url("https://upload.wikimedia.org/wikipedia/commons/a/a8/OOjs_UI_icon_lightbulb-yellow.svg");background-size:25px;border-color:#fc3;min-height:25px}.mw-parser-output .note-warn{background-color:#fff9ea;color:#333;background-image:url("https://upload.wikimedia.org/wikipedia/commons/3/3b/OOjs_UI_icon_alert-warning.svg");background-size:25px;border-color:#fc3;min-height:25px}.mw-parser-output .note-error{background-color:#fee7e6;color:#333;background-image:url("https://upload.wikimedia.org/wikipedia/commons/b/bf/OOjs_UI_icon_notice-destructive.svg");background-size:25px;border-color:#c33;min-height:25px}@media screen{html.skin-theme-clientpref-night .mw-parser-output .note{background-color:transparent;color:inherit}}@media screen and (prefers-color-scheme:dark){html.skin-theme-clientpref-os .mw-parser-output .note{background-color:transparent;color:inherit}}</style><div role="note" class="note note-info">Currently, you need <a href="/wiki/Data_Platform/Data_access#Production_access" title="Data Platform/Data access">production data access</a> to use some of this data. A lot of it is available publicly at <a class="external text" href="https://dumps.wikimedia.org/">dumps.wikimedia.org</a>.</div> <dl><dt><a href="/wiki/Data_Platform/Data_Lake/Traffic" title="Data Platform/Data Lake/Traffic">Traffic data</a></dt> <dd><a href="/wiki/Data_Platform/Data_Lake/Traffic/Webrequest" title="Data Platform/Data Lake/Traffic/Webrequest">Webrequest</a>, <a href="/wiki/Data_Platform/Data_Lake/Traffic/Pageviews" title="Data Platform/Data Lake/Traffic/Pageviews">pageviews</a>, and <a href="/wiki/Data_Platform/Data_Lake/Traffic/Unique_Devices" title="Data Platform/Data Lake/Traffic/Unique Devices">unique devices</a></dd> <dt><a href="/wiki/Data_Platform/Data_Lake/Edits" title="Data Platform/Data Lake/Edits">Edits data</a></dt> <dd>Historical data about revisions, pages, and users (e.g. 
<a href="/wiki/Data_Platform/Data_Lake/Edits/MediaWiki_history" title="Data Platform/Data Lake/Edits/MediaWiki history">MediaWiki History</a>)</dd> <dt><a href="/wiki/Data_Platform/Data_Lake/Content" title="Data Platform/Data Lake/Content">Content data</a></dt> <dd>Wikitext (<a href="/wiki/Data_Platform/Data_Lake/Content/Mediawiki_wikitext_current" title="Data Platform/Data Lake/Content/Mediawiki wikitext current">latest</a> &amp; <a href="/wiki/Data_Platform/Data_Lake/Content/Mediawiki_wikitext_history" title="Data Platform/Data Lake/Content/Mediawiki wikitext history">historical</a>) and wikidata-entities</dd> <dt><a href="/wiki/Data_Platform/Data_Lake/Events" title="Data Platform/Data Lake/Events">Events data</a></dt> <dd><a href="/wiki/Data_Platform/Systems/EventLogging" title="Data Platform/Systems/EventLogging">EventLogging</a>, EventBus and event streams data (raw, refined, <a href="/wiki/Data_Platform/Systems/Event_Sanitization" class="mw-redirect" title="Data Platform/Systems/Event Sanitization">sanitized</a>)</dd> <dt><a href="/wiki/Commons_Impact_Metrics" title="Commons Impact Metrics">Commons Impact Metrics</a></dt> <dd>Contributions to Wikimedia Commons focused on <a href="https://meta.wikimedia.org/wiki/GLAM" class="extiw" title="meta:GLAM">GLAMs</a></dd></dl> <p>Some of these datasets (such as webrequests) are only available in Hive, while others (such as pageviews) are <i>also</i> available as <a href="https://www.mediawiki.org/wiki/Wikimedia_Product/Data_dictionary" class="extiw" title="mw:Wikimedia Product/Data dictionary">data cubes</a> (usually in a more aggregated form). 
</p> <div class="mw-heading mw-heading2 ext-discussiontools-init-section"><h2 id="Access" data-mw-thread-id="h-Access"><span data-mw-comment-start="" id="h-Access"></span>Access<span data-mw-comment-end="h-Access"></span></h2><!--__DTELLIPSISBUTTON__{"threadItem":{"headingLevel":2,"name":"h-","type":"heading","level":0,"id":"h-Access","replies":["h-Syntax_differences_between_the_SQL_engines-Access","h-Integer_division_in_Presto-Access"]}}--></div> <p>The main way to access the data in the Data Lake is to run queries using one of the three available SQL engines: <a href="/wiki/Data_Platform/Systems/Presto" title="Data Platform/Systems/Presto">Presto</a>, <a href="/wiki/Hive" class="mw-redirect" title="Hive">Hive</a>, and <a href="/wiki/Data_Platform/Systems/Cluster/Spark" class="mw-redirect" title="Data Platform/Systems/Cluster/Spark">Spark</a>. </p><p>You can access these engines through several different routes: </p> <ul><li><a href="/wiki/Data_Platform/Systems/Superset" title="Data Platform/Systems/Superset"><b>Superset</b></a> has a graphical SQL editor where you can run <a href="/wiki/Data_Platform/Systems/Presto" title="Data Platform/Systems/Presto">Presto</a> queries</li> <li><b><a href="/wiki/Analytics/Archive/Hue" title="Analytics/Archive/Hue">Hue</a></b> has a graphical SQL editor where you can run <a href="/wiki/Data_Platform/Systems/Cluster/Hive" class="mw-redirect" title="Data Platform/Systems/Cluster/Hive">Hive</a> queries</li> <li><b>Custom code</b> on one of the <a href="/wiki/Data_Platform/Systems/Clients" title="Data Platform/Systems/Clients">analytics clients</a> (the easiest way to do this is to use our <a href="/wiki/Data_Platform/Systems/Jupyter" title="Data Platform/Systems/Jupyter">Jupyter service</a>) <ul><li>for <b>Python</b>, use the <a href="https://gitlab.wikimedia.org/repos/data-engineering/wmfdata-python" class="extiw" title="gitlab:repos/data-engineering/wmfdata-python">Wmfdata-Python</a> package</li> <li>for <b>R</b>, use the <a 
rel="nofollow" class="external text" href="https://github.com/wikimedia/wmfdata-r">wmfdata-r</a> package</li></ul></li></ul> <p>All three engines also have command-line programs which you can use on one of the <a href="/wiki/Data_Platform/Systems/Clients" title="Data Platform/Systems/Clients">analytics clients</a>. This is probably the least convenient way, but if you want to use it, consult the engine's documentation page. </p> <div class="mw-heading mw-heading3"><h3 id="Syntax_differences_between_the_SQL_engines" data-mw-thread-id="h-Syntax_differences_between_the_SQL_engines-Access"><span data-mw-comment-start="" id="h-Syntax_differences_between_the_SQL_engines-Access"></span>Syntax differences between the SQL engines<span data-mw-comment-end="h-Syntax_differences_between_the_SQL_engines-Access"></span></h3></div> <p>For the most part, Presto, Hive, and Spark work the same way, but they have some differences in SQL syntax. </p> <table class="wikitable"> <caption> </caption> <tbody><tr> <th>use case </th> <th>Spark </th> <th>Presto </th> <th>Hive </th></tr> <tr> <td>keyword for the string data type </td> <td><code>STRING</code> </td> <td><code>VARCHAR</code> </td> <td><code>STRING</code> </td></tr> <tr> <td>string literal </td> <td><code>'foo'</code>, <code>"foo"</code> </td> <td><code>'foo'</code> </td> <td><code>'foo'</code>, <code>"foo"</code> </td></tr> <tr> <td>keyword for 32-bit float data type </td> <td><code>FLOAT</code>, <code>REAL</code> </td> <td><code>REAL</code> </td> <td><code>FLOAT</code> </td></tr> <tr> <td>keyword for 64-bit float data type </td> <td colspan="3"><code>DOUBLE</code> </td></tr> <tr> <td>select a column named with a reserved word (e.g. 
<code>DATE</code>) </td> <td><code>`date`</code> </td> <td><code>"date"</code> </td> <td><code>`date`</code> </td></tr> <tr> <td>get the length of an array </td> <td><code>SIZE(a)</code> </td> <td><code>CARDINALITY(a)</code> </td> <td><code>SIZE(a)</code> </td></tr> <tr> <td>concatenate strings with a separator </td> <td><code>CONCAT_WS</code> </td> <td><i>not available</i> </td> <td><code>CONCAT_WS</code> </td></tr> <tr> <td>count rows which match a condition </td> <td><code>COUNT_IF(x = y)</code> </td> <td><code>COUNT_IF(x = y)</code> </td> <td><code>SUM(CAST(x = y AS INT))</code> </td></tr> <tr> <td>transform integer <code>year</code>/<code>month</code>/<code>day</code> fields to a date string </td> <td><code>CONCAT(year, '-', LPAD(month, 2, '0'), '-', LPAD(day, 2, '0'))</code> </td> <td><code>CONCAT(CAST(year AS VARCHAR), '-', LPAD(CAST(month AS VARCHAR), 2, '0'), '-', LPAD(CAST(day AS VARCHAR), 2, '0'))</code> </td> <td><code>CONCAT(year, '-', LPAD(month, 2, '0'), '-', LPAD(day, 2, '0'))</code> </td></tr> <tr> <td>convert an ISO 8601 timestamp string (e.g. <code>"2021-11-01T01:23:02Z"</code>) to an SQL timestamp </td> <td><code>TO_TIMESTAMP(dt)</code> </td> <td><code>FROM_ISO8601_TIMESTAMP(dt)</code> </td> <td><code>FROM_UNIXTIME(UNIX_TIMESTAMP(dt, "yyyy-MM-dd'T'HH:mm:ss'Z'"))</code> </td></tr> <tr> <td>divide integers, returning a float if necessary </td> <td><code>x / y</code> </td> <td><code>CAST(x AS DOUBLE) / y</code> </td> <td><code>x / y</code> </td></tr> <tr> <td>select the first or last rows in a group </td> <td><code>FIRST</code>, <code>LAST</code> </td> <td><i>not available</i> </td> <td><i>not available</i> </td></tr></tbody></table> <ul><li>It's useful to get in the habit of using single-quoted text (<code>'foo'</code>) for strings, since all three engines interpret it the same way. 
Double quoted text (<code>"foo"</code>) is interpreted as a string in Spark and Hive, but as a column name in Presto.</li> <li>Escaping special characters in string literals works differently in Spark and Presto. See <a rel="nofollow" class="external text" href="https://github.com/nshahquinn/misc-wikimedia-analysis/blob/master/2022-10_SQL_string_escaping.ipynb">this notebook</a> for more details.</li> <li>See also: <a rel="nofollow" class="external text" href="https://prestodb.io/docs/0.273.3/migration/from-hive.html">Presto's guide to migrating from Hive</a></li></ul> <div class="mw-heading mw-heading3"><h3 id="Integer_division_in_Presto" data-mw-thread-id="h-Integer_division_in_Presto-Access"><span data-mw-comment-start="" id="h-Integer_division_in_Presto-Access"></span>Integer division in Presto<span data-mw-comment-end="h-Integer_division_in_Presto-Access"></span></h3></div> <p>If you divide integers, Hive and Spark will return a floating-point number if necessary (e.g. <code>1 / 3</code> returns <code>0.333333</code>). However, Presto will return only an integer (e.g. <code>1 / 3</code> returns <code>0</code>). Use <code>CAST(x AS DOUBLE)</code> to work around this. <code>DOUBLE</code> is a 64-bit floating point number, while <code>REAL</code> is a 32-bit floating point number. 
</p><p> There are some quirks to be aware of with this behavior:</p><div class="mw-highlight mw-highlight-lang-sql mw-content-ltr" dir="ltr"><pre><span></span><span class="k">SELECT</span> <span class="w"> </span><span class="mi">2</span><span class="o">/</span><span class="mi">5</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="ss">"none"</span><span class="p">,</span> <span class="w"> </span><span class="k">CAST</span><span class="p">(</span><span class="mi">2</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">DOUBLE</span><span class="p">)</span><span class="o">/</span><span class="mi">5</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="ss">"numerator"</span><span class="p">,</span> <span class="w"> </span><span class="mi">2</span><span class="o">/</span><span class="k">CAST</span><span class="p">(</span><span class="mi">5</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">DOUBLE</span><span class="p">)</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="ss">"denominator"</span><span class="p">,</span> <span class="w"> </span><span class="k">CAST</span><span class="p">(</span><span class="mi">2</span><span class="o">/</span><span class="mi">5</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">DOUBLE</span><span class="p">)</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="ss">"outer"</span><span class="p">,</span> <span class="w"> </span><span class="mi">2</span><span class="o">/</span><span class="mi">5</span><span class="w"> </span><span class="o">*</span><span class="w"> </span><span class="k">CAST</span><span class="p">(</span><span class="mi">100</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">DOUBLE</span><span 
class="p">)</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="ss">"percentage (a)"</span><span class="p">,</span> <span class="w"> </span><span class="k">CAST</span><span class="p">(</span><span class="mi">2</span><span class="o">/</span><span class="mi">5</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">DOUBLE</span><span class="p">)</span><span class="w"> </span><span class="o">*</span><span class="w"> </span><span class="mi">100</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="ss">"percentage (b)"</span><span class="p">,</span> <span class="w"> </span><span class="k">CAST</span><span class="p">(</span><span class="mi">2</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">DOUBLE</span><span class="p">)</span><span class="w"> </span><span class="o">/</span><span class="w"> </span><span class="mi">5</span><span class="w"> </span><span class="o">*</span><span class="w"> </span><span class="mi">100</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="ss">"percentage (c)"</span><span class="p">,</span> <span class="w"> </span><span class="mi">1</span><span class="p">.</span><span class="mi">0</span><span class="w"> </span><span class="o">*</span><span class="w"> </span><span class="mi">2</span><span class="w"> </span><span class="o">/</span><span class="w"> </span><span class="mi">5</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="ss">"percentage (d)"</span> </pre></div><p>These produce: </p><ul><li><b>none</b>: 0 (because 2/5 is <i>rounded towards 0</i> to keep the output data type integer, same as input)</li> <li><b>numerator</b>, <b>denominator</b>: 0.4</li> <li><b>outer</b>: 0 (because 2/5 is implicitly cast to integer BEFORE being explicitly cast as double)</li> <li><b>percentage</b> <ul><li><b>(a)</b>: 0 (same 
as "none" – 2/5 is cast to int and rounded towards 0 before it reaches the double-typed 100)</li> <li><b>(b)</b>: 0 (same as outer)</li> <li><b>(c)</b>: 40</li> <li><b>(d)</b>: 0.4 (a correct fractional result; unlike (a)–(c), this expression does not multiply by 100)</li></ul></li></ul> <p>So let's say your query has <code>SUM(IF(event.action = 'click', 1, 0)) / COUNT(1)</code> to calculate clickthrough rate. It'll be 0 unless you: </p> <ul><li>explicitly cast either the denominator or the numerator to double, or</li> <li>implicitly cast by multiplying by 1.0 (in the example above, the order of operations applies: <code>1.0 * 2</code> becomes <code>2.0</code>, <i>then</i> that gets divided by 5)</li></ul> <div class="mw-heading mw-heading2 ext-discussiontools-init-section"><h2 id="Table_and_file_formats" data-mw-thread-id="h-Table_and_file_formats"><span data-mw-comment-start="" id="h-Table_and_file_formats"></span>Table and file formats<span data-mw-comment-end="h-Table_and_file_formats"></span></h2><!--__DTELLIPSISBUTTON__{"threadItem":{"headingLevel":2,"name":"h-","type":"heading","level":0,"id":"h-Table_and_file_formats","replies":[]}}--></div> <p>Data Lake tables can be created using either <a href="/wiki/Data_Platform/Systems/Hive" title="Data Platform/Systems/Hive">Hive format</a> or <a href="/wiki/Data_Platform/Systems/Cluster/Iceberg" class="mw-redirect" title="Data Platform/Systems/Cluster/Iceberg">Iceberg format</a>. Iceberg is the successor to Hive, and is highly recommended for new tables. As of Feb 2024, the existing tables in the <code>wmf</code> database are being slowly migrated to Iceberg (<a href="https://phabricator.wikimedia.org/T333013" class="extiw" title="phabricator:T333013">task T333013</a>). </p><p>Both table formats can store data using a variety of underlying file formats; we normally use Parquet with both Hive and Iceberg. 
</p> <div class="mw-heading mw-heading2 ext-discussiontools-init-section"><h2 id="Technical_architecture" data-mw-thread-id="h-Technical_architecture"><span data-mw-comment-start="" id="h-Technical_architecture"></span>Technical architecture<span data-mw-comment-end="h-Technical_architecture"></span></h2><!--__DTELLIPSISBUTTON__{"threadItem":{"headingLevel":2,"name":"h-","type":"heading","level":0,"id":"h-Technical_architecture","replies":[]}}--></div> <p>Data Lake datasets which are available in Hive are stored in the <a href="/wiki/Data_Platform/Systems/Cluster/Hadoop" class="mw-redirect" title="Data Platform/Systems/Cluster/Hadoop">Hadoop</a> Distributed File System (HDFS). The <a rel="nofollow" class="external text" href="https://cwiki.apache.org/confluence/display/Hive/AdminManual+Metastore+Administration">Hive metastore</a> is a centralized repository for metadata about these data files, and all three SQL query engines we use (Presto, Spark SQL, and Hive) rely on it. </p><p>Some Data Lake datasets are available in <a href="/wiki/Data_Platform/Systems/Druid" title="Data Platform/Systems/Druid">Druid</a>, which is separate from Hive and HDFS, and allows quick exploration and dashboarding of those datasets in <a href="/wiki/Data_Platform/Systems/Turnilo" title="Data Platform/Systems/Turnilo">Turnilo</a> and <a href="/wiki/Data_Platform/Systems/Superset" title="Data Platform/Systems/Superset">Superset</a>. </p><p>The <a href="/wiki/Data_Platform/Systems/Cluster" title="Data Platform/Systems/Cluster">Analytics cluster</a>, which consists of Hadoop servers and related components, provides the infrastructure for the Data Lake. 
</p> <div class="mw-heading mw-heading2 ext-discussiontools-init-section"><h2 id="All_Subpages_of_Data_Platform/Data_Lake" data-mw-thread-id="h-All_Subpages_of_Data_Platform/Data_Lake"><span id="All_Subpages_of_Data_Platform.2FData_Lake"></span><span data-mw-comment-start="" id="h-All_Subpages_of_Data_Platform/Data_Lake"></span>All Subpages of Data Platform/Data Lake<span data-mw-comment-end="h-All_Subpages_of_Data_Platform/Data_Lake"></span></h2><!--__DTELLIPSISBUTTON__{"threadItem":{"headingLevel":2,"name":"h-","type":"heading","level":0,"id":"h-All_Subpages_of_Data_Platform\/Data_Lake","replies":[]}}--></div> <div class="mw-prefixindex-body"><ul class="mw-prefixindex-list"><li><a href="/wiki/Data_Platform/Data_Lake/Content" title="Data Platform/Data Lake/Content">Content</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Content/Mediawiki_content_history_v1" title="Data Platform/Data Lake/Content/Mediawiki content history v1">Content/Mediawiki content history v1</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Content/Mediawiki_wikitext_current" title="Data Platform/Data Lake/Content/Mediawiki wikitext current">Content/Mediawiki wikitext current</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Content/Mediawiki_wikitext_history" title="Data Platform/Data Lake/Content/Mediawiki wikitext history">Content/Mediawiki wikitext history</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Content/Wikidata_entity" title="Data Platform/Data Lake/Content/Wikidata entity">Content/Wikidata entity</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Content/Wikidata_item_page_link" title="Data Platform/Data Lake/Content/Wikidata item page link">Content/Wikidata item page link</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Data_Issues" title="Data Platform/Data Lake/Data Issues">Data Issues</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Data_Issues/2021-02-09_Unique_Devices_By_Family_Overcount" title="Data Platform/Data Lake/Data Issues/2021-02-09 Unique Devices 
By Family Overcount">Data Issues/2021-02-09 Unique Devices By Family Overcount</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Data_Issues/2021-06-04_Traffic_Data_Loss" title="Data Platform/Data Lake/Data Issues/2021-06-04 Traffic Data Loss">Data Issues/2021-06-04 Traffic Data Loss</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Data_Issues/2023-01-08_Webrequest_Data_Loss" title="Data Platform/Data Lake/Data Issues/2023-01-08 Webrequest Data Loss">Data Issues/2023-01-08 Webrequest Data Loss</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Data_Issues/2023-11_eventgate-analytics-external_Data_Loss" title="Data Platform/Data Lake/Data Issues/2023-11 eventgate-analytics-external Data Loss">Data Issues/2023-11 eventgate-analytics-external Data Loss</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Data_Issues/2024-10-10_Webrequest_Data_Loss_-_Clobbered_Hadoop_Temporary_Dir" title="Data Platform/Data Lake/Data Issues/2024-10-10 Webrequest Data Loss - Clobbered Hadoop Temporary Dir">Data Issues/2024-10-10 Webrequest Data Loss - Clobbered Hadoop Temporary Dir</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Edits" title="Data Platform/Data Lake/Edits">Edits</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Edits/Edit_hourly" title="Data Platform/Data Lake/Edits/Edit hourly">Edits/Edit hourly</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Edits/Geoeditors" title="Data Platform/Data Lake/Edits/Geoeditors">Edits/Geoeditors</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Edits/Geoeditors/Public" title="Data Platform/Data Lake/Edits/Geoeditors/Public">Edits/Geoeditors/Public</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Edits/MediaWiki_history" title="Data Platform/Data Lake/Edits/MediaWiki history">Edits/MediaWiki history</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Edits/MediaWiki_history/Revision_identity_reverts" title="Data Platform/Data Lake/Edits/MediaWiki history/Revision identity reverts">Edits/MediaWiki history/Revision 
identity reverts</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Edits/MediaWiki_history_dumps" title="Data Platform/Data Lake/Edits/MediaWiki history dumps">Edits/MediaWiki history dumps</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Edits/MediaWiki_history_dumps/FAQ" title="Data Platform/Data Lake/Edits/MediaWiki history dumps/FAQ">Edits/MediaWiki history dumps/FAQ</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Edits/MediaWiki_history_dumps/Python_spark_examples" title="Data Platform/Data Lake/Edits/MediaWiki history dumps/Python spark examples">Edits/MediaWiki history dumps/Python spark examples</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Edits/MediaWiki_history_dumps/Scala_spark_examples" title="Data Platform/Data Lake/Edits/MediaWiki history dumps/Scala spark examples">Edits/MediaWiki history dumps/Scala spark examples</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Edits/Mediawiki_history_dumps/Python_Dask_examples" title="Data Platform/Data Lake/Edits/Mediawiki history dumps/Python Dask examples">Edits/Mediawiki history dumps/Python Dask examples</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Edits/Mediawiki_history_dumps/Python_Pandas_examples" title="Data Platform/Data Lake/Edits/Mediawiki history dumps/Python Pandas examples">Edits/Mediawiki history dumps/Python Pandas examples</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Edits/Mediawiki_history_reduced" title="Data Platform/Data Lake/Edits/Mediawiki history reduced">Edits/Mediawiki history reduced</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Edits/Mediawiki_page_history" title="Data Platform/Data Lake/Edits/Mediawiki page history">Edits/Mediawiki page history</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Edits/Mediawiki_project_namespace_map" title="Data Platform/Data Lake/Edits/Mediawiki project namespace map">Edits/Mediawiki project namespace map</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Edits/Mediawiki_user_history" title="Data Platform/Data 
Lake/Edits/Mediawiki user history">Edits/Mediawiki user history</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Edits/Metrics" title="Data Platform/Data Lake/Edits/Metrics">Edits/Metrics</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Edits/Public" title="Data Platform/Data Lake/Edits/Public">Edits/Public</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Edits/Structured_data/Commons_entity" title="Data Platform/Data Lake/Edits/Structured data/Commons entity">Edits/Structured data/Commons entity</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Events" title="Data Platform/Data Lake/Events">Events</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Project_History" title="Data Platform/Data Lake/Project History">Project History</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Public_Data_Lake" title="Data Platform/Data Lake/Public Data Lake">Public Data Lake</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Traffic" title="Data Platform/Data Lake/Traffic">Traffic</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Traffic/Banner_activity" title="Data Platform/Data Lake/Traffic/Banner activity">Traffic/Banner activity</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Traffic/BotDetection" title="Data Platform/Data Lake/Traffic/BotDetection">Traffic/BotDetection</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Traffic/Browser_general" title="Data Platform/Data Lake/Traffic/Browser general">Traffic/Browser general</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Traffic/Caching" title="Data Platform/Data Lake/Traffic/Caching">Traffic/Caching</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Traffic/Interlanguage" title="Data Platform/Data Lake/Traffic/Interlanguage">Traffic/Interlanguage</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Traffic/Mediacounts" title="Data Platform/Data Lake/Traffic/Mediacounts">Traffic/Mediacounts</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Traffic/Pagecounts-ez" title="Data Platform/Data 
Lake/Traffic/Pagecounts-ez">Traffic/Pagecounts-ez</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Traffic/Pageview_actor" title="Data Platform/Data Lake/Traffic/Pageview actor">Traffic/Pageview actor</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Traffic/Pageview_hourly" title="Data Platform/Data Lake/Traffic/Pageview hourly">Traffic/Pageview hourly</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Traffic/Pageview_hourly/Fingerprinting_Over_Time" title="Data Platform/Data Lake/Traffic/Pageview hourly/Fingerprinting Over Time">Traffic/Pageview hourly/Fingerprinting Over Time</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Traffic/Pageview_hourly/Identity_reconstruction_analysis" title="Data Platform/Data Lake/Traffic/Pageview hourly/Identity reconstruction analysis">Traffic/Pageview hourly/Identity reconstruction analysis</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Traffic/Pageview_hourly/K_Anonymity_Threshold_Analysis" title="Data Platform/Data Lake/Traffic/Pageview hourly/K Anonymity Threshold Analysis">Traffic/Pageview hourly/K Anonymity Threshold Analysis</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Traffic/Pageview_hourly/Sanitization" title="Data Platform/Data Lake/Traffic/Pageview hourly/Sanitization">Traffic/Pageview hourly/Sanitization</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Traffic/Pageview_hourly/Sanitization_algorithm_proposal" title="Data Platform/Data Lake/Traffic/Pageview hourly/Sanitization algorithm proposal">Traffic/Pageview hourly/Sanitization algorithm proposal</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Traffic/Pageviews" title="Data Platform/Data Lake/Traffic/Pageviews">Traffic/Pageviews</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Traffic/Pageviews/Bots" title="Data Platform/Data Lake/Traffic/Pageviews/Bots">Traffic/Pageviews/Bots</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Traffic/Pageviews/Bots_Research" title="Data Platform/Data Lake/Traffic/Pageviews/Bots 
Research">Traffic/Pageviews/Bots Research</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Traffic/Pageviews/Redirects" title="Data Platform/Data Lake/Traffic/Pageviews/Redirects">Traffic/Pageviews/Redirects</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Traffic/Projectview_hourly" title="Data Platform/Data Lake/Traffic/Projectview hourly">Traffic/Projectview hourly</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Traffic/ReaderCounts" title="Data Platform/Data Lake/Traffic/ReaderCounts">Traffic/ReaderCounts</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Traffic/SessionLength" title="Data Platform/Data Lake/Traffic/SessionLength">Traffic/SessionLength</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Traffic/Unique_Devices" title="Data Platform/Data Lake/Traffic/Unique Devices">Traffic/Unique Devices</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Traffic/Unique_Devices/Automated_traffic_correction" title="Data Platform/Data Lake/Traffic/Unique Devices/Automated traffic correction">Traffic/Unique Devices/Automated traffic correction</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Traffic/Unique_Devices/Last_access_solution" title="Data Platform/Data Lake/Traffic/Unique Devices/Last access solution">Traffic/Unique Devices/Last access solution</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Traffic/Unique_Devices/Last_access_solution/Validation" title="Data Platform/Data Lake/Traffic/Unique Devices/Last access solution/Validation">Traffic/Unique Devices/Last access solution/Validation</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Traffic/UserRetention" title="Data Platform/Data Lake/Traffic/UserRetention">Traffic/UserRetention</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Traffic/Virtualpageview_hourly" title="Data Platform/Data Lake/Traffic/Virtualpageview hourly">Traffic/Virtualpageview hourly</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Traffic/Webrequest" title="Data Platform/Data 
Lake/Traffic/Webrequest">Traffic/Webrequest</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Traffic/Webrequest/RawIPUsage" title="Data Platform/Data Lake/Traffic/Webrequest/RawIPUsage">Traffic/Webrequest/RawIPUsage</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Traffic/Webrequest/Tagging" title="Data Platform/Data Lake/Traffic/Webrequest/Tagging">Traffic/Webrequest/Tagging</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Traffic/mediawiki_api_request" title="Data Platform/Data Lake/Traffic/mediawiki api request">Traffic/mediawiki api request</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Traffic/mobile_apps_session_metrics" title="Data Platform/Data Lake/Traffic/mobile apps session metrics">Traffic/mobile apps session metrics</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Traffic/mobile_apps_uniques" title="Data Platform/Data Lake/Traffic/mobile apps uniques">Traffic/mobile apps uniques</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Traffic/referrer_daily" title="Data Platform/Data Lake/Traffic/referrer daily">Traffic/referrer daily</a></li> <li><a href="/wiki/Data_Platform/Data_Lake/Traffic/referrer_daily/Dashboard" title="Data Platform/Data Lake/Traffic/referrer daily/Dashboard">Traffic/referrer daily/Dashboard</a></li> </ul></div> <!-- NewPP limit report Parsed by mw‐web.codfw.main‐f69cdc8f6‐lz44k Cached time: 20241123225332 Cache expiry: 172 Reduced expiry: true Complications: [show‐toc] DiscussionTools time usage: 0.030 seconds CPU time usage: 0.110 seconds Real time usage: 0.129 seconds Preprocessor visited node count: 296/1000000 Post‐expand include size: 13122/2097152 bytes Template argument size: 6839/2097152 bytes Highest expansion depth: 7/100 Expensive parser function count: 2/500 Unstrip recursion depth: 0/20 Unstrip post‐expand size: 19626/5000000 bytes --> <!-- Transclusion expansion time report (%,ms,calls,template) 100.00% 56.583 1 -total 41.77% 23.633 1 Template:Navigation_Data_Platform 36.09% 20.423 1 
Template:Navigation_sidebar 29.24% 16.544 1 Template:Note 19.90% 11.261 1 Special:PrefixIndex/Data_Platform/Data_Lake/ 6.40% 3.622 1 Template:PhabT --> <!-- Saved in parser cache with key labswiki:pcache:idhash:440607-0!canonical and timestamp 20241123225332 and revision id 2241501. Rendering was triggered because: page-view --> </div><!--esi <esi:include src="/esitest-fa8a495983347898/content" /> --><noscript><img src="https://login.wikimedia.org/wiki/Special:CentralAutoLogin/start?type=1x1" alt="" width="1" height="1" style="border: none; position: absolute;"></noscript> <div class="printfooter" data-nosnippet="">Retrieved from "<a dir="ltr" href="https://wikitech.wikimedia.org/w/index.php?title=Data_Platform/Data_Lake&amp;oldid=2241501">https://wikitech.wikimedia.org/w/index.php?title=Data_Platform/Data_Lake&amp;oldid=2241501</a>"</div></div> <div id="catlinks" class="catlinks" data-mw="interface"><div id="mw-normal-catlinks" class="mw-normal-catlinks"><a href="/wiki/Special:Categories" title="Special:Categories">Categories</a>: <ul><li><a href="/wiki/Category:Query_engines" title="Category:Query engines">Query engines</a></li><li><a href="/wiki/Category:Analytics_cluster" title="Category:Analytics cluster">Analytics cluster</a></li><li><a href="/wiki/Category:Data_platform" title="Category:Data platform">Data platform</a></li></ul></div></div> </div> </main> </div> <div class="mw-footer-container"> <footer id="footer" class="mw-footer" > <ul id="footer-info"> <li id="footer-info-lastmod"> This page was last edited on 4 November 2024, at 23:11.</li> <li id="footer-info-copyright">Text is available under the <a rel="nofollow" class="external text" href="https://creativecommons.org/licenses/by-sa/4.0/deed.en">Creative Commons Attribution-ShareAlike License</a>; additional terms may apply. 
See <a class="external text" href="https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Terms_of_Use">Terms of Use</a> for details.</li> </ul> <ul id="footer-places"> <li id="footer-places-privacy"><a href="https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Privacy_policy">Privacy policy</a></li> <li id="footer-places-about"><a href="/wiki/Main_Page">About Wikitech</a></li> <li id="footer-places-disclaimers"><a href="https://foundation.wikimedia.org/wiki/General_disclaimer">Disclaimers</a></li> <li id="footer-places-wm-codeofconduct"><a href="https://www.mediawiki.org/wiki/Special:MyLanguage/Code_of_Conduct">Code of Conduct</a></li> <li id="footer-places-developers"><a href="https://developer.wikimedia.org">Developers</a></li> <li id="footer-places-statslink"><a href="https://stats.wikimedia.org/#/wikitech.wikimedia.org">Statistics</a></li> <li id="footer-places-cookiestatement"><a href="https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Cookie_statement">Cookie statement</a></li> <li id="footer-places-mobileview"><a href="//wikitech.wikimedia.org/w/index.php?title=Data_Platform/Data_Lake&amp;mobileaction=toggle_view_mobile" class="noprint stopMobileRedirectToggle">Mobile view</a></li> </ul> <ul id="footer-icons" class="noprint"> <li id="footer-copyrightico"><a href="https://wikimediafoundation.org/" class="cdx-button cdx-button--fake-button cdx-button--size-large cdx-button--fake-button--enabled"><img src="/static/images/footer/wikimedia-button.svg" width="84" height="29" alt="Wikimedia Foundation" loading="lazy"></a></li> <li id="footer-poweredbyico"><a href="https://www.mediawiki.org/" class="cdx-button cdx-button--fake-button cdx-button--size-large cdx-button--fake-button--enabled"><img src="/w/resources/assets/poweredby_mediawiki.svg" alt="Powered by MediaWiki" width="88" height="31" loading="lazy"></a></li> </ul> </footer> </div> </div> </div> <div class="vector-settings" id="p-dock-bottom"> <ul></ul> 
</div><!--
  Per-response MediaWiki config. The inline script below appends a callback to
  the global RLQ array (creating the array if it does not exist yet); when the
  callback runs it calls mw.config.set() with four keys:
    wgHostname                    : name of the backend host that rendered this response
    wgBackendResponseTime         : numeric timing value (234 here; presumably milliseconds, confirm)
    wgDiscussionToolsPageThreads  : tree of heading thread items; ids match the h-… heading ids in the page
    wgPageParseReport             : parser limit/timing figures, the same numbers as the
                                    "NewPP limit report" comment earlier in this page
  NOTE(review): the callback is presumably drained by the ResourceLoader startup
  module once "mw" is defined; confirm against the ResourceLoader docs.
  The JSON payload is generated server-side; do not hand-edit it.
--><script>(RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgHostname":"mw-web.codfw.main-f69cdc8f6-lz44k","wgBackendResponseTime":234,"wgDiscussionToolsPageThreads":[{"headingLevel":2,"name":"h-","type":"heading","level":0,"id":"h-Data_available","replies":[]},{"headingLevel":2,"name":"h-","type":"heading","level":0,"id":"h-Access","replies":[{"headingLevel":3,"name":"h-","type":"heading","level":0,"id":"h-Syntax_differences_between_the_SQL_engines-Access","replies":[]},{"headingLevel":3,"name":"h-","type":"heading","level":0,"id":"h-Integer_division_in_Presto-Access","replies":[]}]},{"headingLevel":2,"name":"h-","type":"heading","level":0,"id":"h-Table_and_file_formats","replies":[]},{"headingLevel":2,"name":"h-","type":"heading","level":0,"id":"h-Technical_architecture","replies":[]},{"headingLevel":2,"name":"h-","type":"heading","level":0,"id":"h-All_Subpages_of_Data_Platform/Data_Lake","replies":[]}],"wgPageParseReport":{"discussiontools":{"limitreport-timeusage":"0.030"},"limitreport":{"cputime":"0.110","walltime":"0.129","ppvisitednodes":{"value":296,"limit":1000000},"postexpandincludesize":{"value":13122,"limit":2097152},"templateargumentsize":{"value":6839,"limit":2097152},"expansiondepth":{"value":7,"limit":100},"expensivefunctioncount":{"value":2,"limit":500},"unstrip-depth":{"value":0,"limit":20},"unstrip-size":{"value":19626,"limit":5000000},"timingprofile":["100.00% 56.583 1 -total"," 41.77% 23.633 1 Template:Navigation_Data_Platform"," 36.09% 20.423 1 Template:Navigation_sidebar"," 29.24% 16.544 1 Template:Note"," 19.90% 11.261 1 Special:PrefixIndex/Data_Platform/Data_Lake/"," 6.40% 3.622 1 Template:PhabT"]},"cachereport":{"origin":"mw-web.codfw.main-f69cdc8f6-lz44k","timestamp":"20241123225332","ttl":172,"transientcontent":true}}});});</script> </body> </html>

<!-- Pages: 1 2 3 4 5 6 7 8 9 10 -->