CINXE.COM
<!doctype html> <html lang="en" dir="ltr" class="blog-wrapper blog-list-page plugin-blog plugin-id-user_cases" data-has-hydrated="false"> <head> <meta charset="UTF-8"> <meta name="generator" content="Docusaurus v2.4.3"> <title data-rh="true">Blog | Apache SeaTunnel</title><meta data-rh="true" name="viewport" content="width=device-width,initial-scale=1"><meta data-rh="true" name="twitter:card" content="summary_large_image"><meta data-rh="true" property="og:url" content="https://seatunnel.apache.org/user_cases"><meta data-rh="true" name="docusaurus_locale" content="en"><meta data-rh="true" name="docsearch:language" content="en"><meta data-rh="true" property="og:title" content="Blog | Apache SeaTunnel"><meta data-rh="true" name="description" content="Blog"><meta data-rh="true" property="og:description" content="Blog"><meta data-rh="true" name="docusaurus_tag" content="blog_posts_list"><meta data-rh="true" name="docsearch:docusaurus_tag" content="blog_posts_list"><link data-rh="true" rel="icon" href="/image/favicon.ico"><link data-rh="true" rel="canonical" href="https://seatunnel.apache.org/user_cases"><link data-rh="true" rel="alternate" href="https://seatunnel.apache.org/user_cases" hreflang="en"><link data-rh="true" rel="alternate" href="https://seatunnel.apache.org/zh-CN/user_cases" hreflang="zh-CN"><link data-rh="true" rel="alternate" href="https://seatunnel.apache.org/user_cases" hreflang="x-default"><link data-rh="true" rel="preconnect" href="https://S2J1A7LWND-dsn.algolia.net" crossorigin="anonymous"><link rel="alternate" type="application/rss+xml" href="/blog/rss.xml" title="Apache SeaTunnel RSS Feed"> <link rel="alternate" type="application/atom+xml" href="/blog/atom.xml" title="Apache SeaTunnel Atom Feed"> <link rel="search" type="application/opensearchdescription+xml" title="Apache SeaTunnel" href="/opensearch.xml"> <link rel="alternate" type="application/rss+xml" href="/user_cases/rss.xml" title="Apache SeaTunnel RSS Feed"> <link rel="alternate" type="application/atom+xml" href="/user_cases/atom.xml" title="Apache SeaTunnel Atom Feed"> <script>var _paq=window._paq=window._paq||[];_paq.push(["setDoNotTrack",!0]),_paq.push(["disableCookies"]),_paq.push(["trackPageView"]),_paq.push(["enableLinkTracking"]),function(){var a="https://analytics.apache.org/";_paq.push(["setTrackerUrl",a+"matomo.php"]),_paq.push(["setSiteId","65"]);var e=document,p=e.createElement("script"),t=e.getElementsByTagName("script")[0];p.async=!0,p.src=a+"matomo.js",t.parentNode.insertBefore(p,t)}()</script><link rel="stylesheet" href="/assets/css/styles.196a51e3.css"> <link rel="preload" href="/assets/js/runtime~main.49ca96aa.js" as="script"> <link rel="preload" href="/assets/js/main.b6cbf3fe.js" as="script"> </head> <body class="navigation-with-keyboard"> <script>!function(){function t(t){document.documentElement.setAttribute("data-theme",t)}var e=function(){var t=null;try{t=new URLSearchParams(window.location.search).get("docusaurus-theme")}catch(t){}return t}()||function(){var t=null;try{t=localStorage.getItem("theme")}catch(t){}return t}();null!==e?t(e):window.matchMedia("(prefers-color-scheme: dark)").matches?t("dark"):(window.matchMedia("(prefers-color-scheme: light)").matches,t("light"))}(),document.documentElement.setAttribute("data-announcement-bar-initially-dismissed",function(){try{return"true"===localStorage.getItem("docusaurus.announcement.dismiss")}catch(t){}return!1}())</script><div id="__docusaurus"> <div role="region" aria-label="Skip to main content"><a class="skipToContent_fXgn" href="#__docusaurus_skipToContent_fallback">Skip to main content</a></div><div class="announcementBar_mb4j" style="background-color:rgb(70, 125, 175, 0.8)" role="banner"><div class="content_knG7 announcementBarContent_xLdY"><a href="https://github.com/apache/seatunnel" target="_blank" style="display: flex; width: 100%; align-items: center; justify-content: center; margin-left: 4px; text-decoration: none;">⭐️ If you like Apache SeaTunnel, give it a star on GitHub <img style="width: 1.2rem; height: 1.2rem; margin-left: 0.4rem;" src="/home/icons/github1.svg"> ⭐️ </a></div></div><nav aria-label="Main" class="navbar navbar--fixed-top"><div class="navbar__inner"><div class="navbar__items"><button aria-label="Toggle navigation bar" aria-expanded="false" class="navbar__toggle clean-btn" type="button"><svg width="30" height="30" viewBox="0 0 30 30" aria-hidden="true"><path stroke="currentColor" stroke-linecap="round" stroke-miterlimit="10" stroke-width="2" d="M4 7h22M4 15h22M4 23h22"></path></svg></button><a class="navbar__brand" href="/"><div class="navbar__logo"><img src="/image/logo.png" alt="Apache SeaTunnel Logo" class="themedImage_ToTc themedImage--light_HNdA"><img src="/image/logo.png" alt="Apache SeaTunnel Logo" class="themedImage_ToTc themedImage--dark_i4oU"></div><b class="navbar__title text--truncate">Apache SeaTunnel</b></a></div><div class="navbar__items navbar__items--right"><a class="navbar__item navbar__link" href="/">Home</a><div class="navbar__item dropdown dropdown--hoverable dropdown--right"><a href="#" aria-haspopup="true" aria-expanded="false" role="button" class="navbar__link">Document</a><ul class="dropdown__menu"><li><a class="dropdown__link" href="/docs/2.3.9/about">2.3.9</a></li><li><a class="dropdown__link" href="/docs/2.3.8/about">2.3.8</a></li><li><a class="dropdown__link" href="/docs/2.3.7/about">2.3.7</a></li><li><a class="dropdown__link" href="/docs/2.3.6/about">2.3.6</a></li><li><a class="dropdown__link" href="/docs/2.3.5/about">2.3.5</a></li><li><a class="dropdown__link" href="/docs/about">Next</a></li><li><a class="dropdown__link" href="/versions/">All versions</a></li></ul></div><a class="navbar__item navbar__link" href="/download">Download</a><a class="navbar__item navbar__link" href="/community/contribution_guide/contribute">Community</a><a class="navbar__item navbar__link" href="/blog">Blog</a><a aria-current="page" class="navbar__item navbar__link navbar__link--active" href="/user_cases">UserCases</a><a class="navbar__item navbar__link" href="/team">Team</a><a class="navbar__item navbar__link" href="/user">Users</a><div class="navbar__item dropdown dropdown--hoverable dropdown--right"><a href="#" aria-haspopup="true" aria-expanded="false" role="button" class="navbar__link">ASF</a><ul class="dropdown__menu"><li><a href="https://www.apache.org/" target="_blank" rel="noopener noreferrer" class="dropdown__link">Foundation</a></li><li><a href="https://www.apache.org/licenses/" target="_blank" rel="noopener noreferrer" class="dropdown__link">License</a></li><li><a href="https://www.apache.org/events/current-event" target="_blank" rel="noopener noreferrer" class="dropdown__link">Events</a></li><li><a href="https://www.apache.org/foundation/sponsorship.html" target="_blank" rel="noopener noreferrer" class="dropdown__link">Sponsorship</a></li><li><a href="https://www.apache.org/foundation/thanks.html" target="_blank" rel="noopener noreferrer" class="dropdown__link">Thanks</a></li><li><a href="https://apache.org/foundation/policies/privacy.html" target="_blank" rel="noopener noreferrer" class="dropdown__link">Privacy</a></li></ul></div><a href="https://github.com/apache/incubator-seatunnel" target="_blank" rel="noopener noreferrer" class="navbar__item navbar__link">GitHub<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a><a class="navbar__item navbar__link" href="/security">Security</a><div class="navbar__item dropdown dropdown--hoverable dropdown--right"><a href="#" aria-haspopup="true" aria-expanded="false" role="button" class="navbar__link"><svg viewBox="0 0 24 24" width="20" height="20" aria-hidden="true" class="iconLanguage_nlXk"><path fill="currentColor" d="M12.87 15.07l-2.54-2.51.03-.03c1.74-1.94 2.98-4.17 3.71-6.53H17V4h-7V2H8v2H1v1.99h11.17C11.5 7.92 10.44 9.75 9 11.35 8.07 10.32 7.3 9.19 6.69 8h-2c.73 1.63 1.73 3.17 2.98 4.56l-5.09 5.02L4 19l5-5 3.11 3.11.76-2.04zM18.5 10h-2L12 22h2l1.12-3h4.75L21 22h2l-4.5-12zm-2.62 7l1.62-4.33L19.12 17h-3.24z"></path></svg>English</a><ul class="dropdown__menu"><li><a href="/user_cases" target="_self" rel="noopener noreferrer" class="dropdown__link dropdown__link--active" lang="en">English</a></li><li><a href="/zh-CN/user_cases" target="_self" rel="noopener noreferrer" class="dropdown__link" lang="zh-CN">简体中文</a></li></ul></div><div class="toggle_vylO colorModeToggle_DEke"><button class="clean-btn toggleButton_gllP toggleButtonDisabled_aARS" type="button" disabled="" title="Switch between dark and light mode (currently light mode)" aria-label="Switch between dark and light mode (currently light mode)" aria-live="polite"><svg viewBox="0 0 24 24" width="24" height="24" class="lightToggleIcon_pyhR"><path fill="currentColor" d="M12,9c1.65,0,3,1.35,3,3s-1.35,3-3,3s-3-1.35-3-3S10.35,9,12,9 M12,7c-2.76,0-5,2.24-5,5s2.24,5,5,5s5-2.24,5-5 S14.76,7,12,7L12,7z M2,13l2,0c0.55,0,1-0.45,1-1s-0.45-1-1-1l-2,0c-0.55,0-1,0.45-1,1S1.45,13,2,13z M20,13l2,0c0.55,0,1-0.45,1-1 s-0.45-1-1-1l-2,0c-0.55,0-1,0.45-1,1S19.45,13,20,13z M11,2v2c0,0.55,0.45,1,1,1s1-0.45,1-1V2c0-0.55-0.45-1-1-1S11,1.45,11,2z M11,20v2c0,0.55,0.45,1,1,1s1-0.45,1-1v-2c0-0.55-0.45-1-1-1C11.45,19,11,19.45,11,20z M5.99,4.58c-0.39-0.39-1.03-0.39-1.41,0 c-0.39,0.39-0.39,1.03,0,1.41l1.06,1.06c0.39,0.39,1.03,0.39,1.41,0s0.39-1.03,0-1.41L5.99,4.58z M18.36,16.95 c-0.39-0.39-1.03-0.39-1.41,0c-0.39,0.39-0.39,1.03,0,1.41l1.06,1.06c0.39,0.39,1.03,0.39,1.41,0c0.39-0.39,0.39-1.03,0-1.41 L18.36,16.95z M19.42,5.99c0.39-0.39,0.39-1.03,0-1.41c-0.39-0.39-1.03-0.39-1.41,0l-1.06,1.06c-0.39,0.39-0.39,1.03,0,1.41 s1.03,0.39,1.41,0L19.42,5.99z M7.05,18.36c0.39-0.39,0.39-1.03,0-1.41c-0.39-0.39-1.03-0.39-1.41,0l-1.06,1.06 c-0.39,0.39-0.39,1.03,0,1.41s1.03,0.39,1.41,0L7.05,18.36z"></path></svg><svg viewBox="0 0 24 24" width="24" height="24" class="darkToggleIcon_wfgR"><path fill="currentColor" d="M9.37,5.51C9.19,6.15,9.1,6.82,9.1,7.5c0,4.08,3.32,7.4,7.4,7.4c0.68,0,1.35-0.09,1.99-0.27C17.45,17.19,14.93,19,12,19 c-3.86,0-7-3.14-7-7C5,9.07,6.81,6.55,9.37,5.51z M12,3c-4.97,0-9,4.03-9,9s4.03,9,9,9s9-4.03,9-9c0-0.46-0.04-0.92-0.1-1.36 c-0.98,1.37-2.58,2.26-4.4,2.26c-2.98,0-5.4-2.42-5.4-5.4c0-1.81,0.89-3.42,2.26-4.4C12.92,3.04,12.46,3,12,3L12,3z"></path></svg></button></div><div class="searchBox_ZlJk"><button type="button" class="DocSearch DocSearch-Button" aria-label="Search"><span class="DocSearch-Button-Container"><svg width="20" height="20" class="DocSearch-Search-Icon" viewBox="0 0 20 20" aria-hidden="true"><path d="M14.386 14.386l4.0877 4.0877-4.0877-4.0877c-2.9418 2.9419-7.7115 2.9419-10.6533 0-2.9419-2.9418-2.9419-7.7115 0-10.6533 2.9418-2.9419 7.7115-2.9419 10.6533 0 2.9419 2.9418 2.9419 7.7115 0 10.6533z" stroke="currentColor" fill="none" fill-rule="evenodd" stroke-linecap="round" stroke-linejoin="round"></path></svg><span class="DocSearch-Button-Placeholder">Search</span></span><span class="DocSearch-Button-Keys"></span></button></div></div></div><div role="presentation" class="navbar-sidebar__backdrop"></div></nav><div id="__docusaurus_skipToContent_fallback" class="main-wrapper mainWrapper_z2l0"><div class="container margin-vert--lg"><div class="row"><aside class="col col--3"><nav class="sidebar_re4s thin-scrollbar" aria-label="Blog recent posts navigation"><div class="sidebarItemTitle_pO2u margin-bottom--md">Recent posts</div><ul class="sidebarItemList_Yudw clean-list"><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/user_cases/The-practice-of-SeaTunnel-in-Vip">The practice of SeaTunnel in Vip</a></li></ul></nav></aside><main class="col col--7" itemscope="" itemtype="http://schema.org/Blog"><article class="margin-bottom--xl" itemprop="blogPost" itemscope="" itemtype="http://schema.org/BlogPosting"><meta itemprop="description" content="Guest speaker: Vip Senior Big Data Engineer Wang Yu"><header><h2 class="title_f1Hy" itemprop="headline"><a itemprop="url" href="/user_cases/The-practice-of-SeaTunnel-in-Vip">The practice of SeaTunnel in Vip</a></h2><div class="container_mt6G margin-vert--md"><time datetime="2022-02-18T00:00:00.000Z" itemprop="datePublished">February 18, 2022</time> · <!-- -->13 min read</div></header><div class="markdown" itemprop="articleBody"><p>Guest speaker: Vip Senior Big Data Engineer Wang Yu Lecture preparation: Zhang Detong</p><p>Introduction: Vip referenced SeaTunnel as early as version 1.0. We use SeaTunnel to perform some data interaction work between Hive and ClickHouse. Today's presentation will focus on the following points:</p><ul><li>Requirements and pain points of ClickHouse data import;</li><li>Selection of ClickHouse warehousing and warehousing tools;</li><li>Hive to ClickHouse;</li><li>ClickHouse to Hive;</li><li>Integration of SeaTunnel and Vipshop data platform;</li><li>Future outlook;</li></ul><h1>Requirements and pain points of ClickHouse data import</h1><h2 class="anchor anchorWithStickyNavbar_LWe7" id="1-vipshop-data-olap-architecture">1. Vipshop Data OLAP Architecture<a href="#1-vipshop-data-olap-architecture" class="hash-link" aria-label="Direct link to 1. Vipshop Data OLAP Architecture" title="Direct link to 1. Vipshop Data OLAP Architecture"></a></h2><p>The picture shows the OLAP architecture of Vipshop. The modules we are responsible for are the data service and the computing engine in the picture. The underlying data warehouses are divided into offline data warehouses, real-time data warehouses, and lake warehouses. For computing engines, we use Presto, Kylin and Clickhouse. Although Clickhouse is a storage-integrated OLAP database, we have included it in the computing engine part in order to take advantage of Clickhouse's excellent computing performance. Based on OLAP components, we provide SQL data services and non-SQL independent analysis of Vipshop to serve different intelligences. For example, non-SQL services are services that provide data analysis that is closer to the business for BI and commerce. Multiple data applications are abstracted on top of data services. <img loading="lazy" alt="1" src="/assets/images/1-1-f0cdf48f8c90391d9e8cfe410fc32bb1.png" width="2038" height="1440" class="img_ev3q"></p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="2-requirements">2. Requirements<a href="#2-requirements" class="hash-link" aria-label="Direct link to 2. Requirements" title="Direct link to 2. Requirements"></a></h2><p>We connect the underlying Hive, Kudu, and Alluxio components through Presto Connector and Spark components. Big data components can import and export data to and from each other, and you can use appropriate components to analyze data according to the needs and scenarios of data analysis. But when we introduced Clickhouse, it was a data island, and it was difficult to import and export data. There is a lot of work between Hive and Clickhouse to implement import and export. Our first data import and export requirement is to improve the import and export efficiency and incorporate Clickhouse into the big data system. <img loading="lazy" alt="2" src="/assets/images/2-0b1a1413f99ac6195aefc2146ba07c4b.png" width="2037" height="1440" class="img_ev3q"></p><p>The second requirement is that Presto runs SQL relatively slowly. The figure shows an example of slow SQL. The SQL where condition in the figure sets the date, time range, and specific filter conditions. This kind of SQL usage runs slowly because Presto uses partition granularity to push down. Even after optimization by other methods such as Hive's bucket table and bucketing, the return time is a few seconds, which cannot meet business requirements. In this case, we need to use Clickhouse for offline OLAP computing acceleration. <img loading="lazy" alt="3" src="/assets/images/3-a762d5f3914268cbc2b478215d8da72d.png" width="2035" height="1440" class="img_ev3q"></p><p>Our real-time data is written to Clickhouse through Kafka and Flink SQL. However, it is not enough to use real-time data for analysis. It is necessary to use the Hive dimension table and the T+1 real-time table with the ETL calculation number for accelerated transportation in Clickhouse. This requires importing Hive data into Clickhouse, which is our third requirement. <img loading="lazy" alt="4" src="/assets/images/4-03112bb3e1bca7cc97ab7868fbdf6d78.png" width="2035" height="1440" class="img_ev3q"></p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="3-pain-points">3. Pain points<a href="#3-pain-points" class="hash-link" aria-label="Direct link to 3. Pain points" title="Direct link to 3. Pain points"></a></h2><p>First, we introduce a data component to consider its performance. The granularity of the Hive table is five minutes. Is there a component that can support a short ETL process and import the ETL results into Clickhouse within five minutes? Second, we need to ensure the quality of the data, and the accuracy of the data needs to be guaranteed. The number of data entries in Hive and Clickhouse needs to be consistent. If there is a problem with the data quality, can the data be repaired by rerunning and other mechanisms? Third, are the data types that data import needs to support complete? The data types and some mechanisms between different databases are different. We have data types such as HiperLogLog and BitMap that are widely used in a certain storage engine. Whether they can be correctly transmitted and identified, and can be used well.</p><h1>Selection of ClickHouse and Hive warehousing and warehousing tools</h1><p>Based on the pain points in the data business, we have compared and selected data warehouse and warehouse tools. We mainly choose among open source tools, without considering commercial warehouse entry and exit tools, we mainly compare DataX, SeaTunnel, and write Spark programs and use jdbc to insert ClickHouse among the three options. SeaTunnel and Spark rely on Vipshop's own Yarn cluster, which can directly implement distributed reading and writing. DataX is non-distributed, and the startup process between Reader and Writer takes a long time, and the performance is ordinary. The performance of SeaTunnel and Spark for data processing can reach several times that of DataX. Data of more than one billion can run smoothly in SeaTunnel and Spark. DataX has great performance pressure after the amount of data is large, and it is difficult to process data of more than one billion. In terms of read and write plug-in scalability, SeaTunnel supports a variety of data sources and supports users to develop plug-ins. SeaTunnel supports data import into Redis. In terms of stability, since SeaTunnel and DataX are self-contained tools, the stability will be better. The stability aspect of Spark requires attention to code quality. <img loading="lazy" alt="5" src="/assets/images/5-e5c4ff850add4f0334c2f9a0de839c53.png" width="2035" height="1440" class="img_ev3q"></p><p>The amount of data in our exposure table is in the billions of levels every day. We have the performance requirement to complete data processing within 5 minutes. We have the need to import and export data to Redis. We need import and export tools that can be connected to the data platform for task scheduling. . For the consideration of data volume, performance, scalability, and platform compatibility, we chose SeaTunnel as our data warehouse import and export tool.</p><h1>Import Hive data into ClickHouse</h1><p>The following will introduce our use of SeaTunnel. The picture is a Hive table, which is our three-level product dimension table, including category products, dimension categories, and user population information. The primary key of the table is a third-level category ct_third_id, and the following value is the bitmap of two uids, which is the bitmap type of the user id. We need to import this Hive table into Clickhouse. <img loading="lazy" alt="6" src="/assets/images/6-e37187a14e8316f82074f78b8a250a58.png" width="2036" height="1440" class="img_ev3q"></p><p>SeaTunnel is easy to install, and the official website documentation describes how to install it. The figure below shows the configuration of SeaTunnel. In the configuration, env, source and sink are essential. In the env part, the example in the figure is the Spark configuration. The configuration includes concurrency, etc. These parameters can be adjusted. The source part is the data source. The Hive data source is configured here, including a Hive Select statement. Spark runs the SQL in the source configuration to read the data. UDF is supported here for simple ETL; the sink part is configured with Clickhouse, and you can see output_type= rowbinary and rowbinary are the self-developed acceleration solutions of Vipshop; pre_sql and check_sql are self-developed functions for data verification, which will be described in detail later; clickhouse.socket_timeout and bulk_size can be adjusted according to the actual situation. <img loading="lazy" alt="7" src="/assets/images/7-70fd49824f2fe94faa0ed91b9e31fdf6.png" width="2037" height="1440" class="img_ev3q"></p><p>Run SeaTunnel, execute the sh script file, configure the conf file address and yarn information, and then you can. <img loading="lazy" alt="8" src="/assets/images/8-fa6d66bcb7d1a37038666d5d63c64d3d.png" width="2038" height="1440" class="img_ev3q"> Spark logs are generated during the running process, and both successful running and running errors can be viewed in the logs. <img loading="lazy" alt="9" src="/assets/images/9-f08855ea40230b4f16d6c04408063346.png" width="2038" height="1440" class="img_ev3q"></p><p>In order to better fit the business, Vipshop will make some improvements to SeaTunnel. All our ETL tasks need to be rerun. We support pre_sql and check_sql to implement data rerun and logarithm. The main process is to execute pre_sql for preprocessing after the data is ready, delete the old partition data in Clickhouse, store it in a directory, and restore the partition and rename when it fails. check_sql will check, and the whole process will end after the check is passed; if the check fails, it will be rerun according to the configuration, and if the rerun fails, the corresponding person in charge will be alerted. <img loading="lazy" alt="10" src="/assets/images/10-15459320d13177fafe0fd49a06f92e03.png" width="2037" height="1440" class="img_ev3q"></p><p>Based on the 1.0 version of SeaTunnel, Vipshop has added RowBinary for acceleration, and it also makes it easier to import the binary files of HuperLogLog and BinaryBitmap from Hive to Clickhouse. We made changes in ClickHouse-jdbc, bulk_size, Hive-source. Use the extended api of CK-jdbc to write data to CK in rowbinary mode. Bulk_size introduces the control logic for writing to CK in rowbinary mode. Hive-source RDD is partitioned with HashPartitioner to break up data to prevent data from being skewed.</p><p>We also let SeaTunnel support multiple types. In order to circle the crowd, corresponding methods need to be implemented in Clickhouse, Preso, and Spark. We added Callback, HttpEntity, and RowBinaryStream that support Batch feature to Clickhouse-jdbc, added bitmap type mapping to Clickhouse-jdbc and Clickhouse-sink code, and implemented UDF of Clickhouse's Hyperloglog and Bitmap functions in Presto and Spark. In the previous configuration, the clickhouse-sink part can specify the table name, and here is the difference between writing to the local table and the distributed table. The performance of writing to a distributed table is worse than that of writing to a local table, which will put more pressure on the Clickhouse cluster. However, in scenarios such as exposure meter, flow meter, and ABTest, two tables are required to join, and both tables are in the order of billions. . At this time, we hope that the Join key falls on the local machine, and the Join cost is smaller. When we built the table, we configured the murmurHash64 rule in the distributed table distribution rules of Clickhouse, and then directly configured the distributed table in the sink of Seatunnel, handed the writing rules to Clickhouse, and used the characteristics of the distributed table to write. Writing to the local table will put less pressure on Clickhouse, and the performance of writing will be better. In Seatunnel, we go to Clickhouse's System.cluster table to obtain the table distribution information and machine distribution host according to the sink's local table. Then write to these hosts according to the equalization rule. Put the distributed writing of data into Seatunnel. For the writing of local tables and distributed tables, our future transformation direction is to implement consistent hashing in Seatunnel, write directly according to certain rules, such as Clickhouse, without relying on Clickhouse itself for data distribution, and improve Clickhouse's high CPU load problem.</p><h1>ClickHouse data import into Hive</h1><p>We have the needs of people in the circle. Every day, Vipshop gathers 200,000 people in the supplier circle, such as people born in the 1980s, Gao Fushuai, and Bai Fumei. These Bitmap crowd information in Clickhouse needs to be exported to the Hive table, coordinated with other ETL tasks in Hive, and finally pushed to PIKA for use by external media. We made SeaTunnel back-push Clickhouse Bitmap crowd data to Hive. <img loading="lazy" alt="11" src="/assets/images/11-8f601d01cedafe8901b091b5ddcefd59.png" width="2035" height="1440" class="img_ev3q"></p><p>The figure shows the SeaTunnel configuration. We configure the source as Clickhouse, the sink as Hive, and the data verification is also configured in Hive. <img loading="lazy" alt="12" src="/assets/images/12-9a03852b4e4065b3f9f909da1bb4b672.png" width="2035" height="1440" class="img_ev3q"></p><p>Since we access SeaTunnel earlier, we have processed some modules, including adding plugin-spark-sink-hive module, plugin-spark-source-ClickHouse module, and rewriting Spark Row related methods so that they can be packaged through The Clickhouse data mapped by Schem, reconstruct the StructField and generate the DataFrame that finally needs to land on Hive. The latest version has many source and sink components, which is more convenient to use in SeaTunnel. It is now also possible to integrate the Flink connector directly in SeaTunnel.</p><h1>Integration of SeaTunnel and Vipshop Data Platform</h1><p>Each company has its own scheduling system, such as Beluga, Zeus. The scheduling tool of Vipshop is Shufang, and the scheduling tool integrates the data transmission tool. The following is the architecture diagram of the scheduling system, which includes the entry and exit of various types of data. <img loading="lazy" alt="13" src="/assets/images/13-c93b070b1abe679688f557f3b6bcfc52.png" width="2036" height="1440" class="img_ev3q"></p><p>The SeaTunnel task type is integrated into the platform. The picture is a screenshot of the scheduled task of Shufang. You can see that the selected part is a configured SeaTunnel task. resource information. The following shows the historical running instance information. <img loading="lazy" alt="14" src="/assets/images/14-654566bbfb5ed478b1a9dcd28f2dba72.png" width="2037" height="1440" class="img_ev3q"></p><p>We integrated SeaTunnel into the scheduling system. The Shufang Scheduling Master will assign tasks to the corresponding Agents according to the task types, and assign them to the appropriate machines to run according to the Agent load. The controller pulls the task scheduling configuration and information in the foreground. After arriving, a SeaTunnel cluster is generated and executed in a virtual environment similar to k8s pod and cgroup isolation. The running result will be judged by the data quality monitoring of the scheduling platform whether the task is completed and whether the operation is successful, and if it fails, it will rerun and alarm. <img loading="lazy" alt="15" src="/assets/images/15-798a1029db30ff228befcc333645c3e4.png" width="2038" height="1440" class="img_ev3q"></p><p>SeaTunnel itself is a tool-based component, which is used to manage and control data blood relationship, data quality, historical records, high-alert monitoring, and resource allocation. We integrate SeaTunnel into the platform, and we can take advantage of the platform to take advantage of SeaTunnel. SeaTunnel is used for processing in the deposit crowd. By managing data, we divide the circled people into different people according to their paths and usage conditions, or thousands of people and thousands of faces, tag users, and push a certain type of people circled to users, analysts and suppliers. <img loading="lazy" alt="16" src="/assets/images/16-8947f00847b3a55a537124964206e1b2.png" width="2036" height="1440" class="img_ev3q"></p><p>The traffic enters Kafka, enters the warehouse through Flink, and then forms a user label table through ETL. After the user label table is generated, we use the BitMap method implemented by Presto to type the data into a wide table in Hive. Users create tasks by box-selecting entries on the crowd system page, submit to Tengqun, and generate SQL query Clickhouse BitMap. Clickhouse's BitMap query speed is very fast. Due to its inherent advantages, we need to import Hive's BitMap table into Clickhouse through SeaTunnel. After the crowd is circled, we need to land the table to form a partition or a record of Clickhouse, and then store the resulting BitMap table in Hive through SeaTunnel. Finally, the synchronization tool will synchronize Hive's BitMap crowd results to the external media repository Pika. Around 20w people are circled every day. During the whole process, SeaTunnel is responsible for exporting the data from Hive to Clickhouse. After the ETL process of Clickhouse is completed, SeaTunnel exports the data from Clickhouse to Hive. In order to fulfill this requirement, we implemented UDFs of ClickHouse's Hyperloglog and BitMap functions on Presto and Spark; we also developed the Seatunnel interface, so that the crowds circled by users using the Bitmap method in ClickHouse can be directly written to the Hive table through Seatunnel , without an intermediate landing step. Users can also call the SeaTunnel interface through spark to circle the crowd or reverse the crowd bitmap in Hive, so that the data can be directly transmitted to the ClickHouse result table without intermediate landing.</p><h1>Follow-up</h1><p>In the future, we will further improve the problem of high CPU load when Clickhouse writes data. In the next step, we will implement the CK-local mode of Clickhouse data source and read end in SeaTunnel, separate read and write, and reduce Clickhouse pressure. In the future we will also add more sink support, such as data push to Pika and corresponding data checking.</p></div><footer class="row docusaurus-mt-lg"><div class="col"><b>Tags:</b><ul class="tags_jXut padding--none margin-left--sm"><li class="tag_QGVx"><a class="tag_zVej tagRegular_sFm0" href="/user_cases/tags/vip">Vip</a></li><li class="tag_QGVx"><a class="tag_zVej tagRegular_sFm0" href="/user_cases/tags/click-house">ClickHouse</a></li></ul></div></footer></article><nav class="pagination-nav" aria-label="Blog list page navigation"></nav></main></div></div></div><footer class="footer"><div class="container container-fluid"><div class="row footer__links"><div class="col footer__col"><div class="footer__title">SeaTunnel</div><ul class="footer__items clean-list"><li class="footer__item"><a class="footer__link-item" href="/docs/faq">FAQ</a></li><li class="footer__item"><a href="https://github.com/apache/incubator-seatunnel/releases" target="_blank" rel="noopener noreferrer" class="footer__link-item">Releases<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li></ul></div><div class="col footer__col"><div class="footer__title">Community</div><ul class="footer__items clean-list"><li class="footer__item"><a href="https://github.com/apache/incubator-seatunnel" target="_blank" rel="noopener noreferrer" class="footer__link-item">GitHub<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li class="footer__item"><a href="https://github.com/apache/incubator-seatunnel/issues" target="_blank" rel="noopener noreferrer" class="footer__link-item">Issue Tracker<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li class="footer__item"><a href="https://github.com/apache/incubator-seatunnel/pulls" target="_blank" rel="noopener noreferrer" class="footer__link-item">Pull Requests<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li></ul></div><div class="col footer__col"><div class="footer__title">Subscribe Mailing List</div><ul class="footer__items clean-list"><li class="footer__item"><a class="footer__link-item" href="/community/contribution_guide/subscribe">How to Subscribe</a></li><li class="footer__item"><a href="mailto:dev-subscribe@seatunnel.apache.org" target="_blank" rel="noopener noreferrer" class="footer__link-item">Subscribe Mail<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li class="footer__item"><a href="https://lists.apache.org/list.html?dev@seatunnel.apache.org" target="_blank" rel="noopener noreferrer" class="footer__link-item">Mail Archive<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li></ul></div></div><div class="footer__bottom text--center"><div class="footer__copyright"> <div style="margin-top: 20px;"> <img style="height:50px;margin: 30px 0 10px;" alt="Apache Software Foundation" src="/image/asf_logo_wide.svg"> <div style="border-top: 1px solid #ccc;min-height: 60px;line-height: 20px;text-align: center;font-family: Avenir-Medium;font-size: 14px;color: #999;display: flex;align-items: center;"><span>Copyright © 2021-2025 The Apache Software Foundation. Apache SeaTunnel, SeaTunnel, and its feather logo are trademarks of The Apache Software Foundation.</span></div> <div style="text-align: center;"> <a href="https://twitter.com/asfseatunnel?s=21" target="_blank" title="Twitter"><svg t="1644553365083" class="icon" viewBox="0 0 1260 1024" version="1.1" xmlns="http://www.w3.org/2000/svg" p-id="7015" width="38" height="38"><path d="M1259.846921 121.148242c-46.524504 20.728739-96.273478 34.547899-148.325646 40.536201 53.434084-31.784067 94.430924-82.454319 113.777747-142.797982-50.209613 29.480874-105.486251 51.13089-164.447999 62.646857A257.584528 257.584528 0 0 0 872.449815 0.000276c-142.797982 0-258.418284 115.620302-258.418284 258.418284 0 20.268101 2.303193 40.075563 6.909579 58.961748C405.82286 306.32498 215.579097 203.602561 87.98219 47.446058c-22.110655 38.233008-35.008538 82.454319-35.008538 129.900099 0 89.824537 45.603227 168.593747 115.159663 215.118251-42.378756-1.381916-81.99368-12.897882-117.002217-32.244706v3.224471c0 125.293713 88.90326 229.398049 207.287393 253.351259-21.650017 5.988302-44.681949 9.212773-68.17452 9.212773-16.582991 0-32.705344-1.842555-48.827697-4.606387 32.705344 102.722419 128.518184 177.345881 241.374653 179.649074-88.442621 69.095798-199.917175 110.553277-321.06514 110.553277-20.728739 0-41.457479-1.381916-61.72558-3.685109 114.238386 73.241546 250.126788 116.08094 396.149241 116.08094 475.379089 0 735.179289-393.846048 735.179289-735.179289 0-11.055328-0.460639-22.571294-0.921277-33.626621 51.13089-36.851092 94.891562-82.454319 129.439461-134.045848z" fill="#909094" p-id="7016"></path></svg></a> <a href="https://s.apache.org/seatunnel-slack" target="_blank" title="Slack" style="margin-left: 20px;"><svg t="1644553076784" class="icon" viewBox="0 0 1024 1024" version="1.1" xmlns="http://www.w3.org/2000/svg" p-id="3088" width="36" height="36"><path d="M215.125333 647.04a107.861333 107.861333 0 0 1-107.52 107.648A107.861333 107.861333 0 0 1 0 647.04a107.818667 107.818667 0 0 1 107.605333-107.52h107.52v107.52z m54.229334 0a107.818667 107.818667 0 0 1 107.562666-107.52 107.818667 107.818667 0 0 1 107.562667 107.52v269.354667A107.861333 107.861333 0 0 1 376.917333 1024a107.861333 107.861333 0 0 1-107.562666-107.605333v-269.354667zM376.917333 215.125333a107.861333 107.861333 0 0 1-107.562666-107.52A107.861333 107.861333 0 0 1 376.917333 0a107.861333 107.861333 0 0 1 107.562667 107.605333v107.52H376.917333z m0 54.229334a107.861333 107.861333 0 0 1 107.562667 107.562666 107.861333 107.861333 0 0 1-107.562667 107.562667H107.605333A107.861333 107.861333 0 0 1 0 376.917333a107.861333 107.861333 0 0 1 107.605333-107.562666h269.312z m431.872 107.562666a107.861333 107.861333 0 0 1 107.605334-107.562666A107.861333 107.861333 0 0 1 1024 376.917333a107.861333 107.861333 0 0 1-107.605333 107.562667h-107.605334V376.917333z m-54.101333 0a107.861333 107.861333 0 0 1-107.648 107.562667 107.818667 107.818667 0 0 1-107.52-107.562667V107.605333A107.818667 107.818667 0 0 1 647.04 0a107.861333 107.861333 0 0 1 107.648 107.605333v269.312z m-107.648 431.872a107.861333 107.861333 0 0 1 107.648 107.605334A107.861333 107.861333 0 0 1 647.04 1024a107.818667 107.818667 0 0 1-107.52-107.605333v-107.605334h107.52z m0-54.101333a107.818667 107.818667 0 0 1-107.52-107.648 107.776 107.776 0 0 1 107.52-107.52h269.354667A107.818667 107.818667 0 0 1 1024 647.04a107.861333 107.861333 0 0 1-107.605333 107.648h-269.354667z" p-id="3089" fill="#909094"></path></svg></a> <a href="https://lists.apache.org/list.html?dev@seatunnel.apache.org" target="_blank" title="Mailing list" style="margin-left: 20px;"><svg t="1644553175467" class="icon" viewBox="0 0 1024 1024" version="1.1" xmlns="http://www.w3.org/2000/svg" p-id="5304" width="39" height="39"><path d="M853.333333 170.666667H170.666667c-46.933333 0-85.333333 38.4-85.333334 85.333333v512c0 46.933333 38.4 85.333333 85.333334 85.333333h682.666666c46.933333 0 85.333333-38.4 85.333334-85.333333V256c0-46.933333-38.4-85.333333-85.333334-85.333333z m0 170.666666l-341.333333 213.333334-341.333333-213.333334V256l341.333333 213.333333 341.333333-213.333333v85.333333z" p-id="5305" fill="#909094"></path></svg></a> <a href="https://github.com/apache/seatunnel" target="_blank" title="GitHub" style="margin-left: 20px;"><svg t="1644553223000" class="icon" viewBox="0 0 1024 1024" version="1.1" xmlns="http://www.w3.org/2000/svg" p-id="6156" width="36" height="36"><path d="M512 12.64c-282.752 0-512 229.216-512 512 0 226.208 146.72 418.144 350.144 485.824 25.6 4.736 35.008-11.104 35.008-24.64 0-12.192-0.48-52.544-0.704-95.328-142.464 30.976-172.512-60.416-172.512-60.416-23.296-59.168-56.832-74.912-56.832-74.912-46.464-31.776 3.52-31.136 3.52-31.136 51.392 3.616 78.464 52.768 78.464 52.768 45.664 78.272 119.776 55.648 148.992 42.56 4.576-33.088 17.856-55.68 32.512-68.48-113.728-12.928-233.28-56.864-233.28-253.024 0-55.904 20-101.568 52.768-137.44-5.312-12.896-22.848-64.96 4.96-135.488 0 0 43.008-13.76 140.832 52.48a491.296 491.296 0 0 1 128.16-17.248c43.488 0.192 87.328 5.888 128.256 17.248 97.728-66.24 140.64-52.48 140.64-52.48 27.872 70.528 10.336 122.592 5.024 135.488 32.832 35.84 52.704 81.536 52.704 137.44 0 196.64-119.776 239.936-233.792 252.64 18.368 15.904 34.72 47.04 34.72 94.816 0 68.512-0.608 123.648-0.608 140.512 0 13.632 9.216 29.6 35.168 24.576C877.472 942.624 1024 750.784 1024 524.64c0-282.784-229.248-512-512-512z" p-id="6157" fill="#909094"></path></svg></a> </div> <div></div></div></div></div></div></footer></div> <script src="/assets/js/runtime~main.49ca96aa.js"></script> <script src="/assets/js/main.b6cbf3fe.js"></script> </body> </html>