This is an automated email from the ASF dual-hosted git repository. vinoth pushed a commit to branch asf-site in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/asf-site by this push: new ffe4e7f Travis CI build asf-site ffe4e7f is described below commit ffe4e7f70b14cd5c5949c7223b005c9af5cb7891 Author: CI <ci...@hudi.apache.org> AuthorDate: Sat Feb 20 11:55:15 2021 +0000 Travis CI build asf-site --- content/activity.html | 24 ++ content/assets/js/lunr/lunr-store.js | 5 + content/blog.html | 24 ++ content/blog/hudi-key-generators/index.html | 637 ++++++++++++++++++++++++++++ content/cn/activity.html | 24 ++ content/sitemap.xml | 4 + 6 files changed, 718 insertions(+) diff --git a/content/activity.html b/content/activity.html index 7ec278e..71bf9b4 100644 --- a/content/activity.html +++ b/content/activity.html @@ -193,6 +193,30 @@ <h2 class="archive__item-title" itemprop="headline"> + <a href="/blog/hudi-key-generators/" rel="permalink">Apache Hudi Key Generators +</a> + + </h2> + <!-- Look the author details up from the site config. --> + + <!-- Output author details if some exist. --> + + + <p class="archive__item-excerpt" itemprop="description">Different key generators available with Apache Hudi +</p> + </article> +</div> + + + + + + +<div class="list__item"> + <article class="archive__item" itemscope itemtype="https://schema.org/CreativeWork"> + + <h2 class="archive__item-title" itemprop="headline"> + <a href="/blog/hudi-clustering-intro/" rel="permalink">Optimize Data lake layout using Clustering in Apache Hudi </a> diff --git a/content/assets/js/lunr/lunr-store.js b/content/assets/js/lunr/lunr-store.js index bf8a81a..ae425f0 100644 --- a/content/assets/js/lunr/lunr-store.js +++ b/content/assets/js/lunr/lunr-store.js @@ -1438,4 +1438,9 @@ var store = [{ "excerpt":"Background Apache Hudi brings stream processing to big data, providing fresh data while being an order of magnitude efficient over traditional batch processing. In a data lake/warehouse, one of the key trade-offs is between ingestion speed and query performance. 
Data ingestion typically prefers small files to improve parallelism and make...","categories": ["blog"], "tags": [], "url": "https://hudi.apache.org/blog/hudi-clustering-intro/", + "teaser":"https://hudi.apache.org/assets/images/500x300.png"},{ + "title": "Apache Hudi Key Generators", + "excerpt":"Every record in Hudi is uniquely identified by a HoodieKey, which is a pair of record key and partition path where the record belongs to. Hudi has imposed this constraint so that updates and deletes can be applied to the record of interest. Hudi relies on the partition path field...","categories": ["blog"], + "tags": [], + "url": "https://hudi.apache.org/blog/hudi-key-generators/", "teaser":"https://hudi.apache.org/assets/images/500x300.png"},] diff --git a/content/blog.html b/content/blog.html index 004a368..0935196 100644 --- a/content/blog.html +++ b/content/blog.html @@ -191,6 +191,30 @@ <h2 class="archive__item-title" itemprop="headline"> + <a href="/blog/hudi-key-generators/" rel="permalink">Apache Hudi Key Generators +</a> + + </h2> + <!-- Look the author details up from the site config. --> + + <!-- Output author details if some exist. 
--> + + + <p class="archive__item-excerpt" itemprop="description">Different key generators available with Apache Hudi +</p> + </article> +</div> + + + + + + +<div class="list__item"> + <article class="archive__item" itemscope itemtype="https://schema.org/CreativeWork"> + + <h2 class="archive__item-title" itemprop="headline"> + <a href="/blog/hudi-clustering-intro/" rel="permalink">Optimize Data lake layout using Clustering in Apache Hudi </a> diff --git a/content/blog/hudi-key-generators/index.html b/content/blog/hudi-key-generators/index.html new file mode 100644 index 0000000..508dafb --- /dev/null +++ b/content/blog/hudi-key-generators/index.html @@ -0,0 +1,637 @@ +<!doctype html> +<html lang="en" class="no-js"> + <head> + <meta charset="utf-8"> + +<!-- begin _includes/seo.html --><title>Apache Hudi Key Generators - Apache Hudi</title> +<meta name="description" content="Different key generators available with Apache Hudi"> + +<meta property="og:type" content="article"> +<meta property="og:locale" content="en_US"> +<meta property="og:site_name" content=""> +<meta property="og:title" content="Apache Hudi Key Generators"> +<meta property="og:url" content="https://hudi.apache.org/blog/hudi-key-generators/"> + + + <meta property="og:description" content="Different key generators available with Apache Hudi"> + + + + + + + + + + + +<!-- end _includes/seo.html --> + + +<!--<link href="/feed.xml" type="application/atom+xml" rel="alternate" title=" Feed">--> + +<!-- https://t.co/dKP3o1e --> +<meta name="viewport" content="width=device-width, initial-scale=1.0"> + +<script> + document.documentElement.className = document.documentElement.className.replace(/\bno-js\b/g, '') + ' js '; +</script> + +<!-- For all browsers --> +<link rel="stylesheet" href="/assets/css/main.css"> + +<!--[if IE]> + <style> + /* old IE unsupported flexbox fixes */ + .greedy-nav .site-title { + padding-right: 3em; + } + .greedy-nav button { + position: absolute; + top: 0; + right: 0; + height: 100%; 
+ } + </style> +<![endif]--> + + + +<link rel="icon" type="image/x-icon" href="/assets/images/favicon.ico"> +<link rel="stylesheet" href="/assets/css/font-awesome.min.css"> +<script src="/assets/js/jquery.min.js"></script> + + +<script src="/assets/js/main.min.js"></script> + + </head> + + <body class="layout--single"> + <!--[if lt IE 9]> +<div class="notice--danger align-center" style="margin: 0;">You are using an <strong>outdated</strong> browser. Please <a href="https://browsehappy.com/">upgrade your browser</a> to improve your experience.</div> +<![endif]--> + + <div class="masthead"> + <div class="masthead__inner-wrap" id="masthead__inner-wrap"> + <div class="masthead__menu"> + <nav id="site-nav" class="greedy-nav"> + + <a class="site-logo" href="/"> + <div style="width: 150px; height: 40px"> + </div> + </a> + + <a class="site-title" href="/"> + + </a> + <ul class="visible-links"><li class="masthead__menu-item"> + <a href="/docs/quick-start-guide.html" target="_self" >Documentation</a> + </li><li class="masthead__menu-item"> + <a href="/community.html" target="_self" >Community</a> + </li><li class="masthead__menu-item"> + <a href="/blog.html" target="_self" >Blog</a> + </li><li class="masthead__menu-item"> + <a href="https://cwiki.apache.org/confluence/display/HUDI/FAQ" target="_blank" >FAQ</a> + </li><li class="masthead__menu-item"> + <a href="/docs/powered_by.html" target="_self" >Powered By</a> + </li><li class="masthead__menu-item"> + <a href="/releases.html" target="_self" >Releases</a> + </li></ul> + <button class="greedy-nav__toggle hidden" type="button"> + <span class="visually-hidden">Toggle menu</span> + <div class="navicon"></div> + </button> + <ul class="hidden-links hidden"></ul> + </nav> + </div> + </div> +</div> +<!-- +<p class="notice--warning" style="margin: 0 !important; text-align: center !important;"><strong>Note:</strong> This site is work in progress, if you notice any issues, please <a target="_blank" 
href="https://github.com/apache/hudi/issues">Report on Issue</a>. + Click <a href="/"> here</a> back to old site.</p> +--> + + <div class="initial-content"> + <div id="main" role="main"> + + + <div class="sidebar sticky"> + + + <div itemscope itemtype="https://schema.org/Person"> + + <div class="author__content"> + + <h3 class="author__name" itemprop="name">Quick Links</h3> + + + <div class="author__bio" itemprop="description"> + <p>Hudi <em>ingests</em> & <em>manages</em> storage of large analytical datasets over DFS.</p> + + </div> + + </div> + + <div class="author__urls-wrapper"> + <ul class="author__urls social-icons"> + + + <li><a href="/docs/quick-start-guide" target="_self" rel="nofollow noopener noreferrer"><i class="fa fa-book" aria-hidden="true"></i> Documentation</a></li> + + + + <li><a href="https://cwiki.apache.org/confluence/display/HUDI" target="_blank" rel="nofollow noopener noreferrer"><i class="fa fa-wikipedia-w" aria-hidden="true"></i> Technical Wiki</a></li> + + + + <li><a href="/contributing" target="_self" rel="nofollow noopener noreferrer"><i class="fa fa-thumbs-o-up" aria-hidden="true"></i> Contribution Guide</a></li> + + + + <li><a href="https://join.slack.com/t/apache-hudi/shared_invite/enQtODYyNDAxNzc5MTg2LTE5OTBlYmVhYjM0N2ZhOTJjOWM4YzBmMWU2MjZjMGE4NDc5ZDFiOGQ2N2VkYTVkNzU3ZDQ4OTI1NmFmYWQ0NzE" target="_blank" rel="nofollow noopener noreferrer"><i class="fa fa-slack" aria-hidden="true"></i> Join on Slack</a></li> + + + + <li><a href="https://github.com/apache/hudi" target="_blank" rel="nofollow noopener noreferrer"><i class="fa fa-github" aria-hidden="true"></i> Fork on GitHub</a></li> + + + + <li><a href="https://issues.apache.org/jira/projects/HUDI/summary" target="_blank" rel="nofollow noopener noreferrer"><i class="fa fa-navicon" aria-hidden="true"></i> Report Issues</a></li> + + + + <li><a href="/security" target="_self" rel="nofollow noopener noreferrer"><i class="fa fa-navicon" aria-hidden="true"></i> Report Security Issues</a></li> 
+ + + + + </ul> + </div> +</div> + + + + + </div> + + + <article class="page" itemscope itemtype="https://schema.org/CreativeWork"> + <!-- Look the author details up from the site config. --> + + + <div class="page__inner-wrap"> + + <header> + <h1 id="page-title" class="page__title" itemprop="headline">Apache Hudi Key Generators +</h1> + <!-- Output author details if some exist. --> + + </header> + + + <section class="page__content" itemprop="text"> + + <style> + .page { + padding-right: 0 !important; + } + </style> + + <p>Every record in Hudi is uniquely identified by a HoodieKey, which is a pair of record key and partition path where the +record belongs to. Hudi has imposed this constraint so that updates and deletes can be applied to the record of interest. +Hudi relies on the partition path field to partition your dataset and records within a partition have unique record keys. +Since uniqueness is guaranteed only within the partition, there could be records with same record keys across different +partitions. One should choose the partition field wisely as it could be a determining factor for your ingestion and +query latency.</p> + +<h2 id="key-generators">Key Generators</h2> + +<p>Hudi exposes a number of out of the box key generators that customers can use based on their need. Or can have their +own implementation for the KeyGenerator. 
This blog goes over all different types of key generators that are readily +available to use.</p> + +<p><a href="https://github.com/apache/hudi/blob/master/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/KeyGenerator.java">Here</a> +is the interface for KeyGenerator in Hudi for your reference.</p> + +<p>Before diving into different types of key generators, let’s go over some of the common configs required to be set for +key generators.</p> + +<table> + <thead> + <tr> + <th>Config</th> + <th style="text-align: center">Meaning/purpose</th> + </tr> + </thead> + <tbody> + <tr> + <td><code class="highlighter-rouge">hoodie.datasource.write.recordkey.field</code></td> + <td style="text-align: center">Refers to record key field. This is a mandatory field.</td> + </tr> + <tr> + <td><code class="highlighter-rouge">hoodie.datasource.write.partitionpath.field</code></td> + <td style="text-align: center">Refers to partition path field. This is a mandatory field.</td> + </tr> + <tr> + <td><code class="highlighter-rouge">hoodie.datasource.write.keygenerator.class</code></td> + <td style="text-align: center">Refers to Key generator class(including full path). Could refer to any of the available ones or user defined one. This is a mandatory field.</td> + </tr> + <tr> + <td><code class="highlighter-rouge">hoodie.datasource.write.partitionpath.urlencode</code></td> + <td style="text-align: center">When set to true, partition path will be url encoded. Default value is false.</td> + </tr> + <tr> + <td><code class="highlighter-rouge">hoodie.datasource.write.hive_style_partitioning</code></td> + <td style="text-align: center">When set to true, uses hive style partitioning. Partition field name will be prefixed to the value. Format: “<partition_path_field_name>=<partition_path_value>”. 
Default value is false.</td> + </tr> + </tbody> +</table> + +<p>There are a few more configs involved if you are using TimestampBasedKeyGenerator. Those are covered in the respective section.</p> + +<p>Let’s go over the different key generators available to be used with Hudi.</p> + +<h3 id="simplekeygenerator"><a href="https://github.com/apache/hudi/blob/master/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/SimpleKeyGenerator.java">SimpleKeyGenerator</a></h3> + +<p>Record key refers to one field(column in dataframe) by name and partition path refers to one field (single column in dataframe) +by name. This is one of the most commonly used ones. Values are interpreted as is from dataframe and converted to string.</p> + +<h3 id="complexkeygenerator"><a href="https://github.com/apache/hudi/blob/master/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/ComplexKeyGenerator.java">ComplexKeyGenerator</a></h3> +<p>Both record key and partition paths comprise one or more than one field by name(combination of multiple fields). Fields +are expected to be comma separated in the config value. For example <code class="highlighter-rouge">"hoodie.datasource.write.recordkey.field" : "col1,col4"</code></p> + +<h3 id="globaldeletekeygenerator"><a href="https://github.com/apache/hudi/blob/master/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/GlobalDeleteKeyGenerator.java">GlobalDeleteKeyGenerator</a></h3> +<p>Global index deletes do not require partition value. So this key generator avoids using partition value for generating HoodieKey.</p> + +<h3 id="timestampbasedkeygenerator"><a href="https://github.com/apache/hudi/blob/master/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/TimestampBasedKeyGenerator.java">TimestampBasedKeyGenerator</a></h3> +<p>This key generator relies on timestamps for the partition field. 
The field values are interpreted as timestamps +and not just converted to string while generating partition path value for records. Record key is the same as before where it is chosen by +field name. Users are expected to set a few more configs to use this KeyGenerator.</p> + +<p>Configs to be set:</p> + +<table> + <thead> + <tr> + <th>Config</th> + <th>Meaning/purpose</th> + </tr> + </thead> + <tbody> + <tr> + <td><code class="highlighter-rouge">hoodie.deltastreamer.keygen.timebased.timestamp.type</code></td> + <td>One of the timestamp types supported(UNIX_TIMESTAMP, DATE_STRING, MIXED, EPOCHMILLISECONDS, SCALAR)</td> + </tr> + <tr> + <td><code class="highlighter-rouge">hoodie.deltastreamer.keygen.timebased.output.dateformat</code></td> + <td>Output date format</td> + </tr> + <tr> + <td><code class="highlighter-rouge">hoodie.deltastreamer.keygen.timebased.timezone</code></td> + <td>Timezone of the data format</td> + </tr> + <tr> + <td><code class="highlighter-rouge">hoodie.deltastreamer.keygen.timebased.input.dateformat</code></td> + <td>Input date format</td> + </tr> + </tbody> +</table> + +<p>Let’s go over some example values for TimestampBasedKeyGenerator.</p> + +<h4 id="timestamp-is-gmt">Timestamp is GMT</h4> + +<table> + <thead> + <tr> + <th>Config field</th> + <th>Value</th> + </tr> + </thead> + <tbody> + <tr> + <td><code class="highlighter-rouge">hoodie.deltastreamer.keygen.timebased.timestamp.type</code></td> + <td>“EPOCHMILLISECONDS”</td> + </tr> + <tr> + <td><code class="highlighter-rouge">hoodie.deltastreamer.keygen.timebased.output.dateformat</code></td> + <td>“yyyy-MM-dd hh”</td> + </tr> + <tr> + <td><code class="highlighter-rouge">hoodie.deltastreamer.keygen.timebased.timezone</code></td> + <td>“GMT+8:00”</td> + </tr> + </tbody> +</table> + +<p>Input Field value: “1578283932000L” <br /> +Partition path generated from key generator: “2020-01-06 12”</p> + +<p>If input field value is null for some rows. 
<br /> +Partition path generated from key generator: “1970-01-01 08”</p> + +<h4 id="timestamp-is-date_string">Timestamp is DATE_STRING</h4> + +<table> + <thead> + <tr> + <th>Config field</th> + <th>Value</th> + </tr> + </thead> + <tbody> + <tr> + <td><code class="highlighter-rouge">hoodie.deltastreamer.keygen.timebased.timestamp.type</code></td> + <td>“DATE_STRING”</td> + </tr> + <tr> + <td><code class="highlighter-rouge">hoodie.deltastreamer.keygen.timebased.output.dateformat</code></td> + <td>“yyyy-MM-dd hh”</td> + </tr> + <tr> + <td><code class="highlighter-rouge">hoodie.deltastreamer.keygen.timebased.timezone</code></td> + <td>“GMT+8:00”</td> + </tr> + <tr> + <td><code class="highlighter-rouge">hoodie.deltastreamer.keygen.timebased.input.dateformat</code></td> + <td>“yyyy-MM-dd hh:mm:ss”</td> + </tr> + </tbody> +</table> + +<p>Input field value: “2020-01-06 12:12:12” <br /> +Partition path generated from key generator: “2020-01-06 12”</p> + +<p>If input field value is null for some rows. <br /> +Partition path generated from key generator: “1970-01-01 12:00:00” +<br /></p> + +<h4 id="scalar-examples">Scalar examples</h4> + +<table> + <thead> + <tr> + <th>Config field</th> + <th>Value</th> + </tr> + </thead> + <tbody> + <tr> + <td><code class="highlighter-rouge">hoodie.deltastreamer.keygen.timebased.timestamp.type</code></td> + <td>“SCALAR”</td> + </tr> + <tr> + <td><code class="highlighter-rouge">hoodie.deltastreamer.keygen.timebased.output.dateformat</code></td> + <td>“yyyy-MM-dd hh”</td> + </tr> + <tr> + <td><code class="highlighter-rouge">hoodie.deltastreamer.keygen.timebased.timezone</code></td> + <td>“GMT”</td> + </tr> + <tr> + <td><code class="highlighter-rouge">hoodie.deltastreamer.keygen.timebased.timestamp.scalar.time.unit</code></td> + <td>“days”</td> + </tr> + </tbody> +</table> + +<p>Input field value: “20000L” <br /> +Partition path generated from key generator: “2024-10-04 12”</p> + +<p>If input field value is null. 
<br /> +Partition path generated from key generator: “1970-01-02 12”</p> + +<h4 id="iso8601withmsz-with-single-input-format">ISO8601WithMsZ with Single Input format</h4> + +<table> + <thead> + <tr> + <th>Config field</th> + <th>Value</th> + </tr> + </thead> + <tbody> + <tr> + <td><code class="highlighter-rouge">hoodie.deltastreamer.keygen.timebased.timestamp.type</code></td> + <td>“DATE_STRING”</td> + </tr> + <tr> + <td><code class="highlighter-rouge">hoodie.deltastreamer.keygen.timebased.input.dateformat</code></td> + <td>“yyyy-MM-dd’T’HH:mm:ss.SSSZ”</td> + </tr> + <tr> + <td><code class="highlighter-rouge">hoodie.deltastreamer.keygen.timebased.input.dateformat.list.delimiter.regex</code></td> + <td>””</td> + </tr> + <tr> + <td><code class="highlighter-rouge">hoodie.deltastreamer.keygen.timebased.input.timezone</code></td> + <td>””</td> + </tr> + <tr> + <td><code class="highlighter-rouge">hoodie.deltastreamer.keygen.timebased.output.dateformat</code></td> + <td>“yyyyMMddHH”</td> + </tr> + <tr> + <td><code class="highlighter-rouge">hoodie.deltastreamer.keygen.timebased.output.timezone</code></td> + <td>“GMT”</td> + </tr> + </tbody> +</table> + +<p>Input field value: “2020-04-01T13:01:33.428Z” <br /> +Partition path generated from key generator: “2020040113”</p> + +<h4 id="iso8601withmsz-with-multiple-input-formats">ISO8601WithMsZ with Multiple Input formats</h4> + +<table> + <thead> + <tr> + <th>Config field</th> + <th>Value</th> + </tr> + </thead> + <tbody> + <tr> + <td><code class="highlighter-rouge">hoodie.deltastreamer.keygen.timebased.timestamp.type</code></td> + <td>“DATE_STRING”</td> + </tr> + <tr> + <td><code class="highlighter-rouge">hoodie.deltastreamer.keygen.timebased.input.dateformat</code></td> + <td>“yyyy-MM-dd’T’HH:mm:ssZ,yyyy-MM-dd’T’HH:mm:ss.SSSZ”</td> + </tr> + <tr> + <td><code class="highlighter-rouge">hoodie.deltastreamer.keygen.timebased.input.dateformat.list.delimiter.regex</code></td> + <td>””</td> + </tr> + <tr> + <td><code 
class="highlighter-rouge">hoodie.deltastreamer.keygen.timebased.input.timezone</code></td> + <td>””</td> + </tr> + <tr> + <td><code class="highlighter-rouge">hoodie.deltastreamer.keygen.timebased.output.dateformat</code></td> + <td>“yyyyMMddHH”</td> + </tr> + <tr> + <td><code class="highlighter-rouge">hoodie.deltastreamer.keygen.timebased.output.timezone</code></td> + <td>“UTC”</td> + </tr> + </tbody> +</table> + +<p>Input field value: “2020-04-01T13:01:33.428Z” <br /> +Partition path generated from key generator: “2020040113”</p> + +<h4 id="iso8601noms-with-offset-using-multiple-input-formats">ISO8601NoMs with offset using multiple input formats</h4> + +<table> + <thead> + <tr> + <th>Config field</th> + <th>Value</th> + </tr> + </thead> + <tbody> + <tr> + <td><code class="highlighter-rouge">hoodie.deltastreamer.keygen.timebased.timestamp.type</code></td> + <td>“DATE_STRING”</td> + </tr> + <tr> + <td><code class="highlighter-rouge">hoodie.deltastreamer.keygen.timebased.input.dateformat</code></td> + <td>“yyyy-MM-dd’T’HH:mm:ssZ,yyyy-MM-dd’T’HH:mm:ss.SSSZ”</td> + </tr> + <tr> + <td><code class="highlighter-rouge">hoodie.deltastreamer.keygen.timebased.input.dateformat.list.delimiter.regex</code></td> + <td>””</td> + </tr> + <tr> + <td><code class="highlighter-rouge">hoodie.deltastreamer.keygen.timebased.input.timezone</code></td> + <td>””</td> + </tr> + <tr> + <td><code class="highlighter-rouge">hoodie.deltastreamer.keygen.timebased.output.dateformat</code></td> + <td>“yyyyMMddHH”</td> + </tr> + <tr> + <td><code class="highlighter-rouge">hoodie.deltastreamer.keygen.timebased.output.timezone</code></td> + <td>“UTC”</td> + </tr> + </tbody> +</table> + +<p>Input field value: “2020-04-01T13:01:33-<strong>05:00</strong>” <br /> +Partition path generated from key generator: “2020040118”</p> + +<h4 id="input-as-short-date-string-and-expect-date-in-date-format">Input as short date string and expect date in date format</h4> + +<table> + <thead> + <tr> + <th>Config field</th> + 
<th>Value</th> + </tr> + </thead> + <tbody> + <tr> + <td><code class="highlighter-rouge">hoodie.deltastreamer.keygen.timebased.timestamp.type</code></td> + <td>“DATE_STRING”</td> + </tr> + <tr> + <td><code class="highlighter-rouge">hoodie.deltastreamer.keygen.timebased.input.dateformat</code></td> + <td>“yyyy-MM-dd’T’HH:mm:ssZ,yyyy-MM-dd’T’HH:mm:ss.SSSZ,yyyyMMdd”</td> + </tr> + <tr> + <td><code class="highlighter-rouge">hoodie.deltastreamer.keygen.timebased.input.dateformat.list.delimiter.regex</code></td> + <td>””</td> + </tr> + <tr> + <td><code class="highlighter-rouge">hoodie.deltastreamer.keygen.timebased.input.timezone</code></td> + <td>“UTC”</td> + </tr> + <tr> + <td><code class="highlighter-rouge">hoodie.deltastreamer.keygen.timebased.output.dateformat</code></td> + <td>“MM/dd/yyyy”</td> + </tr> + <tr> + <td><code class="highlighter-rouge">hoodie.deltastreamer.keygen.timebased.output.timezone</code></td> + <td>“UTC”</td> + </tr> + </tbody> +</table> + +<p>Input field value: “20200401” <br /> +Partition path generated from key generator: “04/01/2020”</p> + +<h3 id="customkeygenerator"><a href="https://github.com/apache/hudi/blob/master/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/CustomKeyGenerator.java">CustomKeyGenerator</a></h3> +<p>This is a generic implementation of KeyGenerator where users are able to leverage the benefits of SimpleKeyGenerator, +ComplexKeyGenerator and TimestampBasedKeyGenerator all at the same time. One can configure record key and partition +paths as a single field or a combination of fields. This keyGenerator is particularly useful if you want to define +complex partition paths involving regular fields and timestamp based fields. It expects value for prop <code class="highlighter-rouge">"hoodie.datasource.write.partitionpath.field"</code> +in a specific format. 
The format should be “field1:PartitionKeyType1,field2:PartitionKeyType2…”</p> + +<p>The complete partition path is created as +<code class="highlighter-rouge"><value for field1 basis PartitionKeyType1>/<value for field2 basis PartitionKeyType2></code> +and so on. Each partition key type could either be SIMPLE or TIMESTAMP.</p> + +<p>Example config value: <code class="highlighter-rouge">“field_3:simple,field_5:timestamp”</code></p> + +<p>RecordKey config value is either a single field in case of SimpleKeyGenerator or a comma-separated list of field names if referring to ComplexKeyGenerator. +E.g.: “col1” or “col3,col4”.</p> + +<h3 id="nonpartitionedkeygenerator"><a href="https://github.com/apache/hudi/blob/master/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/NonpartitionedKeyGenerator.java">NonpartitionedKeyGenerator</a></h3> +<p>If your hudi dataset is not partitioned, you could use this “NonpartitionedKeyGenerator” which will return an empty +partition for all records. In other words, all records go to the same partition (which is empty “”).</p> + +<p>Hope this blog gave you a good understanding of different types of Key Generators available in Apache Hudi. 
Thanks for your continued support for Hudi’s community.</p> + + + </section> + + <a href="#masthead__inner-wrap" class="back-to-top">Back to top ↑</a> + + + + + </div> + + </article> + +</div> + + </div> + + <div class="page__footer"> + <footer> + +<div class="row"> + <div class="col-lg-12 footer"> + <p> + <table class="table-apache-info"> + <tr> + <td> + <a class="footer-link-img" href="https://apache.org"> + <img width="250px" src="/assets/images/asf_logo.svg" alt="The Apache Software Foundation"> + </a> + </td> + <td> + <a style="float: right" href="https://www.apache.org/events/current-event.html"> + <img src="https://www.apache.org/events/current-event-234x60.png" /> + </a> + </td> + </tr> + </table> + </p> + <p> + <a href="https://www.apache.org/licenses/">License</a> | <a href="https://www.apache.org/security/">Security</a> | <a href="https://www.apache.org/foundation/thanks.html">Thanks</a> | <a href="https://www.apache.org/foundation/sponsorship.html">Sponsorship</a> + </p> + <p> + Copyright © <span id="copyright-year">2019</span> <a href="https://apache.org">The Apache Software Foundation</a>, Licensed under the <a href="https://www.apache.org/licenses/LICENSE-2.0"> Apache License, Version 2.0</a>. + Hudi, Apache and the Apache feather logo are trademarks of The Apache Software Foundation. <a href="/docs/privacy">Privacy Policy</a> + </p> + </div> +</div> + </footer> + </div> + + + </body> +</html> \ No newline at end of file diff --git a/content/cn/activity.html b/content/cn/activity.html index eb734ec..e096c4a 100644 --- a/content/cn/activity.html +++ b/content/cn/activity.html @@ -191,6 +191,30 @@ <h2 class="archive__item-title" itemprop="headline"> + <a href="/blog/hudi-key-generators/" rel="permalink">Apache Hudi Key Generators +</a> + + </h2> + <!-- Look the author details up from the site config. --> + + <!-- Output author details if some exist. 
--> + + + <p class="archive__item-excerpt" itemprop="description">Different key generators available with Apache Hudi +</p> + </article> +</div> + + + + + + +<div class="list__item"> + <article class="archive__item" itemscope itemtype="https://schema.org/CreativeWork"> + + <h2 class="archive__item-title" itemprop="headline"> + <a href="/blog/hudi-clustering-intro/" rel="permalink">Optimize Data lake layout using Clustering in Apache Hudi </a> diff --git a/content/sitemap.xml b/content/sitemap.xml index a6de7ea..5d19956 100644 --- a/content/sitemap.xml +++ b/content/sitemap.xml @@ -1153,6 +1153,10 @@ <lastmod>2021-01-27T00:00:00-05:00</lastmod> </url> <url> +<loc>https://hudi.apache.org/blog/hudi-key-generators/</loc> +<lastmod>2021-02-13T00:00:00-05:00</lastmod> +</url> +<url> <loc>https://hudi.apache.org/cn/activity</loc> <lastmod>2019-12-30T14:59:57-05:00</lastmod> </url>