This is an automated email from the ASF dual-hosted git repository. vinoth pushed a commit to branch asf-site in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/asf-site by this push: new 26ef788 Travis CI build asf-site 26ef788 is described below commit 26ef788715ac2a7024413c50b8a043695282478c Author: CI <ci...@hudi.apache.org> AuthorDate: Tue Mar 23 22:41:07 2021 +0000 Travis CI build asf-site --- content/assets/js/lunr/lunr-store.js | 5 + content/docs/0.5.3-cloud.html | 11 + content/docs/azure_hoodie.html | 11 + content/docs/cloud.html | 11 + content/docs/comparison.html | 11 + content/docs/concurrency_control.html | 568 ++++++++++++++++++++++++++++++ content/docs/configurations.html | 73 ++++ content/docs/cos_hoodie.html | 11 + content/docs/deployment.html | 11 + content/docs/docker_demo.html | 11 + content/docs/docs-versions.html | 11 + content/docs/flink-quick-start-guide.html | 11 + content/docs/gcs_hoodie.html | 11 + content/docs/ibm_cos_hoodie.html | 11 + content/docs/metrics.html | 11 + content/docs/migration_guide.html | 11 + content/docs/oss_hoodie.html | 11 + content/docs/overview.html | 11 + content/docs/performance.html | 11 + content/docs/powered_by.html | 11 + content/docs/privacy.html | 11 + content/docs/querying_data.html | 11 + content/docs/s3_hoodie.html | 11 + content/docs/spark_quick-start-guide.html | 11 + content/docs/structure.html | 11 + content/docs/use_cases.html | 11 + content/docs/writing_data.html | 11 + content/sitemap.xml | 4 + 28 files changed, 914 insertions(+) diff --git a/content/assets/js/lunr/lunr-store.js b/content/assets/js/lunr/lunr-store.js index f13c4df..4fa69d9 100644 --- a/content/assets/js/lunr/lunr-store.js +++ b/content/assets/js/lunr/lunr-store.js @@ -1324,6 +1324,11 @@ var store = [{ "tags": [], "url": "https://hudi.apache.org/docs/metrics.html", "teaser":"https://hudi.apache.org/assets/images/500x300.png"},{ + "title": "Concurrent Writes to Hudi Tables", + "excerpt":"In this section, we will cover Hudi’s concurrency model and describe ways to ingest data into a Hudi Table from multiple writers; using the DeltaStreamer tool 
as well as using the Hudi datasource. Supported Concurrency Controls MVCC : Hudi table services such as compaction, cleaning, clustering leverage Multi Version Concurrency...","categories": [], + "tags": [], + "url": "https://hudi.apache.org/docs/concurrency_control.html", + "teaser":"https://hudi.apache.org/assets/images/500x300.png"},{ "title": "Privacy Policy", "excerpt":"Information about your use of this website is collected using server access logs and a tracking cookie. The collected information consists of the following: The IP address from which you access the website; The type of browser and operating system you use to access our site; The date and time...","categories": [], "tags": [], diff --git a/content/docs/0.5.3-cloud.html b/content/docs/0.5.3-cloud.html index b045e52..405c630 100644 --- a/content/docs/0.5.3-cloud.html +++ b/content/docs/0.5.3-cloud.html @@ -208,6 +208,17 @@ + <li><a href="/docs/concurrency_control.html" class="">Concurrency Control</a></li> + + + + + + + + + + <li><a href="/docs/querying_data.html" class="">Querying Data</a></li> diff --git a/content/docs/azure_hoodie.html b/content/docs/azure_hoodie.html index a781bde..4427c6e 100644 --- a/content/docs/azure_hoodie.html +++ b/content/docs/azure_hoodie.html @@ -208,6 +208,17 @@ + <li><a href="/docs/concurrency_control.html" class="">Concurrency Control</a></li> + + + + + + + + + + <li><a href="/docs/querying_data.html" class="">Querying Data</a></li> diff --git a/content/docs/cloud.html b/content/docs/cloud.html index 99a378f..2a5b12a 100644 --- a/content/docs/cloud.html +++ b/content/docs/cloud.html @@ -208,6 +208,17 @@ + <li><a href="/docs/concurrency_control.html" class="">Concurrency Control</a></li> + + + + + + + + + + <li><a href="/docs/querying_data.html" class="">Querying Data</a></li> diff --git a/content/docs/comparison.html b/content/docs/comparison.html index 7dcc624..99651ea 100644 --- a/content/docs/comparison.html +++ b/content/docs/comparison.html @@ -208,6 +208,17 
@@ + <li><a href="/docs/concurrency_control.html" class="">Concurrency Control</a></li> + + + + + + + + + + <li><a href="/docs/querying_data.html" class="">Querying Data</a></li> diff --git a/content/docs/concurrency_control.html b/content/docs/concurrency_control.html new file mode 100644 index 0000000..af97c0f --- /dev/null +++ b/content/docs/concurrency_control.html @@ -0,0 +1,568 @@ +<!doctype html> +<html lang="en" class="no-js"> + <head> + <meta charset="utf-8"> + +<!-- begin _includes/seo.html --><title>Concurrent Writes to Hudi Tables - Apache Hudi</title> +<meta name="description" content="In this section, we will cover Hudi’s concurrency model and describe ways to ingest data into a Hudi Table from multiple writers; using the DeltaStreamer tool as well as using the Hudi datasource."> + +<meta property="og:type" content="article"> +<meta property="og:locale" content="en_US"> +<meta property="og:site_name" content=""> +<meta property="og:title" content="Concurrent Writes to Hudi Tables"> +<meta property="og:url" content="https://hudi.apache.org/docs/concurrency_control.html"> + + + <meta property="og:description" content="In this section, we will cover Hudi’s concurrency model and describe ways to ingest data into a Hudi Table from multiple writers; using the DeltaStreamer tool as well as using the Hudi datasource."> + + + + + + <meta property="article:modified_time" content="2021-03-19T15:59:57-04:00"> + + + + + + + +<!-- end _includes/seo.html --> + + +<!--<link href="/feed.xml" type="application/atom+xml" rel="alternate" title=" Feed">--> + +<!-- https://t.co/dKP3o1e --> +<meta name="viewport" content="width=device-width, initial-scale=1.0"> + +<script> + document.documentElement.className = document.documentElement.className.replace(/\bno-js\b/g, '') + ' js '; +</script> + +<!-- For all browsers --> +<link rel="stylesheet" href="/assets/css/main.css"> + +<!--[if IE]> + <style> + /* old IE unsupported flexbox fixes */ + .greedy-nav .site-title { + 
padding-right: 3em; + } + .greedy-nav button { + position: absolute; + top: 0; + right: 0; + height: 100%; + } + </style> +<![endif]--> + + + +<link rel="icon" type="image/x-icon" href="/assets/images/favicon.ico"> +<link rel="stylesheet" href="/assets/css/font-awesome.min.css"> +<script src="/assets/js/jquery.min.js"></script> + + +<script src="/assets/js/main.min.js"></script> + + </head> + + <body class="layout--single"> + <!--[if lt IE 9]> +<div class="notice--danger align-center" style="margin: 0;">You are using an <strong>outdated</strong> browser. Please <a href="https://browsehappy.com/">upgrade your browser</a> to improve your experience.</div> +<![endif]--> + + <div class="masthead"> + <div class="masthead__inner-wrap" id="masthead__inner-wrap"> + <div class="masthead__menu"> + <nav id="site-nav" class="greedy-nav"> + + <a class="site-logo" href="/"> + <div style="width: 150px; height: 40px"> + </div> + </a> + + <a class="site-title" href="/"> + + </a> + <ul class="visible-links"><li class="masthead__menu-item"> + <a href="/docs/spark_quick-start-guide.html" target="_self" >Documentation</a> + </li><li class="masthead__menu-item"> + <a href="/community.html" target="_self" >Community</a> + </li><li class="masthead__menu-item"> + <a href="/blog.html" target="_self" >Blog</a> + </li><li class="masthead__menu-item"> + <a href="https://cwiki.apache.org/confluence/display/HUDI/FAQ" target="_blank" >FAQ</a> + </li><li class="masthead__menu-item"> + <a href="/docs/powered_by.html" target="_self" >Powered By</a> + </li><li class="masthead__menu-item"> + <a href="/releases.html" target="_self" >Releases</a> + </li></ul> + <button class="greedy-nav__toggle hidden" type="button"> + <span class="visually-hidden">Toggle menu</span> + <div class="navicon"></div> + </button> + <ul class="hidden-links hidden"></ul> + </nav> + </div> + </div> +</div> +<!-- +<p class="notice--warning" style="margin: 0 !important; text-align: center !important;"><strong>Note:</strong> This 
site is work in progress, if you notice any issues, please <a target="_blank" href="https://github.com/apache/hudi/issues">Report on Issue</a>. + Click <a href="/"> here</a> back to old site.</p> +--> + + <div class="initial-content"> + <div id="main" role="main"> + + + <div class="sidebar sticky"> + + + + + + + + + + + + + + +<nav class="nav__list"> + + <input id="ac-toc" name="accordion-toc" type="checkbox" /> + <label for="ac-toc">Toggle Menu</label> + <ul class="nav__items"> + + <li> + + <span class="nav__sub-title">Documentation</span> + + + + <ul> + + + + + + + + + <li><a href="/docs/overview.html" class="">Overview</a></li> + + + + + + + + + + + <li><a href="/docs/spark_quick-start-guide.html" class="">Quick Start(Spark)</a></li> + + + + + + + + + + + <li><a href="/docs/flink-quick-start-guide.html" class="">Quick Start(Flink)</a></li> + + + + + + + + + + + <li><a href="/docs/use_cases.html" class="">Use Cases</a></li> + + + + + + + + + + + <li><a href="/docs/writing_data.html" class="">Writing Data</a></li> + + + + + + + + + + + <li><a href="/docs/concurrency_control.html" class="active">Concurrency Control</a></li> + + + + + + + + + + + <li><a href="/docs/querying_data.html" class="">Querying Data</a></li> + + + + + + + + + + + <li><a href="/docs/configurations.html" class="">Configuration</a></li> + + + + + + + + + + + <li><a href="/docs/performance.html" class="">Performance</a></li> + + + + + + + + + + + <li><a href="/docs/deployment.html" class="">Deployment</a></li> + + + + </ul> + + </li> + + <li> + + <span class="nav__sub-title">Resources</span> + + + + <ul> + + + + + + + + + <li><a href="/docs/docker_demo.html" class="">Dockerized Demo</a></li> + + + + + + + + + + + <li><a href="/docs/cloud.html" class="">Storage Configuration</a></li> + + + + + + + + + + + <li><a href="/docs/metrics.html" class="">Metrics</a></li> + + + + + + + + + + + <li><a href="/docs/docs-versions.html" class="">Docs Versions</a></li> + + + + + + + + + + + <li><a 
href="/docs/privacy.html" class="">Privacy Policy</a></li> + + + + </ul> + + </li> + + </ul> +</nav> + + + + + </div> + + + <article class="page" itemscope itemtype="https://schema.org/CreativeWork"> + <!-- Look the author details up from the site config. --> + + + <div class="page__inner-wrap"> + + <header> + <h1 id="page-title" class="page__title" itemprop="headline">Concurrent Writes to Hudi Tables +</h1> + <!-- Output author details if some exist. --> + + </header> + + + <section class="page__content" itemprop="text"> + + <aside class="sidebar__right sticky"> + <nav class="toc"> + <header><h4 class="nav__title"><i class="fas fa-file-alt"></i> IN THIS PAGE</h4></header> + <ul class="toc__menu"> + <li><a href="#supported-concurrency-controls">Supported Concurrency Controls</a></li> + <li><a href="#single-writer-guarantees">Single Writer Guarantees</a></li> + <li><a href="#multi-writer-guarantees">Multi Writer Guarantees</a></li> + <li><a href="#enabling-multi-writing">Enabling Multi Writing</a></li> + <li><a href="#datasource-writer">Datasource Writer</a></li> + <li><a href="#deltastreamer">DeltaStreamer</a></li> + <li><a href="#best-practices-when-using-optimistic-concurrency-control">Best Practices when using Optimistic Concurrency Control</a></li> + <li><a href="#disabling-multi-writing">Disabling Multi Writing</a></li> +</ul> + </nav> + </aside> + + <p>In this section, we will cover Hudi’s concurrency model and describe ways to ingest data into a Hudi Table from multiple writers; using the <a href="#deltastreamer">DeltaStreamer</a> tool as well as +using the <a href="#datasource-writer">Hudi datasource</a>.</p> + +<h2 id="supported-concurrency-controls">Supported Concurrency Controls</h2> + +<ul> + <li> + <p><strong>MVCC</strong> : Hudi table services such as compaction, cleaning, clustering leverage Multi Version Concurrency Control to provide snapshot isolation +between multiple table service writers and readers. 
Additionally, using MVCC, Hudi provides snapshot isolation between an ingestion writer and multiple concurrent readers. +With this model, Hudi supports running any number of table service jobs concurrently, without any concurrency conflict. +This is made possible by ensuring that the scheduling of plans for such table services always happens in a single writer mode, which ensures no conflicts and avoids race conditions.</p> + </li> + <li> + <p><strong>[NEW] OPTIMISTIC CONCURRENCY</strong> : Write operations such as the ones described above (UPSERT, INSERT, etc.) leverage optimistic concurrency control to enable multiple ingestion writers to +the same Hudi Table. Hudi supports <code class="highlighter-rouge">file level OCC</code>, i.e., for any 2 commits (or writers) happening to the same table, if they do not have writes to overlapping files being changed, both writers are allowed to succeed. +This feature is currently <em>experimental</em> and requires either Zookeeper or HiveMetastore to acquire locks.</p> + </li> +</ul> + +<p>It may be helpful to understand the different guarantees provided by <a href="/docs/writing_data.html#write-operations">write operations</a> via Hudi datasource or the delta streamer.</p> + +<h2 id="single-writer-guarantees">Single Writer Guarantees</h2> + +<ul> + <li><em>UPSERT Guarantee</em>: The target table will NEVER show duplicates.</li> + <li><em>INSERT Guarantee</em>: The target table will NEVER have duplicates if <a href="/docs/configurations.html#INSERT_DROP_DUPS_OPT_KEY">dedup</a> is enabled.</li> + <li><em>BULK_INSERT Guarantee</em>: The target table will NEVER have duplicates if <a href="/docs/configurations.html#INSERT_DROP_DUPS_OPT_KEY">dedup</a> is enabled.</li> + <li><em>INCREMENTAL PULL Guarantee</em>: Data consumption and checkpoints are NEVER out of order.</li> +</ul> + +<h2 id="multi-writer-guarantees">Multi Writer Guarantees</h2> + +<p>With multiple writers using OCC, some of the above guarantees change as follows:</p> + +<ul> + 
<li><em>UPSERT Guarantee</em>: The target table will NEVER show duplicates.</li> + <li><em>INSERT Guarantee</em>: The target table MIGHT have duplicates even if <a href="/docs/configurations.html#INSERT_DROP_DUPS_OPT_KEY">dedup</a> is enabled.</li> + <li><em>BULK_INSERT Guarantee</em>: The target table MIGHT have duplicates even if <a href="/docs/configurations.html#INSERT_DROP_DUPS_OPT_KEY">dedup</a> is enabled.</li> + <li><em>INCREMENTAL PULL Guarantee</em>: Data consumption and checkpoints MIGHT be out of order due to multiple writer jobs finishing at different times.</li> +</ul> + +<h2 id="enabling-multi-writing">Enabling Multi Writing</h2> + +<p>The following properties are needed to be set properly to turn on optimistic concurrency control.</p> + +<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>hoodie.write.concurrency.mode=optimistic_concurrency_control +hoodie.failed.writes.cleaner.policy=LAZY +hoodie.writer.lock.provider=<lock-provider-classname> +</code></pre></div></div> + +<p>There are 2 different server based lock providers that require different configuration to be set.</p> + +<p><strong><code class="highlighter-rouge">Zookeeper</code></strong> based lock provider</p> + +<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>hoodie.writer.lock.provider=org.apache.hudi.client.transaction.lock.ZookeeperBasedLockProvider +hoodie.writer.lock.zookeeper.url +hoodie.writer.lock.zookeeper.port +hoodie.writer.lock.wait_time_ms +hoodie.writer.lock.num_retries +hoodie.writer.lock.lock_key +hoodie.writer.lock.zookeeper.zk_base_path +</code></pre></div></div> + +<p><strong><code class="highlighter-rouge">HiveMetastore</code></strong> based lock provider</p> + +<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>hoodie.writer.lock.provider=org.apache.hudi.hive.HiveMetastoreBasedLockProvider +hoodie.writer.lock.hivemetastore.database +hoodie.writer.lock.hivemetastore.table 
+hoodie.writer.lock.wait_time_ms +hoodie.writer.lock.num_retries +</code></pre></div></div> + +<p><code class="highlighter-rouge">The HiveMetastore URI's are picked up from the hadoop configuration file loaded during runtime.</code></p> + +<h2 id="datasource-writer">Datasource Writer</h2> + +<p>The <code class="highlighter-rouge">hudi-spark</code> module offers the DataSource API to write (and read) a Spark DataFrame into a Hudi table.</p> + +<p>Following is an example of how to use optimistic_concurrency_control via spark datasource</p> + +<div class="language-java highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">inputDF</span><span class="o">.</span><span class="na">write</span><span class="o">.</span><span class="na">format</span><span class="o">(</span><span class="s">"hudi"</span><span class="o">)</span> + <span class="o">.</span><span class="na">options</span><span class="o">(</span><span class="n">getQuickstartWriteConfigs</span><span class="o">)</span> + <span class="o">.</span><span class="na">option</span><span class="o">(</span><span class="no">PRECOMBINE_FIELD_OPT_KEY</span><span class="o">,</span> <span class="s">"ts"</span><span class="o">)</span> + <span class="o">.</span><span class="na">option</span><span class="o">(</span><span class="s">"hoodie.failed.writes.cleaner.policy"</span><span class="o">,</span> <span class="s">"LAZY"</span><span class="o">)</span> + <span class="o">.</span><span class="na">option</span><span class="o">(</span><span class="s">"hoodie.write.concurrency.mode"</span><span class="o">,</span> <span class="s">"optimistic_concurrency_control"</span><span class="o">)</span> + <span class="o">.</span><span class="na">option</span><span class="o">(</span><span class="s">"hoodie.writer.lock.zookeeper.url"</span><span class="o">,</span> <span class="s">"zookeeper"</span><span class="o">)</span> + <span class="o">.</span><span class="na">option</span><span class="o">(</span><span 
class="s">"hoodie.writer.lock.zookeeper.port"</span><span class="o">,</span> <span class="s">"2181"</span><span class="o">)</span> + <span class="o">.</span><span class="na">option</span><span class="o">(</span><span class="s">"hoodie.writer.lock.wait_time_ms"</span><span class="o">,</span> <span class="s">"12000"</span><span class="o">)</span> + <span class="o">.</span><span class="na">option</span><span class="o">(</span><span class="s">"hoodie.writer.lock.num_retries"</span><span class="o">,</span> <span class="s">"2"</span><span class="o">)</span> + <span class="o">.</span><span class="na">option</span><span class="o">(</span><span class="s">"hoodie.writer.lock.lock_key"</span><span class="o">,</span> <span class="s">"test_table"</span><span class="o">)</span> + <span class="o">.</span><span class="na">option</span><span class="o">(</span><span class="s">"hoodie.writer.lock.zookeeper.zk_base_path"</span><span class="o">,</span> <span class="s">"/test"</span><span class="o">)</span> + <span class="o">.</span><span class="na">option</span><span class="o">(</span><span class="no">RECORDKEY_FIELD_OPT_KEY</span><span class="o">,</span> <span class="s">"uuid"</span><span class="o">)</span> + <span class="o">.</span><span class="na">option</span><span class="o">(</span><span class="no">PARTITIONPATH_FIELD_OPT_KEY</span><span class="o">,</span> <span class="s">"partitionpath"</span><span class="o">)</span> + <span class="o">.</span><span class="na">option</span><span class="o">(</span><span class="no">TABLE_NAME</span><span class="o">,</span> <span class="n">tableName</span><span class="o">)</span> + <span class="o">.</span><span class="na">mode</span><span class="o">(</span><span class="nc">Overwrite</span><span class="o">)</span> + <span class="o">.</span><span class="na">save</span><span class="o">(</span><span class="n">basePath</span><span class="o">)</span> +</code></pre></div></div> + +<h2 id="deltastreamer">DeltaStreamer</h2> + +<p>The <code 
class="highlighter-rouge">HoodieDeltaStreamer</code> utility (part of hudi-utilities-bundle) provides ways to ingest from different sources such as DFS or Kafka, with the following capabilities.</p> + +<p>Using optimistic_concurrency_control via delta streamer requires adding the above configs to the properties file that can be passed to the +job. For example below, adding the configs to kafka-source.properties file and passing them to deltastreamer will enable optimistic concurrency. +A deltastreamer job can then be triggered as follows:</p> + +<div class="language-java highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="o">[</span><span class="n">hoodie</span><span class="o">]</span><span class="err">$</span> <span class="n">spark</span><span class="o">-</span><span class="n">submit</span> <span class="o">--</span><span class="kd">class</span> <span class="nc">org</span><span class="o">.</span><span class="na">apache</span><span class="o">.</span><span class="na">hudi</span><span class="o">.</sp [...] 
+ <span class="o">--</span><span class="n">props</span> <span class="nl">file:</span><span class="c1">//${PWD}/hudi-utilities/src/test/resources/delta-streamer-config/kafka-source.properties \</span> + <span class="o">--</span><span class="n">schemaprovider</span><span class="o">-</span><span class="kd">class</span> <span class="nc">org</span><span class="o">.</span><span class="na">apache</span><span class="o">.</span><span class="na">hudi</span><span class="o">.</span><span class="na">utilities</span><span class="o">.</span><span class="na">schema</span><span class="o">.</span><span class="na">SchemaRegistryProvider</span> <span class="err">\</span> + <span class="o">--</span><span class="n">source</span><span class="o">-</span><span class="kd">class</span> <span class="nc">org</span><span class="o">.</span><span class="na">apache</span><span class="o">.</span><span class="na">hudi</span><span class="o">.</span><span class="na">utilities</span><span class="o">.</span><span class="na">sources</span><span class="o">.</span><span class="na">AvroKafkaSource</span> <span class="err">\</span> + <span class="o">--</span><span class="n">source</span><span class="o">-</span><span class="n">ordering</span><span class="o">-</span><span class="n">field</span> <span class="n">impresssiontime</span> <span class="err">\</span> + <span class="o">--</span><span class="n">target</span><span class="o">-</span><span class="n">base</span><span class="o">-</span><span class="n">path</span> <span class="nl">file:</span><span class="err">\</span><span class="o">/</span><span class="err">\</span><span class="o">/</span><span class="err">\</span><span class="o">/</span><span class="n">tmp</span><span class="o">/</span><span class="n">hudi</span><span class="o">-</span><span class="n">deltastreamer</span><span class="o">- [...] 
+ <span class="o">--</span><span class="n">target</span><span class="o">-</span><span class="n">table</span> <span class="n">uber</span><span class="o">.</span><span class="na">impressions</span> <span class="err">\</span> + <span class="o">--</span><span class="n">op</span> <span class="no">BULK_INSERT</span> +</code></pre></div></div> + +<h2 id="best-practices-when-using-optimistic-concurrency-control">Best Practices when using Optimistic Concurrency Control</h2> + +<p>Concurrent Writing to Hudi tables requires acquiring a lock with either Zookeeper or HiveMetastore. There are several reasons why you might want to configure retries to allow your application to acquire the lock:</p> +<ol> + <li>Network connectivity issues or excessive load on servers can increase the time taken for lock acquisition, resulting in timeouts.</li> + <li>Running a large number of concurrent jobs that are writing to the same hudi table can result in contention during lock acquisition, which can cause timeouts.</li> + <li>In some scenarios of conflict resolution, Hudi commit operations might take up to tens of seconds while the lock is being held. This can result in timeouts for other jobs waiting to acquire a lock.</li> +</ol> + +<p>Set the correct native lock provider client retries. NOTE that sometimes these settings are set on the server once and all clients inherit the same configs. Please check your settings before enabling optimistic concurrency.</p> + +<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>hoodie.writer.lock.wait_time_ms +hoodie.writer.lock.num_retries +</code></pre></div></div> + +<p>Set the correct hudi client retries for Zookeeper & HiveMetastore. This is useful in cases when native client retry settings cannot be changed. 
Please note that these retries will happen in addition to any native client retries that you may have set.</p> + +<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>hoodie.writer.lock.client.wait_time_ms +hoodie.writer.lock.client.num_retries +</code></pre></div></div> + +<p><em>Setting the right values for these depends on a case by case basis; some defaults have been provided for general cases.</em></p> + +<h2 id="disabling-multi-writing">Disabling Multi Writing</h2> + +<p>Remove the following settings that were used to enable multi-writer or override with default values.</p> + +<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>hoodie.write.concurrency.mode=single_writer +hoodie.failed.writes.cleaner.policy=EAGER +</code></pre></div></div> + + </section> + + <a href="#masthead__inner-wrap" class="back-to-top">Back to top ↑</a> + + + + + </div> + + </article> + +</div> + + </div> + + <div class="page__footer"> + <footer> + +<div class="row"> + <div class="col-lg-12 footer"> + <p> + <table class="table-apache-info"> + <tr> + <td> + <a class="footer-link-img" href="https://apache.org"> + <img width="250px" src="/assets/images/asf_logo.svg" alt="The Apache Software Foundation"> + </a> + </td> + <td> + <a style="float: right" href="https://www.apache.org/events/current-event.html"> + <img src="https://www.apache.org/events/current-event-234x60.png" /> + </a> + </td> + </tr> + </table> + </p> + <p> + <a href="https://www.apache.org/licenses/">License</a> | <a href="https://www.apache.org/security/">Security</a> | <a href="https://www.apache.org/foundation/thanks.html">Thanks</a> | <a href="https://www.apache.org/foundation/sponsorship.html">Sponsorship</a> + </p> + <p> + Copyright © <span id="copyright-year">2019</span> <a href="https://apache.org">The Apache Software Foundation</a>, Licensed under the <a href="https://www.apache.org/licenses/LICENSE-2.0"> Apache License, Version 2.0</a>. 
+ Hudi, Apache and the Apache feather logo are trademarks of The Apache Software Foundation. <a href="/docs/privacy">Privacy Policy</a> + </p> + </div> +</div> + </footer> + </div> + + + </body> +</html> \ No newline at end of file diff --git a/content/docs/configurations.html b/content/docs/configurations.html index e4f1f61..99bd6f3 100644 --- a/content/docs/configurations.html +++ b/content/docs/configurations.html @@ -208,6 +208,17 @@ + <li><a href="/docs/concurrency_control.html" class="">Concurrency Control</a></li> + + + + + + + + + + <li><a href="/docs/querying_data.html" class="">Querying Data</a></li> @@ -369,6 +380,7 @@ <li><a href="#metrics-configs">Metrics configs</a></li> <li><a href="#memory-configs">Memory configs</a></li> <li><a href="#write-commit-callback-configs">Write commit callback configs</a></li> + <li><a href="#locking-configs">Locking configs</a></li> </ul> </li> </ul> @@ -1365,6 +1377,67 @@ Each clustering operation can create multiple groups. Total amount of data proce <p>Property: <code class="highlighter-rouge">hoodie.write.commit.callback.kafka.retries</code> <br /> <span style="color:grey">Times to retry. 
3 by default</span></p> +<h3 id="locking-configs">Locking configs</h3> +<p>Configs that control locking mechanisms if <a href="#WriteConcurrencyMode">WriteConcurrencyMode=optimistic_concurrency_control</a> is enabled +<a href="#withLockConfig">withLockConfig</a> (HoodieLockConfig) <br /></p> + +<h4 id="withLockProvider">withLockProvider(lockProvider = org.apache.hudi.client.transaction.lock.ZookeeperBasedLockProvider)</h4> +<p>Property: <code class="highlighter-rouge">hoodie.writer.lock.provider</code> <br /> +<span style="color:grey">Lock provider class name, user can provide their own implementation of LockProvider which should be subclass of org.apache.hudi.common.lock.LockProvider</span></p> + +<h4 id="withZkQuorum">withZkQuorum(zkQuorum)</h4> +<p>Property: <code class="highlighter-rouge">hoodie.writer.lock.zookeeper.url</code> <br /> +<span style="color:grey">Set the list of comma separated servers to connect to</span></p> + +<h4 id="withZkBasePath">withZkBasePath(zkBasePath)</h4> +<p>Property: <code class="highlighter-rouge">hoodie.writer.lock.zookeeper.base_path</code> [Required] <br /> +<span style="color:grey">The base path on Zookeeper under which to create a ZNode to acquire the lock. This should be common for all jobs writing to the same table</span></p> + +<h4 id="withZkPort">withZkPort(zkPort)</h4> +<p>Property: <code class="highlighter-rouge">hoodie.writer.lock.zookeeper.port</code> [Required] <br /> +<span style="color:grey">The connection port to be used for Zookeeper</span></p> + +<h4 id="withZkLockKey">withZkLockKey(zkLockKey)</h4> +<p>Property: <code class="highlighter-rouge">hoodie.writer.lock.zookeeper.lock_key</code> [Required] <br /> +<span style="color:grey">Key name under base_path at which to create a ZNode and acquire lock. Final path on zk will look like base_path/lock_key. 
We recommend setting this to the table name</span></p> + +<h4 id="withZkConnectionTimeoutInMs">withZkConnectionTimeoutInMs(connectionTimeoutInMs = 15000)</h4> +<p>Property: <code class="highlighter-rouge">hoodie.writer.lock.zookeeper.connection_timeout_ms</code> <br /> +<span style="color:grey">How long to wait when connecting to ZooKeeper before considering the connection a failure</span></p> + +<h4 id="withZkSessionTimeoutInMs">withZkSessionTimeoutInMs(sessionTimeoutInMs = 60000)</h4> +<p>Property: <code class="highlighter-rouge">hoodie.writer.lock.zookeeper.session_timeout_ms</code> <br /> +<span style="color:grey">How long to wait after losing a connection to ZooKeeper before the session is expired</span></p> + +<h4 id="withNumRetries">withNumRetries(num_retries = 3)</h4> +<p>Property: <code class="highlighter-rouge">hoodie.writer.lock.num_retries</code> <br /> +<span style="color:grey">Maximum number of times to retry by lock provider client</span></p> + +<h4 id="withRetryWaitTimeInMillis">withRetryWaitTimeInMillis(retryWaitTimeInMillis = 5000)</h4> +<p>Property: <code class="highlighter-rouge">hoodie.writer.lock.wait_time_ms_between_retry</code> <br /> +<span style="color:grey">Initial amount of time to wait between retries by lock provider client</span></p> + +<h4 id="withHiveDatabaseName">withHiveDatabaseName(hiveDatabaseName)</h4> +<p>Property: <code class="highlighter-rouge">hoodie.writer.lock.hivemetastore.database</code> [Required] <br /> +<span style="color:grey">The Hive database to acquire lock against</span></p> + +<h4 id="withHiveTableName">withHiveTableName(hiveTableName)</h4> +<p>Property: <code class="highlighter-rouge">hoodie.writer.lock.hivemetastore.table</code> [Required] <br /> +<span style="color:grey">The Hive table under the hive database to acquire lock against</span></p> + +<h4 id="withClientNumRetries">withClientNumRetries(clientNumRetries = 0)</h4> +<p>Property: <code 
class="highlighter-rouge">hoodie.writer.lock.client.num_retries</code> <br /> +<span style="color:grey">Maximum number of times to retry to acquire lock additionally from the hudi client</span></p> + +<h4 id="withClientRetryWaitTimeInMillis">withClientRetryWaitTimeInMillis(clientRetryWaitTimeInMillis = 10000)</h4> +<p>Property: <code class="highlighter-rouge">hoodie.writer.lock.client.wait_time_ms_between_retry</code> <br /> +<span style="color:grey">Amount of time to wait between retries from the hudi client</span></p> + +<h4 id="withConflictResolutionStrategy">withConflictResolutionStrategy(conflictResolutionStrategy = org.apache.hudi.client.transaction.SimpleConcurrentFileWritesConflictResolutionStrategy)</h4> +<p>Property: <code class="highlighter-rouge">hoodie.writer.lock.conflict.resolution.strategy</code> <br /> +<span style="color:grey">Conflict resolution strategy class name; this should be a subclass of org.apache.hudi.client.transaction.ConflictResolutionStrategy</span></p> + + </section> <a href="#masthead__inner-wrap" class="back-to-top">Back to top ↑</a> diff --git a/content/docs/cos_hoodie.html b/content/docs/cos_hoodie.html index 2a9d9df..5f7977f 100644 --- a/content/docs/cos_hoodie.html +++ b/content/docs/cos_hoodie.html @@ -208,6 +208,17 @@ + <li><a href="/docs/concurrency_control.html" class="">Concurrency Control</a></li> + + + + + + + + + + <li><a href="/docs/querying_data.html" class="">Querying Data</a></li> diff --git a/content/docs/deployment.html b/content/docs/deployment.html index 21ef6d0..30d144b 100644 --- a/content/docs/deployment.html +++ b/content/docs/deployment.html @@ -208,6 +208,17 @@ + <li><a href="/docs/concurrency_control.html" class="">Concurrency Control</a></li> + + + + + + + + + + <li><a href="/docs/querying_data.html" class="">Querying Data</a></li> diff --git a/content/docs/docker_demo.html b/content/docs/docker_demo.html index f7baaf9..e24699f 100644 --- a/content/docs/docker_demo.html +++ b/content/docs/docker_demo.html @@ -208,6 +208,17 @@ + <li><a 
href="/docs/concurrency_control.html" class="">Concurrency Control</a></li> + + + + + + + + + + <li><a href="/docs/querying_data.html" class="">Querying Data</a></li> diff --git a/content/docs/docs-versions.html b/content/docs/docs-versions.html index 6d47eeb..6db93ed 100644 --- a/content/docs/docs-versions.html +++ b/content/docs/docs-versions.html @@ -208,6 +208,17 @@ + <li><a href="/docs/concurrency_control.html" class="">Concurrency Control</a></li> + + + + + + + + + + <li><a href="/docs/querying_data.html" class="">Querying Data</a></li> diff --git a/content/docs/flink-quick-start-guide.html b/content/docs/flink-quick-start-guide.html index 6e5d4ee..f4f48e7 100644 --- a/content/docs/flink-quick-start-guide.html +++ b/content/docs/flink-quick-start-guide.html @@ -208,6 +208,17 @@ + <li><a href="/docs/concurrency_control.html" class="">Concurrency Control</a></li> + + + + + + + + + + <li><a href="/docs/querying_data.html" class="">Querying Data</a></li> diff --git a/content/docs/gcs_hoodie.html b/content/docs/gcs_hoodie.html index 138cb1b..d71d866 100644 --- a/content/docs/gcs_hoodie.html +++ b/content/docs/gcs_hoodie.html @@ -208,6 +208,17 @@ + <li><a href="/docs/concurrency_control.html" class="">Concurrency Control</a></li> + + + + + + + + + + <li><a href="/docs/querying_data.html" class="">Querying Data</a></li> diff --git a/content/docs/ibm_cos_hoodie.html b/content/docs/ibm_cos_hoodie.html index 68c65ce..9307778 100644 --- a/content/docs/ibm_cos_hoodie.html +++ b/content/docs/ibm_cos_hoodie.html @@ -208,6 +208,17 @@ + <li><a href="/docs/concurrency_control.html" class="">Concurrency Control</a></li> + + + + + + + + + + <li><a href="/docs/querying_data.html" class="">Querying Data</a></li> diff --git a/content/docs/metrics.html b/content/docs/metrics.html index 9eb712b..65ca70e 100644 --- a/content/docs/metrics.html +++ b/content/docs/metrics.html @@ -208,6 +208,17 @@ + <li><a href="/docs/concurrency_control.html" class="">Concurrency Control</a></li> + + + 
+ + + + + + + <li><a href="/docs/querying_data.html" class="">Querying Data</a></li> diff --git a/content/docs/migration_guide.html b/content/docs/migration_guide.html index d671c72..3ef7f99 100644 --- a/content/docs/migration_guide.html +++ b/content/docs/migration_guide.html @@ -208,6 +208,17 @@ + <li><a href="/docs/concurrency_control.html" class="">Concurrency Control</a></li> + + + + + + + + + + <li><a href="/docs/querying_data.html" class="">Querying Data</a></li> diff --git a/content/docs/oss_hoodie.html b/content/docs/oss_hoodie.html index ec429f3..a1a643b 100644 --- a/content/docs/oss_hoodie.html +++ b/content/docs/oss_hoodie.html @@ -208,6 +208,17 @@ + <li><a href="/docs/concurrency_control.html" class="">Concurrency Control</a></li> + + + + + + + + + + <li><a href="/docs/querying_data.html" class="">Querying Data</a></li> diff --git a/content/docs/overview.html b/content/docs/overview.html index d6a78d2..04de878 100644 --- a/content/docs/overview.html +++ b/content/docs/overview.html @@ -208,6 +208,17 @@ + <li><a href="/docs/concurrency_control.html" class="">Concurrency Control</a></li> + + + + + + + + + + <li><a href="/docs/querying_data.html" class="">Querying Data</a></li> diff --git a/content/docs/performance.html b/content/docs/performance.html index b64c5c0..48b1183 100644 --- a/content/docs/performance.html +++ b/content/docs/performance.html @@ -208,6 +208,17 @@ + <li><a href="/docs/concurrency_control.html" class="">Concurrency Control</a></li> + + + + + + + + + + <li><a href="/docs/querying_data.html" class="">Querying Data</a></li> diff --git a/content/docs/powered_by.html b/content/docs/powered_by.html index 8761748..e604d51 100644 --- a/content/docs/powered_by.html +++ b/content/docs/powered_by.html @@ -208,6 +208,17 @@ + <li><a href="/docs/concurrency_control.html" class="">Concurrency Control</a></li> + + + + + + + + + + <li><a href="/docs/querying_data.html" class="">Querying Data</a></li> diff --git a/content/docs/privacy.html 
b/content/docs/privacy.html index 22e16b7..5b03c8e 100644 --- a/content/docs/privacy.html +++ b/content/docs/privacy.html @@ -208,6 +208,17 @@ + <li><a href="/docs/concurrency_control.html" class="">Concurrency Control</a></li> + + + + + + + + + + <li><a href="/docs/querying_data.html" class="">Querying Data</a></li> diff --git a/content/docs/querying_data.html b/content/docs/querying_data.html index fb0bfc9..18f5c59 100644 --- a/content/docs/querying_data.html +++ b/content/docs/querying_data.html @@ -208,6 +208,17 @@ + <li><a href="/docs/concurrency_control.html" class="">Concurrency Control</a></li> + + + + + + + + + + <li><a href="/docs/querying_data.html" class="active">Querying Data</a></li> diff --git a/content/docs/s3_hoodie.html b/content/docs/s3_hoodie.html index e00a6de..b86d9f3 100644 --- a/content/docs/s3_hoodie.html +++ b/content/docs/s3_hoodie.html @@ -208,6 +208,17 @@ + <li><a href="/docs/concurrency_control.html" class="">Concurrency Control</a></li> + + + + + + + + + + <li><a href="/docs/querying_data.html" class="">Querying Data</a></li> diff --git a/content/docs/spark_quick-start-guide.html b/content/docs/spark_quick-start-guide.html index 8c36992..ad47de5 100644 --- a/content/docs/spark_quick-start-guide.html +++ b/content/docs/spark_quick-start-guide.html @@ -208,6 +208,17 @@ + <li><a href="/docs/concurrency_control.html" class="">Concurrency Control</a></li> + + + + + + + + + + <li><a href="/docs/querying_data.html" class="">Querying Data</a></li> diff --git a/content/docs/structure.html b/content/docs/structure.html index 2adc401..061aa5e 100644 --- a/content/docs/structure.html +++ b/content/docs/structure.html @@ -208,6 +208,17 @@ + <li><a href="/docs/concurrency_control.html" class="">Concurrency Control</a></li> + + + + + + + + + + <li><a href="/docs/querying_data.html" class="">Querying Data</a></li> diff --git a/content/docs/use_cases.html b/content/docs/use_cases.html index d563210..bd823fb 100644 --- a/content/docs/use_cases.html +++ 
b/content/docs/use_cases.html @@ -208,6 +208,17 @@ + <li><a href="/docs/concurrency_control.html" class="">Concurrency Control</a></li> + + + + + + + + + + <li><a href="/docs/querying_data.html" class="">Querying Data</a></li> diff --git a/content/docs/writing_data.html b/content/docs/writing_data.html index 72e2a67..1fb6536 100644 --- a/content/docs/writing_data.html +++ b/content/docs/writing_data.html @@ -208,6 +208,17 @@ + <li><a href="/docs/concurrency_control.html" class="">Concurrency Control</a></li> + + + + + + + + + + <li><a href="/docs/querying_data.html" class="">Querying Data</a></li> diff --git a/content/sitemap.xml b/content/sitemap.xml index 229a39d..125ecc7 100644 --- a/content/sitemap.xml +++ b/content/sitemap.xml @@ -1061,6 +1061,10 @@ <lastmod>2020-06-20T15:59:57-04:00</lastmod> </url> <url> +<loc>https://hudi.apache.org/docs/concurrency_control.html</loc> +<lastmod>2021-03-19T15:59:57-04:00</lastmod> +</url> +<url> <loc>https://hudi.apache.org/cn/docs/privacy.html</loc> <lastmod>2019-12-30T14:59:57-05:00</lastmod> </url>