This is an automated email from the ASF dual-hosted git repository. yihua pushed a commit to branch asf-site in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/asf-site by this push: new c0a5b21a35d [HUDI-5833] Add 0.13.0 release notes (#8022) c0a5b21a35d is described below commit c0a5b21a35d4d1fb05b54c699565c0455e622dde Author: Y Ethan Guo <ethan.guoyi...@gmail.com> AuthorDate: Fri Feb 24 13:29:39 2023 -0800 [HUDI-5833] Add 0.13.0 release notes (#8022) Adds the 0.13.0 release notes and download links for 0.13.0. --- website/docusaurus.config.js | 6 +- website/releases/download.md | 4 + website/releases/older-releases.md | 2 +- website/releases/release-0.10.0.md | 2 +- website/releases/release-0.10.1.md | 2 +- website/releases/release-0.11.0.md | 2 +- website/releases/release-0.11.1.md | 2 +- website/releases/release-0.12.0.md | 2 +- website/releases/release-0.12.1.md | 2 +- website/releases/release-0.12.2.md | 2 +- website/releases/release-0.13.0.md | 506 +++++++++++++++++++++++++ website/releases/release-0.7.0.md | 2 +- website/releases/release-0.8.0.md | 2 +- website/releases/release-0.9.0.md | 2 +- website/src/components/HomepageHeader/index.js | 2 +- 15 files changed, 525 insertions(+), 15 deletions(-) diff --git a/website/docusaurus.config.js b/website/docusaurus.config.js index 94cd3d02bad..074e887a023 100644 --- a/website/docusaurus.config.js +++ b/website/docusaurus.config.js @@ -112,11 +112,11 @@ module.exports = { }, { from: ['/docs/releases', '/docs/next/releases'], - to: '/releases/release-0.12.2', + to: '/releases/release-0.13.0', }, { from: ['/releases'], - to: '/releases/release-0.12.2', + to: '/releases/release-0.13.0', }, ], }, @@ -281,7 +281,7 @@ module.exports = { }, { label: 'Releases', - to: '/releases/release-0.12.2', + to: '/releases/release-0.13.0', }, { label: 'Download', diff --git a/website/releases/download.md b/website/releases/download.md index e7ceb1d5c56..12b9f614d94 100644 --- a/website/releases/download.md +++ b/website/releases/download.md @@ -6,6 +6,10 @@ toc: true last_modified_at: 2022-12-27T15:59:57-04:00 --- +### Release 0.13.0 +* Source Release : [Apache Hudi 0.13.0 Source Release](https://www.apache.org/dyn/closer.lua/hudi/0.13.0/hudi-0.13.0.src.tgz) ([asc](https://downloads.apache.org/hudi/0.13.0/hudi-0.13.0.src.tgz.asc), [sha512](https://downloads.apache.org/hudi/0.13.0/hudi-0.13.0.src.tgz.sha512)) +* Release Note : ([Release Note for Apache Hudi 0.13.0](/releases/release-0.13.0)) + ### Release 0.12.2 * [Long Term Support](/releases/release-0.12.2#long-term-support): this is the latest stable release * Source Release : [Apache Hudi 0.12.2 Source Release](https://www.apache.org/dyn/closer.lua/hudi/0.12.2/hudi-0.12.2.src.tgz) ([asc](https://downloads.apache.org/hudi/0.12.2/hudi-0.12.2.src.tgz.asc), [sha512](https://downloads.apache.org/hudi/0.12.2/hudi-0.12.2.src.tgz.sha512)) diff --git a/website/releases/older-releases.md b/website/releases/older-releases.md index 3147cb7c645..9822d6e6510 100644 --- a/website/releases/older-releases.md +++ b/website/releases/older-releases.md @@ -1,6 +1,6 @@ --- title: "Older Releases" -sidebar_position: 12 +sidebar_position: 13 layout: releases toc: true last_modified_at: 2020-05-28T08:40:00-07:00 diff --git a/website/releases/release-0.10.0.md b/website/releases/release-0.10.0.md index 35223c5e526..45fbf4593f0 100644 --- a/website/releases/release-0.10.0.md +++ b/website/releases/release-0.10.0.md @@ -1,6 +1,6 @@ --- title: "Release 0.10.0" -sidebar_position: 8 +sidebar_position: 9 layout: releases toc: true last_modified_at: 2021-12-10T22:07:00+08:00 diff --git a/website/releases/release-0.10.1.md 
b/website/releases/release-0.10.1.md index be4bc237091..04e9f88f53f 100644 --- a/website/releases/release-0.10.1.md +++ b/website/releases/release-0.10.1.md @@ -1,6 +1,6 @@ --- title: "Release 0.10.1" -sidebar_position: 7 +sidebar_position: 8 layout: releases toc: true last_modified_at: 2022-01-27T22:07:00+08:00 diff --git a/website/releases/release-0.11.0.md b/website/releases/release-0.11.0.md index 312722f5846..7e7a4ef748b 100644 --- a/website/releases/release-0.11.0.md +++ b/website/releases/release-0.11.0.md @@ -1,6 +1,6 @@ --- title: "Release 0.11.0" -sidebar_position: 6 +sidebar_position: 7 layout: releases toc: true last_modified_at: 2022-01-27T22:07:00+08:00 diff --git a/website/releases/release-0.11.1.md b/website/releases/release-0.11.1.md index 95118e91c03..c0c16c05e2b 100644 --- a/website/releases/release-0.11.1.md +++ b/website/releases/release-0.11.1.md @@ -1,6 +1,6 @@ --- title: "Release 0.11.1" -sidebar_position: 5 +sidebar_position: 6 layout: releases toc: true last_modified_at: 2022-06-19T23:30:00-07:00 diff --git a/website/releases/release-0.12.0.md b/website/releases/release-0.12.0.md index 83b6671cbd8..3ff92d919b1 100644 --- a/website/releases/release-0.12.0.md +++ b/website/releases/release-0.12.0.md @@ -1,6 +1,6 @@ --- title: "Release 0.12.0" -sidebar_position: 4 +sidebar_position: 5 layout: releases toc: true last_modified_at: 2022-08-17T10:30:00+05:30 diff --git a/website/releases/release-0.12.1.md b/website/releases/release-0.12.1.md index dbd98f98ed9..dc716de15d1 100644 --- a/website/releases/release-0.12.1.md +++ b/website/releases/release-0.12.1.md @@ -1,6 +1,6 @@ --- title: "Release 0.12.1" -sidebar_position: 3 +sidebar_position: 4 layout: releases toc: true last_modified_at: 2022-08-17T10:30:00+05:30 diff --git a/website/releases/release-0.12.2.md b/website/releases/release-0.12.2.md index 3594206cda4..c414cca55b1 100644 --- a/website/releases/release-0.12.2.md +++ b/website/releases/release-0.12.2.md @@ -1,6 +1,6 @@ --- title: "Release 0.12.2" -sidebar_position: 2 +sidebar_position: 3 layout: releases toc: true last_modified_at: 2022-12-27T10:30:00+05:30 diff --git a/website/releases/release-0.13.0.md b/website/releases/release-0.13.0.md new file mode 100644 index 00000000000..c6b53013cd3 --- /dev/null +++ b/website/releases/release-0.13.0.md @@ -0,0 +1,506 @@ +--- +title: "Release 0.13.0" +sidebar_position: 2 +layout: releases +toc: true +last_modified_at: 2022-02-22T13:00:00-08:00 +--- +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# [Release 0.13.0](https://github.com/apache/hudi/releases/tag/release-0.13.0) ([docs](/docs/quick-start-guide)) + +Apache Hudi 0.13.0 release introduces a number of new features including [Metaserver](#metaserver), +[Change Data Capture](#change-data-capture), [new Record Merge API](#optimizing-record-payload-handling), +[new sources for Deltastreamer](#new-source-support-in-deltastreamer) and more. While there is no table version upgrade +required for this release, users are expected to take actions by following the [Migration Guide](#migration-guide-overview) +down below on relevant [breaking changes](#migration-guide-breaking-changes) and +[behavior changes](#migration-guide-behavior-changes) before using 0.13.0 release. + +## Migration Guide: Overview + +This release keeps the same table version (`5`) as [0.12.0 release](/releases/release-0.12.0), and there is no need for +a table version upgrade if you are upgrading from 0.12.0. 
There are a few [breaking changes](#migration-guide-breaking-changes) and [behavior changes](#migration-guide-behavior-changes) as described below, and users are expected to take action accordingly before using the 0.13.0 release.

:::caution
If migrating from an older release (pre 0.12.0), please also check the upgrade instructions from each older release in sequence.
:::

## Migration Guide: Breaking Changes

### Bundle Updates

#### Spark Bundle Support

From now on, [`hudi-spark3.2-bundle`](https://mvnrepository.com/artifact/org.apache.hudi/hudi-spark3.2-bundle) works with Apache Spark 3.2.1 and newer versions of Spark 3.2.x. The support for Spark 3.2.0 with [`hudi-spark3.2-bundle`](https://mvnrepository.com/artifact/org.apache.hudi/hudi-spark3.2-bundle) is dropped because of a change to the `getHive` method of Spark's `HiveClientImpl`, which is incompatible between Spark 3.2.0 and 3.2.1.

#### Utilities Bundle Change

The AWS and GCP bundle jars are separated from [`hudi-utilities-bundle`](https://mvnrepository.com/artifact/org.apache.hudi/hudi-utilities-bundle). The user would need to use [**`hudi-aws-bundle`**](https://mvnrepository.com/artifact/org.apache.hudi/hudi-aws-bundle) or [**`hudi-gcp-bundle`**](https://mvnrepository.com/artifact/org.apache.hudi/hudi-gcp-bundle) along with [`hudi-utilities-bundle`](https://mvnrepository.com/artifact/org.apache.hudi/hudi-utilities-bundle) when using these cloud services.

#### New Flink Bundle

Hudi is now supported on Flink 1.16.x with the new [`hudi-flink1.16-bundle`](https://mvnrepository.com/artifact/org.apache.hudi/hudi-flink1.16-bundle).

### Lazy File Index in Spark

Hudi's File Index in Spark now lists files lazily ***by default***: it **only** lists the partitions requested by the query (i.e., after partition pruning), as opposed to always listing the whole table as before this release. This is expected to bring considerable performance improvement for large tables.

A new configuration property, `hoodie.datasource.read.file.index.listing.mode` (now defaulting to **`lazy`**), is added in case the user wants to change the listing behavior. There are two possible values that you can set:

- **`eager`**: This lists all partition paths and the corresponding file slices within them eagerly, during initialization. This was the default behavior prior to 0.13.0.
  - If a Hudi table has 1000 partitions, the eager mode lists the files under all of them when constructing the file index.

- **`lazy`**: The partitions and the file slices within them are listed lazily, allowing partition pruning predicates to be pushed down appropriately, therefore only listing partitions after these have already been pruned.
  - The files are not listed under the partitions when the File Index is initialized. The files are listed only under the targeted partition(s) after partition pruning using predicates (e.g., `datestr=2023-02-19`) in queries.

:::tip
To preserve the behavior pre 0.13.0, the user needs to set `hoodie.datasource.read.file.index.listing.mode=eager`.
:::

:::danger Breaking Change
The **breaking change** occurs only in cases when the table has **BOTH**: multiple partition columns AND partition values containing slashes that are not URL-encoded.
:::

For example, let's assume we want to parse two partition columns, `month` (`2022/01`) and `day` (`03`), from the partition path `2022/01/03`.
Since there is a mismatch between the number of partition columns (2 here, `month` and `day`) and the number of components in the partition path delimited by `/` (3 in this case: `2022`, `01`, and `03`), there is ambiguity. In such cases, it is not possible to recover the partition value corresponding to each partition column.

There are two ways to **avoid** the breaking changes:

- The first option is to change how partition values are constructed. A user can switch the partition values of the `month` column to avoid slashes in any partition column value, such as `202201`; then there is no problem parsing the partition path (`202201/03`).

- The second option is to switch the listing mode to `eager`. The File Index would "gracefully regress" to assuming the table is non-partitioned and just sacrifice partition pruning, but would be able to process the query as if the table were non-partitioned (therefore potentially incurring a performance penalty), instead of failing the queries.

### Checkpoint Management in Spark Structured Streaming

If you are using [Spark streaming](https://spark.apache.org/docs/3.3.2/structured-streaming-programming-guide.html) to ingest into Hudi, Hudi self-manages the checkpoint internally. We are now adding support for multiple writers, each ingesting into the same Hudi table via streaming ingestion. In older versions of Hudi, you could not have multiple streaming ingestion writers ingesting into the same Hudi table (one streaming ingestion writer with a concurrent Spark datasource writer works with a lock provider; however, two Spark streaming ingestion writers are not supported). With 0.13.0, we are adding support for multiple streaming ingestion writers to the same table. In the case of a single streaming ingestion, users don't have to do anything; the old pipeline will work without needing any additional changes. But if you have multiple streaming writers to the same Hudi table, each writer has to set a unique value for the config `hoodie.datasource.write.streaming.checkpoint.identifier`. Also, users are expected to set the usual multi-writer configs. More details can be found [here](/docs/concurrency_control).

### ORC Support in Spark

The [ORC](https://orc.apache.org/) support for Spark 2.x is removed in this release, as the dependency on `orc-core:nohive` in Hudi is now replaced by `orc-core` to be compatible with Spark 3. [ORC](https://orc.apache.org/) support, which was broken in previous releases, is now available for Spark 3.x.

### Mandatory Record Key Field

The configuration for setting the record key field, `hoodie.datasource.write.recordkey.field`, is now required to be set and has no default value. Previously, the default value was `uuid`.

## Migration Guide: Behavior Changes

### Schema Handling in Write Path

Many users have requested using Hudi for CDC use cases where they want schema auto-evolution in which existing columns might be dropped in a new schema. As of the 0.13.0 release, Hudi now has this functionality: you can permit schema auto-evolution where existing columns are dropped in a new schema.

Since dropping columns in the target table based on the source schema constitutes a considerable behavior change, this is disabled by default and is guarded by the config `hoodie.datasource.write.schema.allow.auto.evolution.column.drop`. To enable automatic dropping of columns along with the new evolved schema of the incoming batch, set this to **`true`**.
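For illustration, here is a minimal sketch of a Spark writer opting into this behavior; the table name, base path, and toy DataFrame are hypothetical, and the Hudi Spark bundle is assumed to be on the classpath:

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("schema-drop-sketch").getOrCreate()
import spark.implicits._

// Incoming batch intentionally omits a column (e.g., "city") that exists in the target table.
val incomingBatch = Seq((1, "alice", 1695000000L)).toDF("id", "name", "ts")

incomingBatch.write
  .format("hudi")
  .option("hoodie.table.name", "customers")                        // hypothetical table name
  .option("hoodie.datasource.write.recordkey.field", "id")         // record key is mandatory in 0.13.0
  .option("hoodie.datasource.write.precombine.field", "ts")
  .option("hoodie.datasource.write.operation", "upsert")
  // Permit dropping target-table columns that are missing from the incoming schema.
  .option("hoodie.datasource.write.schema.allow.auto.evolution.column.drop", "true")
  .mode("append")
  .save("/tmp/hudi/customers")                                     // illustrative base path
```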
:::tip
This config is **NOT** required to evolve schema manually by using, for example, `ALTER TABLE … DROP COLUMN` in Spark.
:::

### Removal of Default Shuffle Parallelism

This release changes how Hudi decides the shuffle parallelism of [write operations](/docs/write_operations) including `INSERT`, `BULK_INSERT`, `UPSERT` and `DELETE` (**`hoodie.insert|bulkinsert|upsert|delete.shuffle.parallelism`**), which can ultimately affect the write performance.

Previously, if users did not configure it, Hudi would use `200` as the default shuffle parallelism. From 0.13.0 onwards, Hudi by default automatically deduces the shuffle parallelism, either by using the number of output RDD partitions as determined by Spark, when available, or by using the `spark.default.parallelism` value. If the above Hudi shuffle parallelisms are explicitly configured by the user, then the user-configured parallelism is still used in defining the actual parallelism. This behavior change improves the out-of-the-box performance by 20% for workloads with reasonably sized input.

:::caution
If the input data files are small, e.g., smaller than 10MB, we suggest configuring the Hudi shuffle parallelism (`hoodie.insert|bulkinsert|upsert|delete.shuffle.parallelism`) explicitly, such that the parallelism is at least total_input_data_size/500MB, to avoid potential performance regression (see the [Tuning Guide](/docs/tuning-guide) for more information).
:::

### Simple Write Executor as Default

For the execution of insert/upsert operations, Hudi historically used the notion of an executor, relying on an in-memory queue to decouple ingestion operations (that were previously often bound by I/O operations fetching shuffled blocks) from writing operations. Since then, Spark's architecture has evolved considerably, making such a writing architecture redundant. To evolve this writing pattern and leverage the changes in Spark, in 0.13.0 we introduce a new, simplified version of the executor, named (creatively) **`SimpleExecutor`**, and also make it the out-of-the-box default.

The **`SimpleExecutor`** does not have any internal buffering (i.e., it does not hold records in memory); it simply iterates over the provided iterator (similar to default Spark behavior). It provides **~10%** out-of-the-box performance improvement on modern Spark versions (3.x) and even more when used with Spark's native **`SparkRecordMerger`**.

### `NONE` Sort Mode for Bulk Insert to Match Parquet Writes

This release adjusts the parallelism for the `NONE` sort mode (the default sort mode) of the `BULK_INSERT` write operation. From now on, by default, the input parallelism is used instead of the shuffle parallelism (`hoodie.bulkinsert.shuffle.parallelism`) for writing data, to match the default Parquet write behavior. This does not change the behavior of clustering using the `NONE` sort mode.

This behavior change of the `BULK_INSERT` write operation improves the write performance out of the box.

:::tip
If you still observe small file issues with the default `NONE` sort mode, we suggest sorting the input data based on the partition path and record key before writing to the Hudi table. You can also use `GLOBAL_SORT` to ensure the best file size.
:::

### Meta Sync Failure in Deltastreamer

In earlier versions, we used a fail-fast approach where syncing to the remaining catalogs was not attempted if any [catalog sync](/docs/syncing_metastore) failed.
In 0.13.0, syncing to all configured catalogs is attempted before failing the operation on any catalog sync failure. In the case of a sync failure for one catalog, the sync to other catalogs can still succeed, so the user now only needs to retry the failed one.

### No Override of Internal Metadata Table Configs

Since misconfiguration could lead to data integrity issues, in 0.13.0 we have made the metadata table configuration much simpler for users. Internally, Hudi determines the best choices for these configurations for optimal performance and stability of the system.

The following metadata-table-related configurations are made internal; you can no longer configure these configs explicitly:
```
hoodie.metadata.clean.async
hoodie.metadata.cleaner.commits.retained
hoodie.metadata.enable.full.scan.log.files
hoodie.metadata.populate.meta.fields
```

### Spark SQL CTAS Performance Fix

Previously, the CTAS write operation was incorrectly set to use `UPSERT` due to misconfiguration. In the 0.13.0 release, we fix this to make sure CTAS uses the **`BULK_INSERT`** operation to boost the write performance of the first batch to a Hudi table (there's no real need to use `UPSERT` for it, as the table is being created).

### Flink CkpMetadata

Before 0.13.0, we bootstrapped the ckp metadata (checkpoint-related metadata) by cleaning all the messages. Some corner cases were not handled correctly. For example:

- The write task cannot fetch the pending instant correctly when restarting a job.

- If a checkpoint succeeds and the job crashes suddenly, the instant hasn't had time to commit. The data is lost because the last pending instant was rolled back; however, the Flink engine still thinks the checkpoint/instant is successful.

Q: Why did we clean the messages prior to the 0.13.0 release?

A: To prevent inconsistencies between the timeline and the messages.

Q: Why are we retaining the messages in the 0.13.0 release?

A: There are two cases for the inconsistency:

1. The timeline instant is complete but the ckp message is inflight (for committing an instant).

2. The timeline instant is pending while the ckp message has not started (for starting a new instant).

For case 1, there is no need to re-commit the instant, and it is fine if the write task does not get any pending instant when recovering.

For case 2, the instant is basically pending. The instant would be rolled back (as expected). Thus, keeping the ckp messages as is can actually maintain correctness.

## Release Highlights

### Metaserver

In 0.13.0, we introduce Metaserver, a centralized metadata management service. This is one of the first of many platform service components to come. Metaserver helps users easily manage a large number of tables in a data lake platform.

:::caution
This is an ***EXPERIMENTAL*** feature.
:::

To set up the metaserver in your environment, use [`hudi-metaserver-server-bundle`](https://mvnrepository.com/artifact/org.apache.hudi/hudi-metaserver-server-bundle) and run it as a Java server application, like `java -jar hudi-metaserver-server-bundle-<HUDI_VERSION>.jar`. On the client side, add the following options to integrate with the metaserver:
```
hoodie.metaserver.enabled=true
hoodie.metaserver.uris=thrift://<server url>:9090
```

The Metaserver stores Hudi tables' metadata, like table name, database, and owner, as well as the timeline's metadata, like commit instants, actions, and states.
In addition, the Metaserver supports the Spark writer and reader through Hudi Spark bundles.

### Change Data Capture

In cases where Hudi tables are used as streaming sources, we want to be aware of all changes to the records that belong to a single commit. For instance, we want to know which records were inserted, deleted, and updated. For updated records, the subsequent pipeline may want to get the old values before the update and the new ones after. Prior to 0.13.0, the incremental query did not contain hard-delete records, and users needed to use soft deletes to stream deletes, which may not meet GDPR requirements.

The Change Data Capture (CDC) feature enables Hudi to show how records have changed by producing the changes and therefore to handle CDC query use cases.

:::caution
CDC is an ***EXPERIMENTAL*** feature and works for COW tables with the Spark and Flink engines. MOR tables are not supported by CDC queries yet.
:::

To use CDC, users need to enable it first while writing to a table to log the extra data, which is returned by CDC incremental queries.

For writing, set `hoodie.table.cdc.enabled=true` and specify the CDC logging mode through `hoodie.table.cdc.supplemental.logging.mode` to control the data being logged. There are 3 modes to choose from:

- **`data_before_after`**: This logs the changed records' operations and the whole record before and after the change. This mode incurs the most CDC data on storage and requires the least computing effort for querying CDC results.

- **`data_before`**: This logs the changed records' operations and the whole record before the change.

- **`op_key_only`**: This only logs the changed records' operations and keys. This mode incurs the least CDC data on storage and requires the most computing effort for querying CDC results.

The default value is **`data_before_after`**.

For reading, set:
```
hoodie.datasource.query.type=incremental
hoodie.datasource.query.incremental.format=cdc
```
along with the other usual [incremental query](/docs/quick-start-guide#incremental-query) options, like begin and end instant times, and CDC results are returned.

:::caution
Note that `hoodie.table.cdc.enabled` is a table configuration. Once it is enabled, it is not allowed to be turned off for that table. Similarly, you cannot change `hoodie.table.cdc.supplemental.logging.mode` once it's saved as a table configuration.
:::

### Optimizing Record Payload Handling

This release introduces the long-awaited support for handling records in their engine-native representations, therefore avoiding the need to convert them to an intermediate representation (Avro).

:::caution
This feature is in an ***EXPERIMENTAL*** mode and is currently only supported for Spark.
:::

This was made possible through RFC-46 by introducing a new [`HoodieRecordMerger`](https://github.com/apache/hudi/blob/release-0.13.0/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecordMerger.java) abstraction. The `HoodieRecordMerger` is the core and the source of truth for implementing any merging semantics in Hudi going forward. In this capacity, it replaces the [`HoodieRecordPayload`](https://github.com/apache/hudi/blob/release-0.13.0/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecordPayload.java) hierarchy previously used for implementing custom merging semantics.
Relying on this unified component, in the form of a `HoodieRecordMerger`, allows us to handle records throughout the lifecycle of the write operation in a uniform manner. This substantially reduces latency because the records are now held in the engine-native representation, avoiding unnecessary copying, deserialization and conversion to the intermediate representation (Avro). In our benchmarks, upsert performance improves in the ballpark of 10% against the 0.13.0 default state and 20% when compared to 0.12.2.

To try it today, you'd need to specify the configs depending on the Hudi table type:

- For COW, specify `hoodie.datasource.write.record.merger.impls=org.apache.hudi.HoodieSparkRecordMerger`
- For MOR, specify `hoodie.datasource.write.record.merger.impls=org.apache.hudi.HoodieSparkRecordMerger` and `hoodie.logfile.data.block.format=parquet`

:::caution
Please note that the current [`HoodieSparkRecordMerger`](https://github.com/apache/hudi/blob/release-0.13.0/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/HoodieSparkRecordMerger.java) implementation only supports merging semantics equivalent to the [`OverwriteWithLatestAvroPayload`](https://github.com/apache/hudi/blob/release-0.13.0/hudi-common/src/main/java/org/apache/hudi/common/model/OverwriteWithLatestAvroPayload.java) class, which is the default `HoodieRecordPayload` implementation currently used for merging records (set as `hoodie.compaction.payload.class`). Therefore, if you're using any other `HoodieRecordPayload` implementation, unfortunately, you'd need to wait until it is replaced by the corresponding `HoodieRecordMerger` implementation.
:::

### New Source Support in Deltastreamer

[Deltastreamer](/docs/hoodie_deltastreamer) is a fully-managed incremental ETL utility that supports a wide variety of sources. In this release, we have added three new sources to its repertoire.

#### Proto Kafka Source

Deltastreamer already supports exactly-once ingestion of new events from Kafka using JSON and Avro formats. [`ProtoKafkaSource`](https://github.com/apache/hudi/blob/release-0.13.0/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/ProtoKafkaSource.java) extends this support to Protobuf class-based schemas as well. With just one additional config, one can easily set up this source. Check out the [docs](/docs/hoodie_deltastreamer) for more details.

#### GCS Incremental Source

Along the lines of the [S3 events source](https://hudi.apache.org/blog/2021/08/23/s3-events-source), we now have a reliable and fast way of ingesting from objects in Google Cloud Storage (GCS) through [`GcsEventsHoodieIncrSource`](https://github.com/apache/hudi/blob/release-0.13.0/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/GcsEventsHoodieIncrSource.java). Check out the docs on how to set up this source.

#### Pulsar Source

[Apache Pulsar](https://pulsar.apache.org/) is an open-source, distributed messaging and streaming platform built for the cloud. [`PulsarSource`](https://github.com/apache/hudi/blob/release-0.13.0/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/PulsarSource.java) supports ingesting from Apache Pulsar through the Deltastreamer. Check out the [docs](/docs/hoodie_deltastreamer) on how to set up this source.

### Support for Partial Payload Update

Partial update is a frequently asked use case from the community that requires the ability to update only certain fields and not replace the whole record.
Previously, we recommended that users satisfy this use case by bringing in their own custom record payload implementation. Given the popularity of this request, in the 0.13.0 release we added a new record payload implementation, [`PartialUpdateAvroPayload`](https://github.com/apache/hudi/blob/release-0.13.0/hudi-common/src/main/java/org/apache/hudi/common/model/PartialUpdateAvroPayload.java), to support this out of the box, so users can use this implementation instead of having to write their own.

### Consistent Hashing Index

We introduce the Consistent Hashing Index as yet another indexing option for your writes with Hudi. This is an enhancement to the [Bucket Index](/releases/release-0.11.0#bucket-index), which was added in the 0.11.0 release. With the Bucket Index, buckets/file groups per partition are statically allocated, whereas with the Consistent Hashing Index, buckets can grow dynamically, so users don't need to sweat about data skew. Buckets will expand and shrink depending on the load factor for each partition. You can find the [RFC](https://github.com/apache/hudi/blob/master/rfc/rfc-42/rfc-42.md) for the design of this feature.

Here are the configs of interest if you wish to give it a try:
```
hoodie.index.type=bucket
hoodie.index.bucket.engine=CONSISTENT_HASHING
hoodie.bucket.index.max.num.buckets=128
hoodie.bucket.index.min.num.buckets=32
hoodie.bucket.index.num.buckets=4
## split a bucket if its size reaches 1.5 * max_file_size
hoodie.bucket.index.split.threshold=1.5
## merge a bucket if its size is smaller than 0.1 * max_file_size
hoodie.bucket.index.merge.threshold=0.1
```

To enforce shrinking or scaling up of buckets, you need to enable clustering using the following configs:
```
## check for resizing every 4 commits
hoodie.clustering.inline=true
hoodie.clustering.inline.max.commits=4
hoodie.clustering.plan.strategy.class=org.apache.hudi.client.clustering.plan.strategy.SparkConsistentBucketClusteringPlanStrategy
hoodie.clustering.execution.strategy.class=org.apache.hudi.client.clustering.run.strategy.SparkConsistentBucketClusteringExecutionStrategy
## for supporting concurrent write & resizing
hoodie.clustering.updates.strategy=org.apache.hudi.client.clustering.update.strategy.SparkConsistentBucketDuplicateUpdateStrategy
```

:::caution
The Consistent Hashing Index is still an evolving feature, and currently there are some limitations to using it as of 0.13.0:

- This index is supported only for the Spark engine using a MOR table.
- It does not work with the metadata table enabled.
- To scale up or shrink the buckets, users have to manually trigger clustering using the above configs (at some cadence), but they cannot have compaction running concurrently.
- So, if compaction is enabled in your regular write pipeline, please follow this recommendation: you can choose to trigger the scaling/shrinking once every 12 hours. In such cases, once every 12 hours, you might need to disable compaction, stop your write pipeline, and enable clustering. You should take extreme care to not run both concurrently because it might result in conflicts and a failed pipeline. Once clustering is complete, you can resume your regular write pipeline, which will have compaction enabled.
:::

We are working towards automating these steps and making it easier for users to leverage the Consistent Hashing Index. You can follow the ongoing work on the Consistent Hashing Index [here](https://issues.apache.org/jira/browse/HUDI-3000).
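For illustration, here is a minimal sketch of a Spark writer creating a MOR table with the Consistent Hashing Index enabled; the table name, base path, and toy DataFrame are hypothetical placeholders, and the metadata table is explicitly disabled per the limitation above:

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("consistent-hashing-sketch").getOrCreate()
import spark.implicits._

// Illustrative batch; a real workload would read from an actual source.
val batch = Seq((1, "a", 1695000000L), (2, "b", 1695000001L)).toDF("id", "name", "ts")

batch.write
  .format("hudi")
  .option("hoodie.table.name", "events")                           // hypothetical table name
  .option("hoodie.datasource.write.table.type", "MERGE_ON_READ")   // Consistent Hashing Index requires MOR
  .option("hoodie.datasource.write.recordkey.field", "id")
  .option("hoodie.datasource.write.precombine.field", "ts")
  .option("hoodie.index.type", "BUCKET")
  .option("hoodie.index.bucket.engine", "CONSISTENT_HASHING")
  .option("hoodie.bucket.index.num.buckets", "4")
  .option("hoodie.metadata.enable", "false")                       // this index does not yet work with the metadata table
  .mode("append")
  .save("/tmp/hudi/events")                                        // illustrative base path
```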
### Early Conflict Detection for Multi-Writer

Hudi provides Optimistic Concurrency Control (OCC) to allow multiple writers to concurrently write and atomically commit to the Hudi table if there is no overlapping data file to be written, guaranteeing data consistency, integrity and correctness. Prior to the 0.13.0 release, such conflict detection of overlapping data files was performed after the data writing was completed and before committing the metadata. If any conflict is detected at this final stage, compute resources have already been wasted, because the data writing has already finished.

To improve the concurrency control, the 0.13.0 release introduces a new feature, early conflict detection in OCC, to detect the conflict during the data writing phase and abort the writing early on once a conflict is detected, using Hudi's marker mechanism. Hudi can now stop a conflicting writer much earlier because of the early conflict detection and release computing resources back to the cluster, improving resource utilization.

:::caution
The early conflict detection in OCC is ***EXPERIMENTAL*** in the 0.13.0 release.
:::

By default, this feature is turned off. To try it out, a user needs to set `hoodie.write.concurrency.early.conflict.detection.enable` to **`true`** when using OCC for concurrency control (see the [Concurrency Control](/docs/concurrency_control) page for more details).

### Lock-Free Message Queue in Writing Data

In previous versions, Hudi wrote incoming data into a table via a bounded in-memory queue using a producer-consumer model. In this release, we added a new type of queue, leveraging [Disruptor](https://lmax-exchange.github.io/disruptor/user-guide/index.html), which is lock-free. This increases the write throughput when the data volume is large. [The benchmark](https://github.com/apache/hudi/pull/5416) writing 100 million records to 1000 partitions in a Hudi table on cloud storage shows a **20%** performance improvement compared to the existing executor type backed by a bounded in-memory queue.

:::caution
`DisruptorExecutor` is supported for Spark insert and Spark bulk insert operations as an ***EXPERIMENTAL*** feature.
:::

Users can set `hoodie.write.executor.type=DISRUPTOR_EXECUTOR` to enable this feature. There are other configurations like `hoodie.write.wait.strategy` and `hoodie.write.buffer.size` to tune the performance further.

### Hudi CLI Bundle

We introduce a new Hudi CLI bundle, [`hudi-cli-bundle_2.12`](https://mvnrepository.com/artifact/org.apache.hudi/hudi-cli-bundle), for Spark 3.x to make the Hudi CLI easier to use. A user can now use this single bundle jar (published to the Maven repository) along with the Hudi Spark bundle and the launch script to start the Hudi CLI shell with Spark. This brings ease of deployment for the Hudi CLI, as the user does not need to compile the Hudi CLI module locally, upload jars, and address dependency conflicts, which was the case before this release. Detailed instructions can be found on the [Hudi CLI](/docs/cli) page.

### Support for Flink 1.16

Flink 1.16.x is integrated with Hudi; use the profile param `-Pflink1.16` when compiling the source code to activate this version. Alternatively, use [`hudi-flink1.16-bundle`](https://mvnrepository.com/artifact/org.apache.hudi/hudi-flink1.16-bundle). Flink 1.15, Flink 1.14 and Flink 1.13 will continue to be supported. Please check the [migration guide](#new-flink-bundle) for bundle updates.
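As a quick illustration of using Hudi on Flink 1.16 (a minimal sketch; the table name and path are hypothetical, and `hudi-flink1.16-bundle` is assumed to be on the Flink classpath), a Hudi table can be declared and written through the Flink SQL connector:

```scala
import org.apache.flink.table.api.{EnvironmentSettings, TableEnvironment}

val settings = EnvironmentSettings.inStreamingMode()
val tableEnv = TableEnvironment.create(settings)

// Declare a Hudi MOR table via the Flink SQL connector and write a test row.
tableEnv.executeSql(
  """CREATE TABLE hudi_events (
    |  id INT PRIMARY KEY NOT ENFORCED,
    |  name STRING,
    |  ts BIGINT
    |) WITH (
    |  'connector' = 'hudi',
    |  'path' = 'file:///tmp/hudi/hudi_events',
    |  'table.type' = 'MERGE_ON_READ'
    |)""".stripMargin)

tableEnv.executeSql("INSERT INTO hudi_events VALUES (1, 'alice', 1695000000)")
```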
+ +### Json Schema Converter + +For DeltaStreamer users who configure schema registry, a JSON schema converter is added to help convert JSON schema into +AVRO for the target Hudi table. Set `hoodie.deltastreamer.schemaprovider.registry.schemaconverter` to +`org.apache.hudi.utilities.schema.converter.JsonToAvroSchemaConverter` to use this feature. Users can also implement +this interface `org.apache.hudi.utilities.schema.SchemaRegistryProvider.SchemaConverter` to provide custom conversion +from the original schema to AVRO. + +### Providing Hudi Configs via Spark SQL Config + +Users can now provide Hudi configs via Spark SQL conf, for example, setting +``` +spark.sql("set hoodie.sql.bulk.insert.enable = true") +``` +to make sure Hudi is able to use `BULK_INSERT` operation when executing `INSERT INTO` statement. + +## Raw Release Notes + +The raw release notes are available [here](https://issues.apache.org/jira/secure/ReleaseNote.jspa?projectId=12322822&version=12352101). diff --git a/website/releases/release-0.7.0.md b/website/releases/release-0.7.0.md index 4b7a7cd06ec..e23e298e403 100644 --- a/website/releases/release-0.7.0.md +++ b/website/releases/release-0.7.0.md @@ -1,6 +1,6 @@ --- title: "Release 0.7.0" -sidebar_position: 11 +sidebar_position: 12 layout: releases toc: true last_modified_at: 2020-05-28T08:40:00-07:00 diff --git a/website/releases/release-0.8.0.md b/website/releases/release-0.8.0.md index 4efe443c94b..48c29e3d117 100644 --- a/website/releases/release-0.8.0.md +++ b/website/releases/release-0.8.0.md @@ -1,6 +1,6 @@ --- title: "Release 0.8.0" -sidebar_position: 10 +sidebar_position: 11 layout: releases toc: true last_modified_at: 2020-05-28T08:40:00-07:00 diff --git a/website/releases/release-0.9.0.md b/website/releases/release-0.9.0.md index f4a8029fbd2..39c40d8f3d1 100644 --- a/website/releases/release-0.9.0.md +++ b/website/releases/release-0.9.0.md @@ -1,6 +1,6 @@ --- title: "Release 0.9.0" -sidebar_position: 9 +sidebar_position: 10 layout: releases toc: true last_modified_at: 2021-08-26T08:40:00-07:00 diff --git a/website/src/components/HomepageHeader/index.js b/website/src/components/HomepageHeader/index.js index fb7852f1106..c8c473dbb32 100644 --- a/website/src/components/HomepageHeader/index.js +++ b/website/src/components/HomepageHeader/index.js @@ -19,7 +19,7 @@ function HomepageHeader() { </h1> <FeatureRender /> <div className={styles.buttons}> - <LinkButton to="/releases/release-0.12.2"> + <LinkButton to="/releases/release-0.13.0"> Latest releases </LinkButton> <LinkButton type="secondary" to="/docs/quick-start-guide">