This is an automated email from the ASF dual-hosted git repository.

sivabalan pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/asf-site by this push:
     new e87bf7c26c6  [HUDI-6861] update sql pages for 0.14.0 (#9699)
e87bf7c26c6 is described below

commit e87bf7c26c6dbf3bf2e374445d9d2a5c797d3b0f
Author: Jon Vexler <jbvex...@gmail.com>
AuthorDate: Fri Sep 15 22:44:00 2023 -0400

    [HUDI-6861] update sql pages for 0.14.0 (#9699)

    ---------

    Co-authored-by: Jonathan Vexler <=>
---
 website/docs/procedures.md       | 175 +++++++++++++++++++++++++--------------
 website/docs/table_management.md |  31 +++----
 2 files changed, 129 insertions(+), 77 deletions(-)

diff --git a/website/docs/procedures.md b/website/docs/procedures.md
index b280afb346c..ba2d1c06968 100644
--- a/website/docs/procedures.md
+++ b/website/docs/procedures.md
@@ -468,14 +468,14 @@ archive commits.
 
 **Input**
 
-| Parameter Name | Type | Required | Default Value | Description |
-|---|---|---|---|---|
-| table | String | N | None | Hudi table name |
-| path | String | N | None | Path of table |
-| min_commits | Int | N | 20 | Configuration as 'hoodie.keep.min.commits' |
-| max_commits | Int | N | 30 | Configuration as 'hoodie.keep.max.commits' |
-| retain_commits | Int | N | 10 | Configuration as 'hoodie.commits.archival.batch' |
-| enable_metadata | Boolean | N | false | Enable the internal metadata table |
+| Parameter Name | Type | Required | Default Value | Description |
+|---|---|---|---|---|
+| table | String | N | None | Hudi table name |
+| path | String | N | None | Path of table |
+| [min_commits](/docs/next/configurations#hoodiekeepmincommits) | Int | N | 20 | Similar to hoodie.keep.max.commits, but controls the minimum number of instants to retain in the active timeline. |
+| [max_commits](/docs/next/configurations#hoodiekeepmaxcommits) | Int | N | 30 | Archiving service moves older entries from timeline into an archived log after each write, to keep the metadata overhead constant, even as the table size grows. This config controls the maximum number of instants to retain in the active timeline. |
+| [retain_commits](/docs/next/configurations#hoodiecommitsarchivalbatch) | Int | N | 10 | Archiving of instants is batched in best-effort manner, to pack more instants into a single archive log. This config controls such archival batch size. |
+| [enable_metadata](/docs/next/configurations#hoodiemetadataenable) | Boolean | N | false | Enable the internal metadata table |
 
 **Output**
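As a usage sketch (not part of this commit): given the parameter table above, where either `table` or `path` identifies the target, a minimal invocation would look like the following, with the table name as a placeholder.

```
call archive_commits(table => 'test_hudi_table');
```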
@@ -669,16 +669,16 @@ copy table to a temporary view.
 
 **Input**
 
-| Parameter Name | Type | Required | Default Value | Description |
-|---|---|---|---|---|
-| table | String | Y | None | Hudi table name |
-| query_type | String | N | "snapshot" | Configuration as 'hoodie.datasource.query.type' |
-| view_name | String | Y | None | Name of view |
-| begin_instance_time | String | N | "" | Begin instance time |
-| end_instance_time | String | N | "" | End instance time |
-| as_of_instant | String | N | "" | As of instant time |
-| replace | Boolean | N | false | Replace an existed view |
-| global | Boolean | N | false | Global view |
+| Parameter Name | Type | Required | Default Value | Description |
+|---|---|---|---|---|
+| table | String | Y | None | Hudi table name |
+| [query_type](/docs/next/configurations#hoodiedatasourcequerytype) | String | N | "snapshot" | Whether data needs to be read, in `incremental` mode (new data since an instantTime) (or) `read_optimized` mode (obtain latest view, based on base files) (or) `snapshot` mode (obtain latest view, by merging base and (if any) log files) |
+| view_name | String | Y | None | Name of view |
+| begin_instance_time | String | N | "" | Begin instance time |
+| end_instance_time | String | N | "" | End instance time |
+| as_of_instant | String | N | "" | As of instant time |
+| replace | Boolean | N | false | Replace an existing view |
+| global | Boolean | N | false | Global view |
 
 **Output**
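A usage sketch (not part of this commit) supplying just the required parameters from the table above, `table` and `view_name`; both names are placeholders.

```
call copy_to_temp_view(table => 'test_hudi_table', view_name => 'temp_view_test_hudi_table');
```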
@@ -702,15 +702,15 @@ copy table to a new table.
 
 **Input**
 
-| Parameter Name | Type | Required | Default Value | Description |
-|---|---|---|---|---|
-| table | String | Y | None | Hudi table name |
-| query_type | String | N | "snapshot" | Configuration as 'hoodie.datasource.query.type' |
-| new_table | String | Y | None | Name of new table |
-| begin_instance_time | String | N | "" | Begin instance time |
-| end_instance_time | String | N | "" | End instance time |
-| as_of_instant | String | N | "" | As of instant time |
-| save_mode | String | N | "overwrite" | Save mode |
+| Parameter Name | Type | Required | Default Value | Description |
+|---|---|---|---|---|
+| table | String | Y | None | Hudi table name |
+| [query_type](/docs/next/configurations#hoodiedatasourcequerytype) | String | N | "snapshot" | Whether data needs to be read, in `incremental` mode (new data since an instantTime) (or) `read_optimized` mode (obtain latest view, based on base files) (or) `snapshot` mode (obtain latest view, by merging base and (if any) log files) |
+| new_table | String | Y | None | Name of new table |
+| begin_instance_time | String | N | "" | Begin instance time |
+| end_instance_time | String | N | "" | End instance time |
+| as_of_instant | String | N | "" | As of instant time |
+| save_mode | String | N | "overwrite" | Save mode |
 
 **Output**
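A usage sketch (not part of this commit) with only the required `table` and `new_table` parameters; the table names are placeholders.

```
call copy_to_table(table => 'test_hudi_table', new_table => 'copy_of_test_hudi_table');
```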
@@ -1348,12 +1348,13 @@ If both parameters are given, ``table`` will take effect.
 
 **Input**
 
-| Parameter Name | Type | Required | Default Value | Description |
-|---|---|---|---|---|
-| op | String | N | None | Operation type, `RUN` or `SCHEDULE` |
-| table | String | N | None | Name of table to be compacted |
-| path | String | N | None | Path of table to be compacted |
-| timestamp | String | N | None | Instant time |
+| Parameter Name | Type | Required | Default Value | Description |
+|---|---|---|---|---|
+| op | String | N | None | Operation type, `RUN` or `SCHEDULE` |
+| table | String | N | None | Name of table to be compacted |
+| path | String | N | None | Path of table to be compacted |
+| timestamp | String | N | None | Instant time |
+| options | String | N | None | Comma-separated list of Hudi configs for compaction in the format "config1=value1,config2=value2" |
 
 **Output**
 
@@ -1379,6 +1380,10 @@ Run compaction with table path and timestamp
 ```
 call run_compaction(op => 'run', path => '/tmp/hoodie/test_hudi_table', timestamp => '20220408153658568');
 ```
+Run compaction with options
+```
+call run_compaction(op => 'run', table => 'test_hudi_table', options => 'hoodie.compaction.strategy=org.apache.hudi.table.action.compact.strategy.LogFileNumBasedCompactionStrategy,hoodie.compaction.logfile.num.threshold=3');
+```
 
 Schedule compaction with table name
 ```
@@ -1458,6 +1463,47 @@ call show_compaction(table => 'test_hudi_table', limit => 1);
 |-------------------|------------|---------|
 | 20220408153707928 | compaction | 10      |
 
+### run_clean
+
+Run cleaner on a Hudi table.
+
+**Input**
+
+| Parameter Name | Type | Required | Default Value | Description [...]
+|---|---|---|---|---[...]
+| table | String | Y | None | Name of table to be cleaned [...]
+| schedule_in_line | Boolean | N | true | Set "true" if you want to schedule and run a clean. Set "false" if you have already scheduled a clean and want to run that. [...]
+| [clean_policy](/docs/next/configurations#hoodiecleanerpolicy) | String | N | None | org.apache.hudi.common.model.HoodieCleaningPolicy: Cleaning policy to be used. The cleaner service deletes older file slices to re-claim space. Long running query plans may often refer to older file slices and will break if those are cleaned, before the query has had a chance to run. So, it is good to make sure that the data is retained for more than the ma [...]
+| [retain_commits](/docs/next/configurations#hoodiecleanercommitsretained) | Int | N | None | When KEEP_LATEST_COMMITS cleaning policy is used, the number of commits to retain, without cleaning. This will be retained for num_of_commits * time_between_commits (scheduled). This also directly translates into how much data retention the table supports for incremental queries. [...]
+| [hours_retained](/docs/next/configurations#hoodiecleanerhoursretained) | Int | N | None | When KEEP_LATEST_BY_HOURS cleaning policy is used, the number of hours for which commits need to be retained. This config provides a more flexible option as compared to number of commits retained for cleaning service. Setting this property ensures all the files, but the latest in a file group, corresponding to commits with commit times older than the configured n [...]
+| [file_versions_retained](/docs/next/configurations#hoodiecleanerfileversionsretained) | Int | N | None | When KEEP_LATEST_FILE_VERSIONS cleaning policy is used, the minimum number of file slices to retain in each file group, during cleaning. [...]
+| [trigger_strategy](/docs/next/configurations#hoodiecleantriggerstrategy) | String | N | None | org.apache.hudi.table.action.clean.CleaningTriggerStrategy: Controls when cleaning is scheduled. NUM_COMMITS(default): Trigger the cleaning service every N commits, determined by `hoodie.clean.max.commits` [...]
+| [trigger_max_commits](/docs/next/configurations/#hoodiecleanmaxcommits) | Int | N | None | Number of commits after the last clean operation, before scheduling of a new clean is attempted. [...]
+| [options](/docs/next/configurations/#Clean-Configs) | String | N | None | Comma-separated list of Hudi configs for cleaning in the format "config1=value1,config2=value2" [...]
+
+**Output**
+
+| Parameter Name | Type |
+|---|---|
+| start_clean_time | String |
+| time_taken_in_millis | Long |
+| total_files_deleted | Int |
+| earliest_commit_to_retain | String |
+| bootstrap_part_metadata | String |
+| version | Int |
+
+**Example**
+
+Run clean with table name
+```
+call run_clean(table => 'test_hudi_table');
+```
+
+Run clean with keep latest file versions policy
+```
+call run_clean(table => 'test_hudi_table', trigger_max_commits => 2, clean_policy => 'KEEP_LATEST_FILE_VERSIONS', file_versions_retained => 1);
+```
+
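A sketch (not part of this commit) of passing cleaner configs inline through the `options` parameter described above; the two keys are the ones linked from the `clean_policy` and `retain_commits` rows, and the values are illustrative only.

```
call run_clean(table => 'test_hudi_table', options => 'hoodie.cleaner.policy=KEEP_LATEST_COMMITS,hoodie.cleaner.commits.retained=3');
```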
 ### delete_marker
 
 Delete marker files of a hudi table.
@@ -1521,17 +1567,20 @@ Sync the table's latest schema to Hive metastore.
 
 **Input**
 
-| Parameter Name | Type | Required | Default Value | Description |
-|---|---|---|---|---|
-| table | String | Y | None | Hudi table name |
-| metastore_uri | String | N | "" | Metastore_uri |
-| username | String | N | "" | User name |
-| password | String | N | "" | Password |
-| use_jdbc | String | N | "" | Configration as 'hoodie.datasource.hive_sync.use_jdbc' |
-| mode | String | N | "" | Configuration as 'hoodie.datasource.hive_sync.mode' |
-| partition_fields | String | N | "" | Configuration as 'hoodie.datasource.hive_sync.partition_fields' | |
-| partition_extractor_class | String | N | "" | Configuration as 'hoodie.datasource.hive_sync.partition_extractor_class' |
-| strategy | String | N | "" | Configuration as 'hoodie.datasource.hive_sync.table.strategy' |
+| Parameter Name | Type | Required | Default Value | Description |
+|---|---|---|---|---|
+| table | String | Y | None | Hudi table name |
+| metastore_uri | String | N | "" | Metastore URI |
+| username | String | N | "" | User name |
+| password | String | N | "" | Password |
+| [use_jdbc](/docs/next/configurations#hoodiedatasourcehive_syncuse_jdbc) | String | N | "" | Use JDBC when hive synchronization is enabled |
+| [mode](/docs/next/configurations#hoodiedatasourcehive_syncmode) | String | N | "" | Mode to choose for Hive ops. Valid values are hms, jdbc and hiveql. |
+| [partition_fields](/docs/next/configurations#hoodiedatasourcehive_syncpartition_fields) | String | N | "" | Field in the table to use for determining hive partition columns. |
+| [partition_extractor_class](/docs/next/configurations#hoodiedatasourcehive_syncpartition_extractor_class) | String | N | "" | Class which implements PartitionValueExtractor to extract the partition values, default 'org.apache.hudi.hive.MultiPartKeysValueExtractor'. |
+| [strategy](/docs/next/configurations#hoodiedatasourcehive_synctablestrategy) | String | N | "" | Hive table synchronization strategy. Available option: RO, RT, ALL. |
+| [sync_incremental](/docs/next/configurations#hoodiemetasyncincremental) | String | N | "" | Whether to incrementally sync the partitions to the metastore, i.e., only added, changed, and deleted partitions based on the commit metadata. If set to `false`, the meta sync executes a full partition sync operation when partitions are lost. |
+
 
 **Output**
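A usage sketch for `hive_sync` (not part of this commit), assuming a local Hive metastore; the URI and mode values are placeholders.

```
call hive_sync(table => 'test_hudi_table', metastore_uri => 'thrift://localhost:9083', mode => 'hms');
```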
@@ -1735,25 +1784,25 @@ Convert an existing table to Hudi.
 
 **Input**
 
-| Parameter Name | Type | Required | Default Value | Description |
-|---|---|---|---|---|
-| table | String | Y | None | Name of table to be clustered |
-| table_type | String | Y | None | Table type, MERGE_ON_READ or COPY_ON_WRITE |
-| bootstrap_path | String | Y | None | Bootstrap path |
-| base_path | String | Y | None | Base path |
-| rowKey_field | String | Y | None | Primary key field |
-| base_file_format | String | N | "PARQUET" | Format of base file |
-| partition_path_field | String | N | "" | Partitioned column field |
-| bootstrap_index_class | String | N | "org.apache.hudi.common.bootstrap.index.HFileBootstrapIndex" | Class of bootstrap index |
-| selector_class | String | N | "org.apache.hudi.client.bootstrap.selector.MetadataOnlyBootstrapModeSelector" | Class of selector |
-| key_generator_class | String | N | "org.apache.hudi.keygen.SimpleKeyGenerator" | Class of key generator |
-| full_bootstrap_input_provider | String | N | "org.apache.hudi.bootstrap.SparkParquetBootstrapDataProvider" | Class of full bootstrap input provider |
-| schema_provider_class | String | N | "" | Class of schema provider |
-| payload_class | String | N | "org.apache.hudi.common.model.OverwriteWithLatestAvroPayload" | Class of payload |
-| parallelism | Int | N | 1500 | Parallelism |
-| enable_hive_sync | Boolean | N | false | Whether to enable hive sync |
-| props_file_path | String | N | "" | Path of properties file |
-| bootstrap_overwrite | Boolean | N | false | Overwrite bootstrap path |
+| Parameter Name | Type | Required | Default Value | Description [...]
+|---|---|---|---|---[...]
+| table | String | Y | None | Name of table to be bootstrapped [...]
+| table_type | String | Y | None | Table type, MERGE_ON_READ or COPY_ON_WRITE [...]
+| [bootstrap_path](/docs/next/configurations#hoodiebootstrapbasepath) | String | Y | None | Base path of the dataset that needs to be bootstrapped as a Hudi table [...]
+| base_path | String | Y | None | Base path [...]
+| rowKey_field | String | Y | None | Primary key field [...]
+| base_file_format | String | N | "PARQUET" | Format of base file [...]
+| partition_path_field | String | N | "" | Partitioned column field [...]
+| [bootstrap_index_class](/docs/next/configurations#hoodiebootstrapindexclass) | String | N | "org.apache.hudi.common.bootstrap.index.HFileBootstrapIndex" | Implementation to use, for mapping a skeleton base file to a bootstrap base file. [...]
+| [selector_class](/docs/next/configurations#hoodiebootstrapmodeselector) | String | N | "org.apache.hudi.client.bootstrap.selector.MetadataOnlyBootstrapModeSelector" | Selects the mode in which each file/partition in the bootstrapped dataset gets bootstrapped [...]
+| key_generator_class | String | N | "org.apache.hudi.keygen.SimpleKeyGenerator" | Class of key generator [...]
+| full_bootstrap_input_provider | String | N | "org.apache.hudi.bootstrap.SparkParquetBootstrapDataProvider" | Class of full bootstrap input provider [...]
+| schema_provider_class | String | N | "" | Class of schema provider [...]
+| payload_class | String | N | "org.apache.hudi.common.model.OverwriteWithLatestAvroPayload" | Class of payload [...]
+| [parallelism](/docs/next/configurations#hoodiebootstrapparallelism) | Int | N | 1500 | For metadata-only bootstrap, Hudi parallelizes the operation so that each table partition is handled by one Spark task. This config limits the number of parallelism. We pick the configured parallelism if the number of table partitions is larger than this configured value. The parallelism is assigned to the nu [...]
+| enable_hive_sync | Boolean | N | false | Whether to enable hive sync [...]
+| props_file_path | String | N | "" | Path of properties file [...]
+| bootstrap_overwrite | Boolean | N | false | Overwrite bootstrap path [...]
 
 **Output**
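A usage sketch (not part of this commit), assuming this section documents the `run_bootstrap` procedure (its heading falls outside this hunk); all paths and field names are placeholders.

```
call run_bootstrap(table => 'test_hudi_table', table_type => 'COPY_ON_WRITE', bootstrap_path => '/tmp/source_parquet_table', base_path => '/tmp/hudi/test_hudi_table', rowKey_field => 'id');
```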
diff --git a/website/docs/table_management.md b/website/docs/table_management.md
index d00797ac99e..672ade8ccb8 100644
--- a/website/docs/table_management.md
+++ b/website/docs/table_management.md
@@ -18,13 +18,13 @@ Only SparkSQL needs an explicit Create Table command. No Create Table command is
 
 Users can set table options while creating a hudi table.
 
-| Parameter Name | Description | (Optional/Required) : Default Value |
-|---|---|---|
-| primaryKey | The primary key names of the table, multiple fields separated by commas. | (Optional) : `id` |
-| type | The type of table to create ([read more](/docs/table_types)). <br></br> `cow` = COPY-ON-WRITE, `mor` = MERGE-ON-READ. | (Optional) : `cow` |
-| preCombineField | The Pre-Combine field of the table. | (Optional) : `ts` |
+| Parameter Name | Default | Description |
+|---|---|---|
+| primaryKey | id (Optional) | The primary key names of the table, multiple fields separated by commas. |
+| type | cow (Optional) | The type of table to create ([read more](/docs/table_types)). <br></br> `cow` = COPY-ON-WRITE, `mor` = MERGE-ON-READ. |
+| preCombineField | ts (Optional) | The Pre-Combine field of the table. |
 
-To set any custom hudi config(like index type, max parquet size, etc), see the "Set hudi config section" .
+To set any custom hudi config (like index type, max parquet size, etc.), see the section [Set hudi config options](#set-hoodie-config-options).
 
 ### Table Type
 Here is an example of creating a COW table.
@@ -36,7 +36,7 @@ create table if not exists hudi_table2(
   name string,
   price double
 ) using hudi
-options (
+tblproperties (
   type = 'cow'
 );
 ```
@@ -51,7 +51,7 @@ create table if not exists hudi_table0 (
   name string,
   price double
 ) using hudi
-options (
+tblproperties (
   type = 'cow',
   primaryKey = 'id'
 );
@@ -69,7 +69,7 @@ create table if not exists hudi_table1 (
   price double,
   ts bigint
 ) using hudi
-options (
+tblproperties (
   type = 'mor',
   primaryKey = 'id,name',
   preCombineField = 'ts'
@@ -77,6 +77,9 @@ options (
 ```
 
 ### Partitioned Table
+:::note
+When created in spark-sql, partition columns will always be the last columns of the table.
+:::
 Here is an example of creating a COW partitioned table.
 ```sql
 create table if not exists hudi_table_p0 (
 id bigint,
 name string,
 dt string,
 hh string
 ) using hudi
-options (
+tblproperties (
   type = 'cow',
   primaryKey = 'id'
 )
@@ -118,7 +121,7 @@ select 1 as id, 'a1' as name, 10 as price;
 
 ```sql
 create table h2 using hudi
-options (type = 'cow', primaryKey = 'id')
+tblproperties (type = 'cow', primaryKey = 'id')
 partitioned by (dt)
 as
 select 1 as id, 'a1' as name, 10 as price, 1000 as dt;
@@ -131,7 +134,7 @@ select 1 as id, 'a1' as name, 10 as price, 1000 as dt;
 create table parquet_mngd using parquet location 'file:///tmp/parquet_dataset/*.parquet';
 
 # CTAS by loading data into hudi table
-create table hudi_tbl using hudi location 'file:/tmp/hudi/hudi_tbl/' options (
+create table hudi_tbl using hudi location 'file:/tmp/hudi/hudi_tbl/' tblproperties (
   type = 'cow',
   primaryKey = 'id',
   preCombineField = 'ts'
@@ -148,7 +151,7 @@ create table if not exists h3(
 name string,
 price double
 ) using hudi
-options (
+tblproperties (
   primaryKey = 'id',
   type = 'mor',
   ${hoodie.config.key1} = '${hoodie.config.value2}',
@@ -162,7 +165,7 @@ create table if not exists h3(
 name string,
 price double
 ) using hudi
-options (
+tblproperties (
   primaryKey = 'id',
   type = 'mor',
   hoodie.cleaner.fileversions.retained = '20',