[hudi] branch master updated: [HUDI-5977] Fix Date to String column schema evolution (#8280)
This is an automated email from the ASF dual-hosted git repository. mengtao pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git The following commit(s) were added to refs/heads/master by this push: new 04ec593413e [HUDI-5977] Fix Date to String column schema evolution (#8280) 04ec593413e is described below commit 04ec593413e0be986dc021a7385b0d66d5659749 Author: voonhous AuthorDate: Tue Mar 28 15:23:37 2023 +0800 [HUDI-5977] Fix Date to String column schema evolution (#8280) --- .../org/apache/spark/sql/hudi/TestSpark3DDL.scala | 48 ++ .../parquet/Spark24HoodieParquetFileFormat.scala | 6 ++- .../parquet/Spark31HoodieParquetFileFormat.scala | 6 ++- .../Spark32PlusHoodieParquetFileFormat.scala | 6 ++- 4 files changed, 63 insertions(+), 3 deletions(-) diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestSpark3DDL.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestSpark3DDL.scala index de77b07fd83..7b1cf43b739 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestSpark3DDL.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestSpark3DDL.scala @@ -735,4 +735,52 @@ class TestSpark3DDL extends HoodieSparkSqlTestBase { } } } + + test("Test DATE to STRING conversions when vectorized reading is not enabled") { +withTempDir { tmp => + Seq("COPY_ON_WRITE", "MERGE_ON_READ").foreach { tableType => +val tableName = generateTableName +val tablePath = s"${new Path(tmp.getCanonicalPath, tableName).toUri.toString}" +if (HoodieSparkUtils.gteqSpark3_1) { + // adding a struct column to force reads to use non-vectorized readers + spark.sql( +s""" + | create table $tableName ( + | id int, + | name string, + | price double, + | struct_col struct<f0: int, f1: string>, + | ts long + |) using hudi + | location '$tablePath' + | options ( + | type = '$tableType', + | primaryKey = 'id', + | preCombineField = 'ts' + | ) + | partitioned 
by (ts) + """.stripMargin) + spark.sql( +s""" + | insert into $tableName + | values (1, 'a1', 10, struct(1, 'f_1'), 1000) + """.stripMargin) + spark.sql(s"select * from $tableName") + + spark.sql("set hoodie.schema.on.read.enable=true") + spark.sql(s"alter table $tableName add columns(date_to_string_col date)") + spark.sql( +s""" + | insert into $tableName + | values (2, 'a2', 20, struct(2, 'f_2'), date '2023-03-22', 1001) + """.stripMargin) + spark.sql(s"alter table $tableName alter column date_to_string_col type string") + + // struct and string (converted from date) column must be read to ensure that non-vectorized reader is used + // not checking results as we just need to ensure that the table can be read without any errors thrown + spark.sql(s"select * from $tableName") +} + } +} + } } diff --git a/hudi-spark-datasource/hudi-spark2/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/Spark24HoodieParquetFileFormat.scala b/hudi-spark-datasource/hudi-spark2/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/Spark24HoodieParquetFileFormat.scala index 1a8585b38aa..c168911302e 100644 --- a/hudi-spark-datasource/hudi-spark2/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/Spark24HoodieParquetFileFormat.scala +++ b/hudi-spark-datasource/hudi-spark2/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/Spark24HoodieParquetFileFormat.scala @@ -107,6 +107,7 @@ class Spark24HoodieParquetFileFormat(private val shouldAppendPartitionValues: Bo val pushDownStringStartWith = sqlConf.parquetFilterPushDownStringStartWith val pushDownInFilterThreshold = sqlConf.parquetFilterPushDownInFilterThreshold val isCaseSensitive = sqlConf.caseSensitiveAnalysis +val timeZoneId = Option(sqlConf.sessionLocalTimeZone) (file: PartitionedFile) => { assert(!shouldAppendPartitionValues || file.partitionValues.numFields == partitionSchema.size) @@ -238,7 +239,10 @@ class Spark24HoodieParquetFileFormat(private val 
shouldAppendPartitionValues: Bo }).toAttributes ++ partitionSchema.toAttributes val castS
[hudi] branch master updated: Fix some comments on schema change (#8084)
This is an automated email from the ASF dual-hosted git repository. mengtao pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git The following commit(s) were added to refs/heads/master by this push: new dcedf139d32 Fix some comments on schema change (#8084) dcedf139d32 is described below commit dcedf139d32b681ef196df279f93da5c01fc5d61 Author: mwish <1506118...@qq.com> AuthorDate: Mon Mar 6 19:29:18 2023 +0800 Fix some comments on schema change (#8084) --- .../org/apache/hudi/client/utils/SparkInternalSchemaConverter.java | 4 ++-- .../org/apache/hudi/internal/schema/utils/SchemaChangeUtils.java| 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/utils/SparkInternalSchemaConverter.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/utils/SparkInternalSchemaConverter.java index 198b097134b..2b14bb3a066 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/utils/SparkInternalSchemaConverter.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/utils/SparkInternalSchemaConverter.java @@ -287,7 +287,7 @@ public class SparkInternalSchemaConverter { /** * Convert Int/long type to other Type. - * Now only support int/long -> long/float/double/string + * Now only support int/long -> long/float/double/string/Decimal * TODO: support more types */ private static boolean convertIntLongType(WritableColumnVector oldV, WritableColumnVector newV, DataType newType, int len) { @@ -321,7 +321,7 @@ public class SparkInternalSchemaConverter { /** * Convert float type to other Type. 
- * Now only support float -> double/String + * Now only support float -> double/String/Decimal * TODO: support more types */ private static boolean convertFloatType(WritableColumnVector oldV, WritableColumnVector newV, DataType newType, int len) { diff --git a/hudi-common/src/main/java/org/apache/hudi/internal/schema/utils/SchemaChangeUtils.java b/hudi-common/src/main/java/org/apache/hudi/internal/schema/utils/SchemaChangeUtils.java index f768830737c..ff2ca89e98e 100644 --- a/hudi-common/src/main/java/org/apache/hudi/internal/schema/utils/SchemaChangeUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/internal/schema/utils/SchemaChangeUtils.java @@ -38,9 +38,9 @@ public class SchemaChangeUtils { /** * Whether to allow the column type to be updated. * now only support: - * int => long/float/double/string - * long => float/double/string - * float => double/String + * int => long/float/double/String/Decimal + * long => float/double/String/Decimal + * float => double/String/Decimal * double => String/Decimal * Decimal => Decimal/String * String => date/decimal
[hudi] branch master updated: [MINOR] schema evolution relax decimal type conversion conditions (#8063)
This is an automated email from the ASF dual-hosted git repository. mengtao pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git The following commit(s) were added to refs/heads/master by this push: new 30175e992ef [MINOR] schema evolution relax decimal type conversion conditions (#8063) 30175e992ef is described below commit 30175e992ef68592f574fc90ccf415623a29827d Author: watermelon12138 <49849410+watermelon12...@users.noreply.github.com> AuthorDate: Tue Feb 28 09:25:17 2023 +0800 [MINOR] schema evolution relax decimal type conversion conditions (#8063) --- .../java/org/apache/hudi/internal/schema/utils/SchemaChangeUtils.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hudi-common/src/main/java/org/apache/hudi/internal/schema/utils/SchemaChangeUtils.java b/hudi-common/src/main/java/org/apache/hudi/internal/schema/utils/SchemaChangeUtils.java index 290e3489a3e..f768830737c 100644 --- a/hudi-common/src/main/java/org/apache/hudi/internal/schema/utils/SchemaChangeUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/internal/schema/utils/SchemaChangeUtils.java @@ -77,6 +77,9 @@ public class SchemaChangeUtils { if (decimalDsr.isWiderThan(decimalSrc)) { return true; } + if (decimalDsr.precision() >= decimalSrc.precision() && decimalDsr.scale() == decimalSrc.scale()) { +return true; + } } else if (dsr.typeId() == Type.TypeID.STRING) { return true; }
[hudi] branch master updated (3e12b46d717 -> 31e94abf902)
This is an automated email from the ASF dual-hosted git repository. mengtao pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git from 3e12b46d717 [MINOR] fix the compilation failure for master branch. (#8059) add 31e94abf902 [HUDI-5835] After performing the update operation, the hoodie table cannot be read normally by spark (#8026) No new revisions were added by this update. Summary of changes: .../scala/org/apache/hudi/HoodieBaseRelation.scala | 28 +- .../org/apache/hudi/HoodieBootstrapRelation.scala | 9 +++--- .../apache/hudi/MergeOnReadSnapshotRelation.scala | 2 +- .../apache/spark/sql/hudi/TestUpdateTable.scala| 34 ++ 4 files changed, 55 insertions(+), 18 deletions(-)
[hudi] branch master updated (1377143656a -> 252c4033010)
This is an automated email from the ASF dual-hosted git repository. mengtao pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git from 1377143656a [HUDI-5487] Reduce duplicate logs in ExternalSpillableMap (#7579) add 252c4033010 [MINOR] Standardise schema concepts on Flink Engine (#7761) No new revisions were added by this update. Summary of changes: .../internal/schema/utils/InternalSchemaUtils.java | 4 +- .../hudi/table/format/InternalSchemaManager.java | 57 +- .../apache/hudi/table/format/RecordIterators.java | 8 +-- 3 files changed, 41 insertions(+), 28 deletions(-)
[hudi] branch master updated: [HUDI-5400] Fix read issues when Hudi-FULL schema evolution is not enabled (#7480)
This is an automated email from the ASF dual-hosted git repository. mengtao pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git The following commit(s) were added to refs/heads/master by this push: new 64b814ea23 [HUDI-5400] Fix read issues when Hudi-FULL schema evolution is not enabled (#7480) 64b814ea23 is described below commit 64b814ea237bd1576af3673d04c7bb965218fdef Author: voonhous AuthorDate: Sat Dec 24 15:41:59 2022 +0800 [HUDI-5400] Fix read issues when Hudi-FULL schema evolution is not enabled (#7480) --- .../parquet/HoodieParquetFileFormatHelper.scala| 72 ++ .../hudi/TestAvroSchemaResolutionSupport.scala | 794 + ...Spark24HoodieVectorizedParquetRecordReader.java | 185 + .../parquet/Spark24HoodieParquetFileFormat.scala | 62 +- .../parquet/Spark31HoodieParquetFileFormat.scala | 12 +- .../Spark32PlusHoodieParquetFileFormat.scala | 10 +- 6 files changed, 1116 insertions(+), 19 deletions(-) diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/HoodieParquetFileFormatHelper.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/HoodieParquetFileFormatHelper.scala new file mode 100644 index 00..ce1a719cb9 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/HoodieParquetFileFormatHelper.scala @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources.parquet + +import org.apache.hadoop.conf.Configuration +import org.apache.parquet.hadoop.metadata.FileMetaData +import org.apache.spark.sql.types.{ArrayType, DataType, MapType, StructField, StructType} + +object HoodieParquetFileFormatHelper { + + def buildImplicitSchemaChangeInfo(hadoopConf: Configuration, +parquetFileMetaData: FileMetaData, +requiredSchema: StructType): (java.util.Map[Integer, org.apache.hudi.common.util.collection.Pair[DataType, DataType]], StructType) = { +val implicitTypeChangeInfo: java.util.Map[Integer, org.apache.hudi.common.util.collection.Pair[DataType, DataType]] = new java.util.HashMap() +val convert = new ParquetToSparkSchemaConverter(hadoopConf) +val fileStruct = convert.convert(parquetFileMetaData.getSchema) +val fileStructMap = fileStruct.fields.map(f => (f.name, f.dataType)).toMap +val sparkRequestStructFields = requiredSchema.map(f => { + val requiredType = f.dataType + if (fileStructMap.contains(f.name) && !isDataTypeEqual(requiredType, fileStructMap(f.name))) { +implicitTypeChangeInfo.put(new Integer(requiredSchema.fieldIndex(f.name)), org.apache.hudi.common.util.collection.Pair.of(requiredType, fileStructMap(f.name))) +StructField(f.name, fileStructMap(f.name), f.nullable) + } else { +f + } +}) +(implicitTypeChangeInfo, StructType(sparkRequestStructFields)) + } + + def isDataTypeEqual(requiredType: DataType, fileType: DataType): Boolean = (requiredType, fileType) match { +case (requiredType, fileType) if requiredType == fileType => true + +case 
(ArrayType(rt, _), ArrayType(ft, _)) => + // Do not care about nullability as schema evolution require fields to be nullable + isDataTypeEqual(rt, ft) + +case (MapType(requiredKey, requiredValue, _), MapType(fileKey, fileValue, _)) => + // Likewise, do not care about nullability as schema evolution require fields to be nullable + isDataTypeEqual(requiredKey, fileKey) && isDataTypeEqual(requiredValue, fileValue) + +case (StructType(requiredFields), StructType(fileFields)) => + // Find fields that are in requiredFields and fileFields as they might not be the same during add column + change column operations + val commonFieldNames = requiredFields.map(_.name) intersect fileFields.map(_.name) + + // Need to match by name instead of StructField as name will stay the same whilst type may chan
[hudi] branch master updated (74f8d94b73b -> 06c8fa5a62f)
This is an automated email from the ASF dual-hosted git repository. mengtao pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git from 74f8d94b73b [HUDI-5331] Add schema settings with stream api (#7384) add 06c8fa5a62f [HUDI-5294] Support type change for schema on read + reconcile schema (#7326) No new revisions were added by this update. Summary of changes: .../apache/hudi/client/BaseHoodieWriteClient.java | 2 +- .../table/action/commit/HoodieMergeHelper.java | 14 ++- .../hudi/common/util/InternalSchemaCache.java | 26 - .../schema/utils/AvroSchemaEvolutionUtils.java | 33 --- .../org/apache/hudi/HoodieSparkSqlWriter.scala | 35 +-- .../org/apache/spark/sql/hudi/TestSpark3DDL.scala | 106 - 6 files changed, 186 insertions(+), 30 deletions(-)
[hudi] branch master updated: [HUDI-5244] Fix bugs in schema evolution client with lost operation field and not found schema (#7248)
This is an automated email from the ASF dual-hosted git repository. mengtao pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git The following commit(s) were added to refs/heads/master by this push: new 4c218231bd [HUDI-5244] Fix bugs in schema evolution client with lost operation field and not found schema (#7248) 4c218231bd is described below commit 4c218231bdb2a24310145b721cf87ba7b0f1534a Author: Alexander Trushev AuthorDate: Mon Nov 21 10:54:01 2022 +0700 [HUDI-5244] Fix bugs in schema evolution client with lost operation field and not found schema (#7248) * [HUDI-5244] Fix bugs in schema evolution client with lost operation field and not found schema --- .../apache/hudi/client/BaseHoodieWriteClient.java | 21 +++-- .../action/commit/TestSchemaEvolutionClient.java | 96 ++ .../java/org/apache/hudi/avro/HoodieAvroUtils.java | 4 + 3 files changed, 114 insertions(+), 7 deletions(-) diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseHoodieWriteClient.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseHoodieWriteClient.java index 4a3f6bd311..133dfce9e9 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseHoodieWriteClient.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseHoodieWriteClient.java @@ -280,7 +280,7 @@ public abstract class BaseHoodieWriteClient getInternalSchemaAndMetaClient() { HoodieTableMetaClient metaClient = createMetaClient(true); TableSchemaResolver schemaUtil = new TableSchemaResolver(metaClient); -Option internalSchemaOption = schemaUtil.getTableInternalSchemaFromCommitMetadata(); -if (!internalSchemaOption.isPresent()) { - throw new HoodieException(String.format("cannot find schema for current table: %s", config.getBasePath())); -} -return Pair.of(internalSchemaOption.get(), metaClient); +return Pair.of(getInternalSchema(schemaUtil), metaClient); } private void 
commitTableChange(InternalSchema newSchema, HoodieTableMetaClient metaClient) { TableSchemaResolver schemaUtil = new TableSchemaResolver(metaClient); -String historySchemaStr = schemaUtil.getTableHistorySchemaStrFromCommitMetadata().orElse(""); +String historySchemaStr = schemaUtil.getTableHistorySchemaStrFromCommitMetadata().orElseGet( +() -> SerDeHelper.inheritSchemas(getInternalSchema(schemaUtil), "")); Schema schema = AvroInternalSchemaConverter.convert(newSchema, config.getTableName()); String commitActionType = CommitUtils.getCommitActionType(WriteOperationType.ALTER_SCHEMA, metaClient.getTableType()); String instantTime = HoodieActiveTimeline.createNewInstantTime(); @@ -1793,4 +1790,14 @@ public abstract class BaseHoodieWriteClient { + try { +return AvroInternalSchemaConverter.convert(schemaUtil.getTableAvroSchema()); + } catch (Exception e) { +throw new HoodieException(String.format("cannot find schema for current table: %s", config.getBasePath())); + } +}); + } } diff --git a/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/table/action/commit/TestSchemaEvolutionClient.java b/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/table/action/commit/TestSchemaEvolutionClient.java new file mode 100644 index 00..bda4a3267d --- /dev/null +++ b/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/table/action/commit/TestSchemaEvolutionClient.java @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.commit; + +import org.apache.hudi.client.HoodieJavaWriteClient; +import org.apache.hudi.common.engine.EngineType; +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.table.TableSchemaResolver; +import org.apache.hudi.common.testutils.RawTripTestPayload; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.internal.schema.Types; +import org.apache.hudi.testutils.Hoodie
[hudi] branch master updated: [HUDI-5237] Support for HoodieUnMergedLogRecordScanner with InternalSchema (#7237)
This is an automated email from the ASF dual-hosted git repository. mengtao pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git The following commit(s) were added to refs/heads/master by this push: new 21bcbfd792 [HUDI-5237] Support for HoodieUnMergedLogRecordScanner with InternalSchema (#7237) 21bcbfd792 is described below commit 21bcbfd7923c65acd7e458d6ca2a05f1ff9df109 Author: Alexander Trushev AuthorDate: Mon Nov 21 10:26:16 2022 +0700 [HUDI-5237] Support for HoodieUnMergedLogRecordScanner with InternalSchema (#7237) * [HUDI-5237] Support for HoodieUnMergedLogRecordScanner with InternalSchema --- .../common/table/log/AbstractHoodieLogRecordReader.java | 2 ++ .../common/table/log/HoodieMergedLogRecordScanner.java| 3 ++- .../common/table/log/HoodieUnMergedLogRecordScanner.java | 15 --- .../metadata/HoodieMetadataMergedLogRecordReader.java | 5 + 4 files changed, 21 insertions(+), 4 deletions(-) diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordReader.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordReader.java index 88da6aa1f0..0c8d8b3f6c 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordReader.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordReader.java @@ -817,6 +817,8 @@ public abstract class AbstractHoodieLogRecordReader { public abstract Builder withReaderSchema(Schema schema); +public abstract Builder withInternalSchema(InternalSchema internalSchema); + public abstract Builder withLatestInstantTime(String latestInstantTime); public abstract Builder withReadBlocksLazily(boolean readBlocksLazily); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieMergedLogRecordScanner.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieMergedLogRecordScanner.java index e846637493..708015b1c1 100644 --- 
a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieMergedLogRecordScanner.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieMergedLogRecordScanner.java @@ -308,8 +308,9 @@ public class HoodieMergedLogRecordScanner extends AbstractHoodieLogRecordReader return this; } +@Override public Builder withInternalSchema(InternalSchema internalSchema) { - this.internalSchema = internalSchema == null ? InternalSchema.getEmptyInternalSchema() : internalSchema; + this.internalSchema = internalSchema; return this; } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieUnMergedLogRecordScanner.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieUnMergedLogRecordScanner.java index 7ddb9f1236..b0d127c562 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieUnMergedLogRecordScanner.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieUnMergedLogRecordScanner.java @@ -40,9 +40,10 @@ public class HoodieUnMergedLogRecordScanner extends AbstractHoodieLogRecordReade private HoodieUnMergedLogRecordScanner(FileSystem fs, String basePath, List logFilePaths, Schema readerSchema, String latestInstantTime, boolean readBlocksLazily, boolean reverseReader, int bufferSize, - LogRecordScannerCallback callback, Option instantRange, boolean useScanV2) { + LogRecordScannerCallback callback, Option instantRange, InternalSchema internalSchema, + boolean useScanV2) { super(fs, basePath, logFilePaths, readerSchema, latestInstantTime, readBlocksLazily, reverseReader, bufferSize, instantRange, -false, true, Option.empty(), InternalSchema.getEmptyInternalSchema(), useScanV2); +false, true, Option.empty(), internalSchema, useScanV2); this.callback = callback; } @@ -81,6 +82,7 @@ public class HoodieUnMergedLogRecordScanner extends AbstractHoodieLogRecordReade private String basePath; private List logFilePaths; private Schema readerSchema; +private InternalSchema internalSchema; 
private String latestInstantTime; private boolean readBlocksLazily; private boolean reverseReader; @@ -112,6 +114,12 @@ public class HoodieUnMergedLogRecordScanner extends AbstractHoodieLogRecordReade return this; } +@Override +public Builder withInternalSchema(InternalSchema internalSchema) { + this.internalSchema = internalSchema; + return this; +} + public Builder withLatestInstantTime(String latestInstantTime) { this.latestInstantTime = latestInstantTime; return this; @@ -151,7 +159,8 @@ public class HoodieUnMergedLogRecordScanner extends
[hudi] branch master updated (c5590cdd0f -> e4e28836c2)
This is an automated email from the ASF dual-hosted git repository. mengtao pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git from c5590cdd0f [HUDI-4812] Lazy fetching partition path & file slice for HoodieFileIndex (#6680) add e4e28836c2 [HUDI-5233] Fix bug when InternalSchemaUtils.collectTypeChangedCols returns all columns (#7228) No new revisions were added by this update. Summary of changes: .../java/org/apache/hudi/internal/schema/Type.java | 22 ++ .../schema/utils/TestInternalSchemaUtils.java | 14 ++ 2 files changed, 36 insertions(+)
[hudi] branch master updated: [HUDI-5178] Add Call show_table_properties for spark sql (#7161)
This is an automated email from the ASF dual-hosted git repository. mengtao pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git The following commit(s) were added to refs/heads/master by this push: new 1d1181a441 [HUDI-5178] Add Call show_table_properties for spark sql (#7161) 1d1181a441 is described below commit 1d1181a4410154ff0615f374cfee97630b425e88 Author: ForwardXu AuthorDate: Wed Nov 9 10:41:03 2022 +0800 [HUDI-5178] Add Call show_table_properties for spark sql (#7161) --- .../hudi/command/procedures/BaseProcedure.scala| 4 +- .../hudi/command/procedures/HoodieProcedures.scala | 1 + .../procedures/ShowTablePropertiesProcedure.scala | 71 ++ .../TestShowTablePropertiesProcedure.scala | 45 ++ 4 files changed, 119 insertions(+), 2 deletions(-) diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/BaseProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/BaseProcedure.scala index d0404664f4..67930cb3ed 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/BaseProcedure.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/BaseProcedure.scala @@ -22,7 +22,7 @@ import org.apache.hudi.client.SparkRDDWriteClient import org.apache.hudi.client.common.HoodieSparkEngineContext import org.apache.hudi.common.model.HoodieRecordPayload import org.apache.hudi.config.{HoodieIndexConfig, HoodieWriteConfig} -import org.apache.hudi.exception.HoodieClusteringException +import org.apache.hudi.exception.HoodieException import org.apache.hudi.index.HoodieIndex.IndexType import org.apache.spark.api.java.JavaSparkContext import org.apache.spark.sql.SparkSession @@ -111,7 +111,7 @@ abstract class BaseProcedure extends Procedure { t => HoodieCLIUtils.getHoodieCatalogTable(sparkSession, t.asInstanceOf[String]).tableLocation) .getOrElse( 
tablePath.map(p => p.asInstanceOf[String]).getOrElse( - throw new HoodieClusteringException("Table name or table path must be given one")) + throw new HoodieException("Table name or table path must be given one")) ) } diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/HoodieProcedures.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/HoodieProcedures.scala index f14d96139b..3b38925455 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/HoodieProcedures.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/HoodieProcedures.scala @@ -82,6 +82,7 @@ object HoodieProcedures { ,(HiveSyncProcedure.NAME, HiveSyncProcedure.builder) ,(CopyToTempView.NAME, CopyToTempView.builder) ,(ShowCommitExtraMetadataProcedure.NAME, ShowCommitExtraMetadataProcedure.builder) + ,(ShowTablePropertiesProcedure.NAME, ShowTablePropertiesProcedure.builder) ) } } diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowTablePropertiesProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowTablePropertiesProcedure.scala new file mode 100644 index 00..d75df07fc9 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowTablePropertiesProcedure.scala @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.hudi.HoodieCLIUtils +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.spark.sql.Row +import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} + +import java.util +import java.util.function.Supplier +import scala.collection.JavaConversions._ + +class ShowTablePropertiesProcedure(
[hudi] branch master updated: [HUDI-5083]Fixed a bug when schema evolution (#7045)
This is an automated email from the ASF dual-hosted git repository. mengtao pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git The following commit(s) were added to refs/heads/master by this push: new b4563ddca5 [HUDI-5083]Fixed a bug when schema evolution (#7045) b4563ddca5 is described below commit b4563ddca566358be0897523decd29fe84ab332a Author: 申胜利 <48829688+shenshen...@users.noreply.github.com> AuthorDate: Sat Oct 29 09:24:16 2022 +0800 [HUDI-5083]Fixed a bug when schema evolution (#7045) --- .../java/org/apache/hudi/avro/HoodieAvroUtils.java | 4 +-- .../org/apache/spark/sql/hudi/TestSpark3DDL.scala | 38 ++ 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroUtils.java b/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroUtils.java index a352e86b96..5288f7fa0c 100644 --- a/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroUtils.java @@ -767,14 +767,14 @@ public class HoodieAvroUtils { Schema.Field field = fields.get(i); String fieldName = field.name(); fieldNames.push(fieldName); - if (oldSchema.getField(field.name()) != null) { + if (oldSchema.getField(field.name()) != null && !renameCols.containsKey(field.name())) { Schema.Field oldField = oldSchema.getField(field.name()); newRecord.put(i, rewriteRecordWithNewSchema(indexedRecord.get(oldField.pos()), oldField.schema(), fields.get(i).schema(), renameCols, fieldNames)); } else { String fieldFullName = createFullName(fieldNames); String fieldNameFromOldSchema = renameCols.getOrDefault(fieldFullName, ""); // deal with rename -if (oldSchema.getField(field.name()) == null && oldSchema.getField(fieldNameFromOldSchema) != null) { +if (oldSchema.getField(fieldNameFromOldSchema) != null) { // find rename Schema.Field oldField = oldSchema.getField(fieldNameFromOldSchema); newRecord.put(i, 
rewriteRecordWithNewSchema(indexedRecord.get(oldField.pos()), oldField.schema(), fields.get(i).schema(), renameCols, fieldNames)); diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestSpark3DDL.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestSpark3DDL.scala index 65357b903b..9d955cb831 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestSpark3DDL.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestSpark3DDL.scala @@ -385,6 +385,44 @@ class TestSpark3DDL extends HoodieSparkSqlTestBase { } } + test("Test Alter Table multiple times") { +withTempDir { tmp => + Seq("cow", "mor").foreach { tableType => +val tableName = generateTableName +val tablePath = s"${new Path(tmp.getCanonicalPath, tableName).toUri.toString}" +if (HoodieSparkUtils.gteqSpark3_1) { + spark.sql("set hoodie.schema.on.read.enable=true") + spark.sql( +s""" + |create table $tableName ( + | id int, + | col1 string, + | col2 string, + | ts long + |) using hudi + | location '$tablePath' + | options ( + | type = '$tableType', + | primaryKey = 'id', + | preCombineField = 'ts' + | ) + """.stripMargin) + spark.sql(s"show create table ${tableName}").show(false) + spark.sql(s"insert into ${tableName} values (1, 'aaa', 'bbb', 1000)") + + // Rename to a previously existing column name + insert + spark.sql(s"alter table ${tableName} drop column col1") + spark.sql(s"alter table ${tableName} rename column col2 to col1") + + spark.sql(s"insert into ${tableName} values (2, 'aaa', 1000)") + checkAnswer(spark.sql(s"select col1 from ${tableName} order by id").collect())( +Seq("bbb"), Seq("aaa") + ) +} + } +} + } + test("Test Alter Table complex") { withTempDir { tmp => Seq("cow", "mor").foreach { tableType =>
[hudi] branch master updated (c6e5fd0271 -> 48d23ff233)
This is an automated email from the ASF dual-hosted git repository. mengtao pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git from c6e5fd0271 [HUDI-3287] Remove hudi-spark dependencies from hudi-kafka-connect-bundle (#6079) add 48d23ff233 [HUDI-5000] Support schema evolution for Hive/presto (#6989) No new revisions were added by this update. Summary of changes: .../hudi/hadoop/HoodieParquetInputFormat.java | 3 + .../apache/hudi/hadoop/SchemaEvolutionContext.java | 353 + .../realtime/AbstractRealtimeRecordReader.java | 29 +- .../realtime/RealtimeCompactedRecordReader.java| 4 +- .../functional/TestHiveTableSchemaEvolution.java | 155 + packaging/hudi-hadoop-mr-bundle/pom.xml| 1 + packaging/hudi-presto-bundle/pom.xml | 1 + 7 files changed, 539 insertions(+), 7 deletions(-) create mode 100644 hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/SchemaEvolutionContext.java create mode 100644 hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestHiveTableSchemaEvolution.java
[hudi] branch master updated: [HUDI-5085]When a flink job has multiple sink tables, the index loading status is abnormal (#7051)
This is an automated email from the ASF dual-hosted git repository. mengtao pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git The following commit(s) were added to refs/heads/master by this push: new 22ccdc1540 [HUDI-5085]When a flink job has multiple sink tables, the index loading status is abnormal (#7051) 22ccdc1540 is described below commit 22ccdc154008e38b74a4361c2a5aedfaef56a87e Author: YangXiao <919869...@qq.com> AuthorDate: Wed Oct 26 16:44:26 2022 +0800 [HUDI-5085]When a flink job has multiple sink tables, the index loading status is abnormal (#7051) --- .../src/main/java/org/apache/hudi/sink/bootstrap/BootstrapOperator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bootstrap/BootstrapOperator.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bootstrap/BootstrapOperator.java index 10d46abc94..09250e3132 100644 --- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bootstrap/BootstrapOperator.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bootstrap/BootstrapOperator.java @@ -160,7 +160,7 @@ public class BootstrapOperator> int readyTaskNum = 1; while (taskNum != readyTaskNum) { try { -readyTaskNum = aggregateManager.updateGlobalAggregate(BootstrapAggFunction.NAME, taskID, new BootstrapAggFunction()); +readyTaskNum = aggregateManager.updateGlobalAggregate(BootstrapAggFunction.NAME + conf.getString(FlinkOptions.TABLE_NAME), taskID, new BootstrapAggFunction()); LOG.info("Waiting for other bootstrap tasks to complete, taskId = {}.", taskID); TimeUnit.SECONDS.sleep(5);
[hudi] branch master updated (488f58d770 -> 22d6019559)
This is an automated email from the ASF dual-hosted git repository. mengtao pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git from 488f58d770 [HUDI-4785] Fix partition discovery in bootstrap operation (#6673) add 22d6019559 [HUDI-4706] Fix InternalSchemaChangeApplier#applyAddChange error to add nest type (#6486) No new revisions were added by this update. Summary of changes: .../schema/action/InternalSchemaChangeApplier.java | 3 +- .../internal/schema/action/TableChangesHelper.java | 5 ++ .../internal/schema/action/TestTableChanges.java | 86 ++ 3 files changed, 93 insertions(+), 1 deletion(-)
[hudi] branch master updated (b228a2788a -> b6b515917c)
This is an automated email from the ASF dual-hosted git repository. mengtao pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git from b228a2788a [HUDI-4399][RFC-57] Protobuf support in DeltaStreamer (#6111) add b6b515917c [HUDI-4703] use the historical schema to response time travel query (#6499) No new revisions were added by this update. Summary of changes: .../hudi/common/table/TableSchemaResolver.java | 29 +++ .../scala/org/apache/hudi/HoodieBaseRelation.scala | 11 ++- .../hudi/functional/TestTimeTravelQuery.scala | 96 +++--- .../spark/sql/hudi/TestTimeTravelTable.scala | 59 + 4 files changed, 184 insertions(+), 11 deletions(-)
[hudi] branch master updated (4bda6afe0b -> ded197800a)
This is an automated email from the ASF dual-hosted git repository. mengtao pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git from 4bda6afe0b [HUDI-4249] Fixing in-memory `HoodieData` implementation to operate lazily (#5855) add ded197800a [HUDI-4170] Make user can use hoodie.datasource.read.paths to read necessary files (#5722) No new revisions were added by this update. Summary of changes: .../main/scala/org/apache/hudi/DefaultSource.scala | 16 +++- .../scala/org/apache/hudi/HoodieBaseRelation.scala | 44 +-- .../scala/org/apache/hudi/HoodieFileIndex.scala| 17 ++-- .../apache/hudi/MergeOnReadSnapshotRelation.scala | 20 + .../apache/hudi/functional/TestCOWDataSource.scala | 35 - .../apache/hudi/functional/TestMORDataSource.scala | 90 ++ 6 files changed, 186 insertions(+), 36 deletions(-)
[hudi] branch master updated (fc8d96246a -> b686c07407)
This is an automated email from the ASF dual-hosted git repository. mengtao pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git from fc8d96246a [HUDI-4335] Bug fixes in AWSGlueCatalogSyncClient post schema evolution. (#5995) add b686c07407 [HUDI-4276] Reconcile schema-inject null values for missing fields and add new fields (#6017) No new revisions were added by this update. Summary of changes: .../apache/hudi/client/BaseHoodieWriteClient.java | 17 +++-- .../table/action/commit/HoodieMergeHelper.java | 2 +- .../scala/org/apache/hudi/HoodieSparkUtils.scala | 9 +-- .../java/org/apache/hudi/avro/HoodieAvroUtils.java | 42 ++-- .../hudi/common/config/HoodieCommonConfig.java | 7 ++ .../table/log/AbstractHoodieLogRecordReader.java | 4 +- .../schema/action/InternalSchemaMerger.java| 10 ++- .../schema/utils/AvroSchemaEvolutionUtils.java | 74 ++-- .../internal/schema/utils/InternalSchemaUtils.java | 7 +- .../org/apache/hudi/avro/TestHoodieAvroUtils.java | 30 + .../schema/utils/TestAvroSchemaEvolutionUtils.java | 78 ++ .../scala/org/apache/hudi/DataSourceOptions.scala | 7 +- .../org/apache/hudi/HoodieSparkSqlWriter.scala | 24 +-- .../org/apache/hudi/TestHoodieSparkUtils.scala | 4 +- .../org/apache/spark/sql/hudi/TestSpark3DDL.scala | 68 ++- 15 files changed, 273 insertions(+), 110 deletions(-)
[hudi] branch master updated (3670e82af5 -> b18c32379f)
This is an automated email from the ASF dual-hosted git repository. mengtao pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git from 3670e82af5 [HUDI-4356] Fix the error when sync hive in CTAS (#6029) add b18c32379f [HUDI-4219] Merge Into when update expression "col=s.col+2" on precombine cause exception (#5828) No new revisions were added by this update. Summary of changes: .../hudi/command/MergeIntoHoodieTableCommand.scala | 40 - .../apache/spark/sql/hudi/TestMergeIntoTable.scala | 181 + 2 files changed, 215 insertions(+), 6 deletions(-)
[hudi] branch master updated (e8fbd4daf4 -> 360df576a9)
This is an automated email from the ASF dual-hosted git repository. mengtao pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git from e8fbd4daf4 [TEST][DO_NOT_MERGE]fix random failed for ci (#5948) add 360df576a9 Revert "[TEST][DO_NOT_MERGE]fix random failed for ci (#5948)" (#5971) No new revisions were added by this update. Summary of changes: .../src/main/scala/org/apache/hudi/BaseFileOnlyRelation.scala | 7 +++ 1 file changed, 3 insertions(+), 4 deletions(-)
[hudi] branch revert-5948-testflaky created (now 5cdf95a030)
This is an automated email from the ASF dual-hosted git repository. mengtao pushed a change to branch revert-5948-testflaky in repository https://gitbox.apache.org/repos/asf/hudi.git at 5cdf95a030 Revert "[TEST][DO_NOT_MERGE]fix random failed for ci (#5948)" This branch includes the following new commits: new 5cdf95a030 Revert "[TEST][DO_NOT_MERGE]fix random failed for ci (#5948)" The 1 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference.
[hudi] 01/01: Revert "[TEST][DO_NOT_MERGE]fix random failed for ci (#5948)"
This is an automated email from the ASF dual-hosted git repository. mengtao pushed a commit to branch revert-5948-testflaky in repository https://gitbox.apache.org/repos/asf/hudi.git commit 5cdf95a03006a0e7c9f923fa468774341e41c8e3 Author: xiarixiaoyao AuthorDate: Sat Jun 25 10:45:00 2022 +0800 Revert "[TEST][DO_NOT_MERGE]fix random failed for ci (#5948)" This reverts commit e8fbd4daf49802f60f800ccc92e66369d44f07f6. --- .../src/main/scala/org/apache/hudi/BaseFileOnlyRelation.scala | 7 +++ 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/BaseFileOnlyRelation.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/BaseFileOnlyRelation.scala index d6ec645920..4160c34b0c 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/BaseFileOnlyRelation.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/BaseFileOnlyRelation.scala @@ -166,9 +166,7 @@ class BaseFileOnlyRelation(sqlContext: SQLContext, DataSource.apply( sparkSession = sparkSession, paths = extraReadPaths, -// Here we should specify the schema to the latest commit schema since -// the table schema evolution. -userSpecifiedSchema = userSchema.orElse(Some(tableStructSchema)), +userSpecifiedSchema = userSchema, className = formatClassName, // Since we're reading the table as just collection of files we have to make sure // we only read the latest version of every Hudi's file-group, which might be compacted, clustered, etc. @@ -177,7 +175,8 @@ class BaseFileOnlyRelation(sqlContext: SQLContext, // We rely on [[HoodieROTablePathFilter]], to do proper filtering to assure that options = optParams ++ Map( "mapreduce.input.pathFilter.class" -> classOf[HoodieROTablePathFilter].getName -) +), +partitionColumns = partitionColumns ) .resolveRelation() .asInstanceOf[HadoopFsRelation]
[hudi] branch master updated (eeafaeacd2 -> e8fbd4daf4)
This is an automated email from the ASF dual-hosted git repository. mengtao pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git from eeafaeacd2 [HUDI-3512] Add call procedure for StatsCommand (#5955) add e8fbd4daf4 [TEST][DO_NOT_MERGE]fix random failed for ci (#5948) No new revisions were added by this update. Summary of changes: .../src/main/scala/org/apache/hudi/BaseFileOnlyRelation.scala | 7 --- 1 file changed, 4 insertions(+), 3 deletions(-)
[hudi] branch master updated: [HUDI-3508] Add call procedure for FileSystemViewCommand (#5929)
This is an automated email from the ASF dual-hosted git repository. mengtao pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git The following commit(s) were added to refs/heads/master by this push: new 1bb017d396 [HUDI-3508] Add call procedure for FileSystemViewCommand (#5929) 1bb017d396 is described below commit 1bb017d396562b891065187bac417c52edea24da Author: jiz <31836510+microbe...@users.noreply.github.com> AuthorDate: Wed Jun 22 17:50:20 2022 +0800 [HUDI-3508] Add call procedure for FileSystemViewCommand (#5929) * [HUDI-3508] Add call procedure for FileSystemView * minor Co-authored-by: jiimmyzhan --- .../hudi/command/procedures/HoodieProcedures.scala | 2 + .../procedures/ShowFileSystemViewProcedure.scala | 258 + .../sql/hudi/procedure/TestFsViewProcedure.scala | 95 3 files changed, 355 insertions(+) diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/HoodieProcedures.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/HoodieProcedures.scala index ff129964fa..d974e216d5 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/HoodieProcedures.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/HoodieProcedures.scala @@ -48,6 +48,8 @@ object HoodieProcedures { mapBuilder.put(ShowRollbacksProcedure.NAME, ShowRollbacksProcedure.builder) mapBuilder.put(ShowRollbackDetailProcedure.NAME, ShowRollbackDetailProcedure.builder) mapBuilder.put(ExportInstantsProcedure.NAME, ExportInstantsProcedure.builder) +mapBuilder.put(ShowAllFileSystemViewProcedure.NAME, ShowAllFileSystemViewProcedure.builder) +mapBuilder.put(ShowLatestFileSystemViewProcedure.NAME, ShowLatestFileSystemViewProcedure.builder) mapBuilder.build } } diff --git 
a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowFileSystemViewProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowFileSystemViewProcedure.scala new file mode 100644 index 00..8c861cf0f6 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowFileSystemViewProcedure.scala @@ -0,0 +1,258 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hudi.command.procedures + +import com.google.common.collect.Lists +import org.apache.hadoop.fs.{FileStatus, Path} +import org.apache.hudi.common.fs.FSUtils +import org.apache.hudi.common.model.{FileSlice, HoodieLogFile} +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.hudi.common.table.timeline.{HoodieDefaultTimeline, HoodieInstant, HoodieTimeline} +import org.apache.hudi.common.table.view.HoodieTableFileSystemView +import org.apache.hudi.common.util +import org.apache.spark.sql.Row +import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} + +import java.util.function.{Function, Supplier} +import java.util.stream.Collectors +import scala.collection.JavaConverters.{asJavaIteratorConverter, asScalaIteratorConverter} + +class ShowFileSystemViewProcedure(showLatest: Boolean) extends BaseProcedure with ProcedureBuilder { + private val PARAMETERS_ALL: Array[ProcedureParameter] = Array[ProcedureParameter]( +ProcedureParameter.required(0, "table", DataTypes.StringType, None), +ProcedureParameter.optional(1, "max_instant", DataTypes.StringType, ""), +ProcedureParameter.optional(2, "include_max", DataTypes.BooleanType, false), +ProcedureParameter.optional(3, "include_inflight", DataTypes.BooleanType, false), +ProcedureParameter.optional(4, "exclude_compaction", DataTypes.BooleanType, false), +ProcedureParameter.optional(5, "limit", DataTypes.IntegerType, 10), +ProcedureParameter.optional(6, "path_regex", DataTypes.StringType, "*/*/*"
[hudi] branch master updated: [HUDI-3507] Support export command based on Call Produce Command (#5901)
This is an automated email from the ASF dual-hosted git repository. mengtao pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git The following commit(s) were added to refs/heads/master by this push: new c5c4cfec91 [HUDI-3507] Support export command based on Call Produce Command (#5901) c5c4cfec91 is described below commit c5c4cfec915fad198ccb57d00bc8d875a78e794b Author: ForwardXu AuthorDate: Sun Jun 19 18:48:22 2022 +0800 [HUDI-3507] Support export command based on Call Produce Command (#5901) --- .../apache/hudi/cli/commands/ExportCommand.java| 36 ++-- .../procedures/ExportInstantsProcedure.scala | 239 + .../hudi/command/procedures/HoodieProcedures.scala | 1 + .../procedure/TestExportInstantsProcedure.scala| 52 + 4 files changed, 309 insertions(+), 19 deletions(-) diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/ExportCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/ExportCommand.java index fa6e15b7af..95e7caa8a9 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/ExportCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/ExportCommand.java @@ -18,6 +18,12 @@ package org.apache.hudi.cli.commands; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.avro.specific.SpecificData; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.avro.model.HoodieArchivedMetaEntry; import org.apache.hudi.avro.model.HoodieCleanMetadata; @@ -36,14 +42,6 @@ import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; import org.apache.hudi.common.util.ClosableIterator; import org.apache.hudi.exception.HoodieException; - -import org.apache.avro.generic.GenericRecord; -import org.apache.avro.generic.IndexedRecord; -import 
org.apache.avro.specific.SpecificData; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.Path; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; import org.springframework.shell.core.CommandMarker; import org.springframework.shell.core.annotation.CliCommand; import org.springframework.shell.core.annotation.CliOption; @@ -60,10 +58,10 @@ import java.util.stream.Collectors; /** * CLI commands to export various information from a HUDI dataset. - * + * * "export instants": Export Instants and their metadata from the Timeline to a local - *directory specified by the parameter --localFolder - * The instants are exported in the json format. + * directory specified by the parameter --localFolder + * The instants are exported in the json format. */ @Component public class ExportCommand implements CommandMarker { @@ -83,7 +81,7 @@ public class ExportCommand implements CommandMarker { int numExports = limit == -1 ? Integer.MAX_VALUE : limit; int numCopied = 0; -if (! 
new File(localFolder).isDirectory()) { +if (!new File(localFolder).isDirectory()) { throw new HoodieException(localFolder + " is not a valid local directory"); } @@ -94,7 +92,7 @@ public class ExportCommand implements CommandMarker { // Archived instants are in the commit archive files FileStatus[] statuses = FSUtils.getFs(basePath, HoodieCLI.conf).globStatus(archivePath); -List archivedStatuses = Arrays.stream(statuses).sorted((f1, f2) -> (int)(f1.getModificationTime() - f2.getModificationTime())).collect(Collectors.toList()); +List archivedStatuses = Arrays.stream(statuses).sorted((f1, f2) -> (int) (f1.getModificationTime() - f2.getModificationTime())).collect(Collectors.toList()); if (descending) { Collections.reverse(nonArchivedInstants); @@ -115,11 +113,11 @@ public class ExportCommand implements CommandMarker { private int copyArchivedInstants(List statuses, Set actionSet, int limit, String localFolder) throws Exception { int copyCount = 0; +FileSystem fileSystem = FSUtils.getFs(HoodieCLI.getTableMetaClient().getBasePath(), HoodieCLI.conf); for (FileStatus fs : statuses) { // read the archived file - Reader reader = HoodieLogFormat.newReader(FSUtils.getFs(HoodieCLI.getTableMetaClient().getBasePath(), HoodieCLI.conf), - new HoodieLogFile(fs.getPath()), HoodieArchivedMetaEntry.getClassSchema()); + Reader reader = HoodieLogFormat.newReader(fileSystem, new HoodieLogFile(fs.getPath()), HoodieArchivedMetaEntry.getClassSchema()); // read the avro blocks while (reader.hasNext() && copyCount < limit) { @@ -130,7 +128,7 @@ public class ExportCommand implements CommandMarker { // Archived instants are saved as arvo encoded HoodieArchived
[hudi] branch master updated: [HUDI-3499] Add Call Procedure for show rollbacks (#5848)
This is an automated email from the ASF dual-hosted git repository. mengtao pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git The following commit(s) were added to refs/heads/master by this push: new 7b946cf351 [HUDI-3499] Add Call Procedure for show rollbacks (#5848) 7b946cf351 is described below commit 7b946cf35142139ab90320d943c90d905722989f Author: superche <73096722+hechao-u...@users.noreply.github.com> AuthorDate: Wed Jun 15 16:50:15 2022 +0800 [HUDI-3499] Add Call Procedure for show rollbacks (#5848) * Add Call Procedure for show rollbacks * fix * add ut for show_rollback_detail and exception handle Co-authored-by: superche --- .../hudi/command/procedures/HoodieProcedures.scala | 2 + .../procedures/ShowRollbacksProcedure.scala| 147 + .../sql/hudi/procedure/TestCallProcedure.scala | 105 +++ 3 files changed, 254 insertions(+) diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/HoodieProcedures.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/HoodieProcedures.scala index 2b720bb94d..7cfeaaa0b6 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/HoodieProcedures.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/HoodieProcedures.scala @@ -45,6 +45,8 @@ object HoodieProcedures { mapBuilder.put(ShowCommitsMetadataProcedure.NAME, ShowCommitsMetadataProcedure.builder) mapBuilder.put(ShowSavepointsProcedure.NAME, ShowSavepointsProcedure.builder) mapBuilder.put(DeleteMarkerProcedure.NAME, DeleteMarkerProcedure.builder) +mapBuilder.put(ShowRollbacksProcedure.NAME, ShowRollbacksProcedure.builder) +mapBuilder.put(ShowRollbackDetailProcedure.NAME, ShowRollbackDetailProcedure.builder) mapBuilder.build } } diff --git 
a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowRollbacksProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowRollbacksProcedure.scala new file mode 100644 index 00..e5cacdb062 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowRollbacksProcedure.scala @@ -0,0 +1,147 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hudi.command.procedures + +import java.io.IOException +import java.util +import java.util.function.Supplier + +import org.apache.hudi.avro.model.HoodieRollbackMetadata +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.hudi.common.table.timeline.HoodieInstant.State +import org.apache.hudi.common.table.timeline.HoodieTimeline.ROLLBACK_ACTION +import org.apache.hudi.common.table.timeline.{HoodieActiveTimeline, HoodieInstant, HoodieTimeline, TimelineMetadataUtils} +import org.apache.hudi.common.util.CollectionUtils +import org.apache.hudi.exception.HoodieException +import org.apache.spark.sql.Row +import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} + +import scala.collection.JavaConversions.asScalaBuffer +import scala.collection.JavaConverters._ + +class ShowRollbacksProcedure(showDetails: Boolean) extends BaseProcedure with ProcedureBuilder { + private val ROLLBACKS_PARAMETERS = Array[ProcedureParameter]( +ProcedureParameter.required(0, "table", DataTypes.StringType, None), +ProcedureParameter.optional(1, "limit", DataTypes.IntegerType, 10) + ) + + private val ROLLBACK_PARAMETERS = Array[ProcedureParameter]( +ProcedureParameter.required(0, "table", DataTypes.StringType, None), +ProcedureParameter.optional(1, "limit", DataTypes.IntegerType, 10), +ProcedureParameter.required(2, "instant_time", DataTypes.StringType, None) + ) + + private val ROLLBACKS_OUTPUT_TYPE = new StructType(Array[StructField]( +StructField("instant", DataTypes.StringType, nullab
[hudi] branch master updated: [HUDI-4192] HoodieHFileReader scan top cells after bottom cells throw NullPointerException (#5755)
This is an automated email from the ASF dual-hosted git repository. mengtao pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git The following commit(s) were added to refs/heads/master by this push: new 73b0be3c96 [HUDI-4192] HoodieHFileReader scan top cells after bottom cells throw NullPointerException (#5755) 73b0be3c96 is described below commit 73b0be3c962112efe541ae04fe0ea6f298558f17 Author: marchpure AuthorDate: Mon Jun 6 12:07:26 2022 +0800 [HUDI-4192] HoodieHFileReader scan top cells after bottom cells throw NullPointerException (#5755) SeekTo top cells avoid NullPointerException --- .../io/storage/TestHoodieHFileReaderWriter.java| 32 ++ .../apache/hudi/io/storage/HoodieHFileReader.java | 6 2 files changed, 38 insertions(+) diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieHFileReaderWriter.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieHFileReaderWriter.java index da6f717258..baede154c9 100644 --- a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieHFileReaderWriter.java +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieHFileReaderWriter.java @@ -294,6 +294,38 @@ public class TestHoodieHFileReaderWriter extends TestHoodieReaderWriterBase { StreamSupport.stream(Spliterators.spliteratorUnknownSize(iterator, Spliterator.ORDERED), false) .collect(Collectors.toList()); assertEquals(Collections.emptyList(), recordsByPrefix); + +// filter for "key50" and "key1" : entries from key50 and 'key10 to key19' should be matched. 
+List expectedKey50and1s = allRecords.stream().filter(entry -> (entry.get("_row_key").toString()).contains("key1") +|| (entry.get("_row_key").toString()).contains("key50")).collect(Collectors.toList()); +iterator = +hfileReader.getRecordsByKeyPrefixIterator(Arrays.asList("key50", "key1"), avroSchema); +recordsByPrefix = +StreamSupport.stream(Spliterators.spliteratorUnknownSize(iterator, Spliterator.ORDERED), false) +.collect(Collectors.toList()); +assertEquals(expectedKey50and1s, recordsByPrefix); + +// filter for "key50" and "key0" : entries from key50 and 'key00 to key09' should be matched. +List expectedKey50and0s = allRecords.stream().filter(entry -> (entry.get("_row_key").toString()).contains("key0") +|| (entry.get("_row_key").toString()).contains("key50")).collect(Collectors.toList()); +iterator = +hfileReader.getRecordsByKeyPrefixIterator(Arrays.asList("key50", "key0"), avroSchema); +recordsByPrefix = +StreamSupport.stream(Spliterators.spliteratorUnknownSize(iterator, Spliterator.ORDERED), false) +.collect(Collectors.toList()); +assertEquals(expectedKey50and0s, recordsByPrefix); + +// filter for "key1" and "key0" : entries from 'key10 to key19' and 'key00 to key09' should be matched. 
+List expectedKey1sand0s = expectedKey1s; +expectedKey1sand0s.addAll(allRecords.stream() +.filter(entry -> (entry.get("_row_key").toString()).contains("key0")) +.collect(Collectors.toList())); +iterator = +hfileReader.getRecordsByKeyPrefixIterator(Arrays.asList("key1", "key0"), avroSchema); +recordsByPrefix = +StreamSupport.stream(Spliterators.spliteratorUnknownSize(iterator, Spliterator.ORDERED), false) +.collect(Collectors.toList()); +assertEquals(expectedKey1sand0s, recordsByPrefix); } @ParameterizedTest diff --git a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileReader.java b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileReader.java index 899c2475da..0bf31d2a25 100644 --- a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileReader.java +++ b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileReader.java @@ -258,6 +258,12 @@ public class HoodieHFileReader implements HoodieFileRea if (!scanner.next()) { return Collections.emptyIterator(); } +} else if (val == -1) { + // If scanner is aleady on the top of hfile. avoid trigger seekTo again. + Option headerCell = Option.fromJavaOptional(scanner.getReader().getFirstKey()); + if (headerCell.isPresent() && !headerCell.get().equals(scanner.getCell())) { +scanner.seekTo(); + } } class KeyPrefixIterator implements Iterator {
[hudi] branch master updated: [HUDI-4162] Fixed some constant mapping issues. (#5700)
This is an automated email from the ASF dual-hosted git repository. mengtao pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git The following commit(s) were added to refs/heads/master by this push: new 57dbe57bed [HUDI-4162] Fixed some constant mapping issues. (#5700) 57dbe57bed is described below commit 57dbe57bed963f529b2921b6bb6253a90a1a45af Author: watermelon12138 <49849410+watermelon12...@users.noreply.github.com> AuthorDate: Fri May 27 14:08:54 2022 +0800 [HUDI-4162] Fixed some constant mapping issues. (#5700) Co-authored-by: y00617041 --- .../src/main/scala/org/apache/hudi/DataSourceOptions.scala| 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala index ac4d0e5794..a62a402b6a 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala @@ -540,13 +540,13 @@ object DataSourceWriteOptions { val HIVE_SYNC_ENABLED_OPT_KEY = HiveSyncConfig.HIVE_SYNC_ENABLED.key() /** @deprecated Use {@link META_SYNC_ENABLED} and its methods instead */ @Deprecated - val META_SYNC_ENABLED_OPT_KEY = HoodieSyncConfig.META_SYNC_DATABASE_NAME.key() + val META_SYNC_ENABLED_OPT_KEY = HoodieSyncConfig.META_SYNC_ENABLED.key() /** @deprecated Use {@link HIVE_DATABASE} and its methods instead */ @Deprecated val HIVE_DATABASE_OPT_KEY = HoodieSyncConfig.META_SYNC_DATABASE_NAME.key() /** @deprecated Use {@link HIVE_TABLE} and its methods instead */ @Deprecated - val HIVE_TABLE_OPT_KEY = HoodieSyncConfig.META_SYNC_DATABASE_NAME.key() + val HIVE_TABLE_OPT_KEY = HoodieSyncConfig.META_SYNC_TABLE_NAME.key() /** @deprecated Use {@link HIVE_BASE_FILE_FORMAT} and its methods instead */ @Deprecated val 
HIVE_BASE_FILE_FORMAT_OPT_KEY = HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT.key()
[hudi] branch master updated (924e2e96a6 -> cacbd98687)
This is an automated email from the ASF dual-hosted git repository. mengtao pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git from 924e2e96a6 Claim RFC 52 for Introduce Secondary Index to Improve HUDI Query Performance (#5441) add cacbd98687 [HUDI-3945] After the async compaction operation is complete, the task should exit. (#5391) No new revisions were added by this update. Summary of changes: .../src/main/java/org/apache/hudi/utilities/HoodieCompactor.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
[hudi] branch master updated: [HUDI-3781] fix spark delete sql can not delete record (#5215)
This is an automated email from the ASF dual-hosted git repository. mengtao pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git The following commit(s) were added to refs/heads/master by this push: new 7a6272fba1 [HUDI-3781] fix spark delete sql can not delete record (#5215) 7a6272fba1 is described below commit 7a6272fba150fa9d7acd0b57cc4041ec49019faf Author: KnightChess <981159...@qq.com> AuthorDate: Fri Apr 8 14:26:40 2022 +0800 [HUDI-3781] fix spark delete sql can not delete record (#5215) --- .../spark/sql/hudi/ProvidesHoodieConfig.scala | 5 ++- .../apache/spark/sql/hudi/TestDeleteTable.scala| 47 ++ 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/ProvidesHoodieConfig.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/ProvidesHoodieConfig.scala index 74255473b5..31fb0ad6cb 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/ProvidesHoodieConfig.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/ProvidesHoodieConfig.scala @@ -255,7 +255,10 @@ trait ProvidesHoodieConfig extends Logging { val hoodieProps = getHoodieProps(catalogProperties, tableConfig, sparkSession.sqlContext.conf) val hiveSyncConfig = buildHiveSyncConfig(hoodieProps, hoodieCatalogTable) -withSparkConf(sparkSession, hoodieCatalogTable.catalogProperties) { +// operation can not be overwrite +val options = hoodieCatalogTable.catalogProperties.-(OPERATION.key()) + +withSparkConf(sparkSession, options) { Map( "path" -> path, RECORDKEY_FIELD.key -> hoodieCatalogTable.primaryKeys.mkString(","), diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestDeleteTable.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestDeleteTable.scala index f005a14d7f..9c693f9626 100644 --- 
a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestDeleteTable.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestDeleteTable.scala @@ -151,4 +151,51 @@ class TestDeleteTable extends TestHoodieSqlBase { } } } + + test("Test Delete Table with op upsert") { +withTempDir { tmp => + Seq("cow", "mor").foreach {tableType => +val tableName = generateTableName +// create table +spark.sql( + s""" + |create table $tableName ( + | id int, + | name string, + | price double, + | ts long + |) using hudi + | location '${tmp.getCanonicalPath}/$tableName' + | tblproperties ( + | type = '$tableType', + | primaryKey = 'id', + | preCombineField = 'ts', + | hoodie.datasource.write.operation = 'upsert' + | ) + """.stripMargin) +// insert data to table +spark.sql(s"insert into $tableName select 1, 'a1', 10, 1000") +checkAnswer(s"select id, name, price, ts from $tableName")( + Seq(1, "a1", 10.0, 1000) +) + +// delete data from table +spark.sql(s"delete from $tableName where id = 1") +checkAnswer(s"select count(1) from $tableName") ( + Seq(0) +) + +spark.sql(s"insert into $tableName select 2, 'a2', 10, 1000") +spark.sql(s"delete from $tableName where id = 1") +checkAnswer(s"select id, name, price, ts from $tableName")( + Seq(2, "a2", 10.0, 1000) +) + +spark.sql(s"delete from $tableName") +checkAnswer(s"select count(1) from $tableName")( + Seq(0) +) + } +} + } }
[hudi] branch master updated (b28f0d6ceb -> 3449e86989)
This is an automated email from the ASF dual-hosted git repository. mengtao pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git from b28f0d6ceb [HUDI-3290] Different file formats for the partition metadata file. (#5179) add 3449e86989 [HUDI-3780] improve drop partitions (#5178) No new revisions were added by this update. Summary of changes: .../hudi/aws/sync/AWSGlueCatalogSyncClient.java| 2 +- .../hudi/common/table/TableSchemaResolver.java | 19 --- .../hudi/metadata/HoodieTableMetadataUtil.java | 18 +++ .../gcp/bigquery/HoodieBigQuerySyncClient.java | 12 +- .../sql/catalyst/catalog/HoodieCatalogTable.scala | 5 + .../spark/sql/hudi/ProvidesHoodieConfig.scala | 173 + .../AlterHoodieTableDropPartitionCommand.scala | 41 + .../hudi/command/DeleteHoodieTableCommand.scala| 44 +- .../hudi/command/MergeIntoHoodieTableCommand.scala | 24 +-- .../hudi/sync/datahub/DataHubSyncClient.java | 4 +- .../java/org/apache/hudi/dla/HoodieDLAClient.java | 4 +- .../java/org/apache/hudi/hive/HiveSyncTool.java| 2 +- .../org/apache/hudi/hive/HoodieHiveClient.java | 2 +- .../hudi/sync/common/AbstractSyncHoodieClient.java | 26 ++-- 14 files changed, 216 insertions(+), 160 deletions(-)
[hudi] branch master updated (a048e94 -> 98b4e97)
This is an automated email from the ASF dual-hosted git repository. mengtao pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git. from a048e94 [HUDI-3743] Support DELETE_PARTITION for metadata table (#5169) add 98b4e97 [HUDI-3406] Rollback incorrectly relying on FS listing instead of Com… (#4957) No new revisions were added by this update. Summary of changes: .../hudi/client/utils/MetadataConversionUtils.java | 17 +- .../table/action/rollback/BaseRollbackHelper.java | 5 - .../rollback/ListingBasedRollbackHelper.java | 150 -- .../rollback/ListingBasedRollbackStrategy.java | 302 +++-- .../hudi/table/action/rollback/RollbackUtils.java | 167 .../SerializablePathFilter.java} | 9 +- .../table/upgrade/ZeroToOneUpgradeHandler.java | 34 +-- .../TestMergeOnReadRollbackActionExecutor.java | 5 +- .../hudi/common/model/HoodieCommitMetadata.java| 13 + 9 files changed, 329 insertions(+), 373 deletions(-) delete mode 100644 hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/ListingBasedRollbackHelper.java copy hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/{commit/Partitioner.java => rollback/SerializablePathFilter.java} (83%)
[hudi] branch master updated (73a21092 -> 7889c78)
This is an automated email from the ASF dual-hosted git repository. mengtao pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git. from 73a21092 [HUDI-3732] Fixing rollback validation (#5157) add 7889c78 [HUDI-3729][SPARK] fixed the per regression by enable vectorizeReader for parquet file (#5168) No new revisions were added by this update. Summary of changes: .../org/apache/hudi/BaseFileOnlyRelation.scala | 5 + .../scala/org/apache/hudi/HoodieBaseRelation.scala | 2 +- .../hudi/MergeOnReadIncrementalRelation.scala | 5 + .../apache/hudi/MergeOnReadSnapshotRelation.scala | 5 + .../spark/hudi/benchmark/HoodieBenchmarkBase.scala | 9 ++ .../benchmark/CowTableReadBenchmark.scala | 136 + 6 files changed, 161 insertions(+), 1 deletion(-) create mode 100644 hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/execution/benchmark/CowTableReadBenchmark.scala
[hudi] branch master updated (d31cde2 -> 1d0f4cc)
This is an automated email from the ASF dual-hosted git repository. mengtao pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git. from d31cde2 [MINOR] Fix call command parser use spark3.2 (#5144) add 1d0f4cc [HUDI-3538] Support Compaction Command Based on Call Procedure Command for Spark SQL (#4945) No new revisions were added by this update. Summary of changes: .../hudi/command/CompactionHoodiePathCommand.scala | 92 +++-- .../command/CompactionHoodieTableCommand.scala | 4 +- .../command/CompactionShowHoodiePathCommand.scala | 31 ++--- .../command/CompactionShowHoodieTableCommand.scala | 1 + .../command/procedures/HoodieProcedureUtils.scala | 33 +++-- .../hudi/command/procedures/HoodieProcedures.scala | 2 + .../procedures/RunCompactionProcedure.scala| 144 + ...ocedure.scala => ShowCompactionProcedure.scala} | 41 -- ...ocedure.scala => TestClusteringProcedure.scala} | 3 +- .../TestCompactionProcedure.scala} | 68 +- 10 files changed, 266 insertions(+), 153 deletions(-) copy hudi-common/src/main/java/org/apache/hudi/common/data/HoodieAccumulator.java => hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/HoodieProcedureUtils.scala (54%) create mode 100644 hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RunCompactionProcedure.scala copy hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/{ShowClusteringProcedure.scala => ShowCompactionProcedure.scala} (61%) rename hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/procedure/{TestRunClusteringProcedure.scala => TestClusteringProcedure.scala} (99%) copy hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/{TestCompactionTable.scala => procedure/TestCompactionProcedure.scala} (54%)
[hudi] branch master updated (465d553 -> 4b75cb6)
This is an automated email from the ASF dual-hosted git repository. mengtao pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git. from 465d553 [HUDI-3600] Tweak the default cleaning strategy to be more streaming friendly for flink (#5010) add 4b75cb6 fix NPE when run schdule using spark-sql if the commits time < hoodie.compact.inline.max.delta.commits (#4976) No new revisions were added by this update. Summary of changes: .../spark/sql/hudi/command/CompactionHoodiePathCommand.scala | 10 +++--- 1 file changed, 7 insertions(+), 3 deletions(-)
[hudi] branch master updated (548000b -> 8859b48)
This is an automated email from the ASF dual-hosted git repository. mengtao pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git. from 548000b [HUDI-3568] Introduce ChainedSchemaPostProcessor to support setting multi processors at once (#4968) add 8859b48 [HUDI-3383] Sync column comments while syncing a hive table (#4960) No new revisions were added by this update. Summary of changes: .../org/apache/hudi/AvroConversionUtils.scala | 29 +-- .../{simple-test.avsc => simple-test-doced.avsc} | 4 +- .../main/java/org/apache/hudi/DataSourceUtils.java | 2 + .../scala/org/apache/hudi/DataSourceOptions.scala | 5 + .../org/apache/hudi/HoodieSparkSqlWriter.scala | 1 + .../org/apache/hudi/TestAvroConversionUtils.scala | 216 + .../java/org/apache/hudi/hive/HiveSyncConfig.java | 5 + .../java/org/apache/hudi/hive/HiveSyncTool.java| 16 ++ .../org/apache/hudi/hive/HoodieHiveClient.java | 51 - .../java/org/apache/hudi/hive/ddl/DDLExecutor.java | 10 + .../org/apache/hudi/hive/ddl/HMSDDLExecutor.java | 24 ++- .../hudi/hive/ddl/QueryBasedDDLExecutor.java | 19 ++ .../org/apache/hudi/hive/TestHiveSyncTool.java | 76 13 files changed, 442 insertions(+), 16 deletions(-) copy hudi-common/src/test/resources/{simple-test.avsc => simple-test-doced.avsc} (86%)
[hudi] branch master updated (be9a264 -> 62f534d)
This is an automated email from the ASF dual-hosted git repository. mengtao pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git. from be9a264 [HUDI-3548] Fix if user specify key "hoodie.datasource.clustering.async.enable" directly, async clustering not work (#4905) add 62f534d [HUDI-3445] Support Clustering Command Based on Call Procedure Command for Spark SQL (#4901) No new revisions were added by this update. Summary of changes: .../apache/hudi/config/HoodieClusteringConfig.java | 12 + .../org/apache/hudi/config/HoodieWriteConfig.java | 4 + .../cluster/ClusteringPlanPartitionFilter.java | 16 +- .../PartitionAwareClusteringPlanStrategy.java | 14 +- .../TestPartitionAwareClusteringPlanStrategy.java | 2 +- .../org/apache/hudi/BaseHoodieTableFileIndex.java | 11 +- .../scala/org/apache/hudi/HoodieCommonUtils.scala | 286 + .../scala/org/apache/hudi/HoodieFileIndex.scala| 11 +- .../apache/hudi/SparkHoodieTableFileIndex.scala| 182 + .../org/apache/hudi/HoodieDataSourceHelpers.java | 17 ++ .../hudi/command/CompactionHoodiePathCommand.scala | 20 +- .../hudi/command/procedures/BaseProcedure.scala| 56 +++- .../hudi/command/procedures/HoodieProcedures.scala | 2 + .../RollbackToInstantTimeProcedure.scala | 4 +- .../procedures/RunClusteringProcedure.scala| 176 + .../procedures/ShowClusteringProcedure.scala | 69 + .../command/procedures/ShowCommitsProcedure.scala | 4 +- .../sql/parser/HoodieSqlCommonAstBuilder.scala | 14 +- .../org/apache/hudi/TestDataSkippingUtils.scala| 29 +-- .../apache/spark/sql/hudi/TestCallProcedure.scala | 225 20 files changed, 908 insertions(+), 246 deletions(-) create mode 100644 hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieCommonUtils.scala create mode 100644 hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RunClusteringProcedure.scala create mode 100644 
hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowClusteringProcedure.scala
[hudi] branch master updated (e78b2f1 -> c0e8b03)
This is an automated email from the ASF dual-hosted git repository. mengtao pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git. from e78b2f1 [HUDI-2943] Complete pending clustering before deltastreamer sync (#4572) add c0e8b03 [HUDI-1977] Fix Hudi CLI tempview query issue (#4626) No new revisions were added by this update. Summary of changes: .../hudi/cli/utils/SparkTempViewProvider.java | 25 +- 1 file changed, 15 insertions(+), 10 deletions(-)
[hudi] branch master updated (518488c -> 4f6cdd7)
This is an automated email from the ASF dual-hosted git repository. mengtao pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git. from 518488c [HUDI-3185] HoodieConfig#getBoolean should return false when default not set (#4536) add 4f6cdd7 [HUDI-3192] Spark metastore schema evolution broken (#4533) No new revisions were added by this update. Summary of changes: .../src/main/java/org/apache/hudi/hive/HiveSyncTool.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
[hudi] branch master updated (37b15ff -> a66212d)
This is an automated email from the ASF dual-hosted git repository. mengtao pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git. from 37b15ff [HUDI-3147] Add endpoint_url to dynamodb lock provider (#4500) add a66212d [HUDI-2966] Closing LogRecordScanner in compactor (#4478) No new revisions were added by this update. Summary of changes: .../apache/hudi/hadoop/realtime/RealtimeCompactedRecordReader.java | 7 --- 1 file changed, 4 insertions(+), 3 deletions(-)
[hudi] branch master updated: [HUDI-3107]Fix HiveSyncTool drop partitions using JDBC or hivesql or hms (#4453)
This is an automated email from the ASF dual-hosted git repository. mengtao pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git The following commit(s) were added to refs/heads/master by this push: new ef9923f [HUDI-3107]Fix HiveSyncTool drop partitions using JDBC or hivesql or hms (#4453) ef9923f is described below commit ef9923fc5551851ec4ee71896a62c0615da45ee8 Author: YueZhang <69956021+zhangyue19921...@users.noreply.github.com> AuthorDate: Fri Dec 31 15:56:33 2021 +0800 [HUDI-3107]Fix HiveSyncTool drop partitions using JDBC or hivesql or hms (#4453) * constructDropPartitions when drop partitions using jdbc * done * done * code style * code review Co-authored-by: yuezhang --- .../org/apache/hudi/hive/ddl/HMSDDLExecutor.java | 4 +- .../apache/hudi/hive/ddl/HiveQueryDDLExecutor.java | 4 +- .../org/apache/hudi/hive/ddl/JDBCExecutor.java | 51 -- .../hudi/hive/ddl/QueryBasedDDLExecutor.java | 4 +- .../apache/hudi/hive/util/HivePartitionUtil.java | 51 ++ .../org/apache/hudi/hive/TestHiveSyncTool.java | 42 ++ .../apache/hudi/hive/testutils/HiveTestUtil.java | 21 + 7 files changed, 169 insertions(+), 8 deletions(-) diff --git a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/ddl/HMSDDLExecutor.java b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/ddl/HMSDDLExecutor.java index d3efebe..c3c5226 100644 --- a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/ddl/HMSDDLExecutor.java +++ b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/ddl/HMSDDLExecutor.java @@ -24,6 +24,7 @@ import org.apache.hudi.common.fs.StorageSchemes; import org.apache.hudi.hive.HiveSyncConfig; import org.apache.hudi.hive.HoodieHiveSyncException; import org.apache.hudi.hive.PartitionValueExtractor; +import org.apache.hudi.hive.util.HivePartitionUtil; import org.apache.hudi.hive.util.HiveSchemaUtil; import org.apache.hadoop.fs.FileSystem; @@ -236,7 +237,8 @@ public class HMSDDLExecutor implements DDLExecutor { 
LOG.info("Drop partitions " + partitionsToDrop.size() + " on " + tableName); try { for (String dropPartition : partitionsToDrop) { -client.dropPartition(syncConfig.databaseName, tableName, dropPartition, false); +String partitionClause = HivePartitionUtil.getPartitionClauseForDrop(dropPartition, partitionValueExtractor, syncConfig); +client.dropPartition(syncConfig.databaseName, tableName, partitionClause, false); LOG.info("Drop partition " + dropPartition + " on " + tableName); } } catch (TException e) { diff --git a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/ddl/HiveQueryDDLExecutor.java b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/ddl/HiveQueryDDLExecutor.java index 7161194..a4debfb 100644 --- a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/ddl/HiveQueryDDLExecutor.java +++ b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/ddl/HiveQueryDDLExecutor.java @@ -34,6 +34,7 @@ import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse; import org.apache.hadoop.hive.ql.session.SessionState; import org.apache.hadoop.security.UserGroupInformation; +import org.apache.hudi.hive.util.HivePartitionUtil; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -136,7 +137,8 @@ public class HiveQueryDDLExecutor extends QueryBasedDDLExecutor { LOG.info("Drop partitions " + partitionsToDrop.size() + " on " + tableName); try { for (String dropPartition : partitionsToDrop) { -metaStoreClient.dropPartition(config.databaseName, tableName, dropPartition, false); +String partitionClause = HivePartitionUtil.getPartitionClauseForDrop(dropPartition, partitionValueExtractor, config); +metaStoreClient.dropPartition(config.databaseName, tableName, partitionClause, false); LOG.info("Drop partition " + dropPartition + " on " + tableName); } } catch (Exception e) { diff --git 
a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/ddl/JDBCExecutor.java b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/ddl/JDBCExecutor.java index 493d4ee..997d6e0 100644 --- a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/ddl/JDBCExecutor.java +++ b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/ddl/JDBCExecutor.java @@ -18,6 +18,8 @@ package org.apache.hudi.hive.ddl; +import static org.apache.hudi.hive.util.HiveSchemaUtil.HIVE_ESCAPE_CHARACTER; + import org.apache.hudi.hive.HiveSyncConfig; import org.apache.hudi.hive.HoodieHiveSyncException; @@ -31,6 +33,7 @@ import java.sql.DriverManager;
[hudi] branch master updated: [HUDI-3008] Fixing HoodieFileIndex partition column parsing for nested fields
This is an automated email from the ASF dual-hosted git repository. mengtao pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git The following commit(s) were added to refs/heads/master by this push: new 7d046f9 [HUDI-3008] Fixing HoodieFileIndex partition column parsing for nested fields new b5890cd Merge pull request #4308 from harsh1231/HUDI-3008 7d046f9 is described below commit 7d046f914a059b2623d7f2a7627c44b15ccc0ddb Author: harshal patil AuthorDate: Tue Dec 14 17:28:18 2021 +0530 [HUDI-3008] Fixing HoodieFileIndex partition column parsing for nested fields --- .../scala/org/apache/hudi/HoodieFileIndex.scala| 25 .../org/apache/hudi/TestHoodieFileIndex.scala | 27 -- 2 files changed, 46 insertions(+), 6 deletions(-) diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieFileIndex.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieFileIndex.scala index f9b68cb..0ed1b48 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieFileIndex.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieFileIndex.scala @@ -18,7 +18,6 @@ package org.apache.hudi import org.apache.hadoop.fs.{FileStatus, Path} - import org.apache.hudi.DataSourceReadOptions.{QUERY_TYPE, QUERY_TYPE_SNAPSHOT_OPT_VAL} import org.apache.hudi.client.common.HoodieSparkEngineContext import org.apache.hudi.common.config.HoodieMetadataConfig @@ -27,7 +26,6 @@ import org.apache.hudi.common.model.FileSlice import org.apache.hudi.common.model.HoodieTableType.MERGE_ON_READ import org.apache.hudi.common.table.view.{FileSystemViewStorageConfig, HoodieTableFileSystemView} import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver} - import org.apache.spark.api.java.JavaSparkContext import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.expressions.{And, AttributeReference, BoundReference, Expression, InterpretedPredicate} @@ 
-37,7 +35,7 @@ import org.apache.spark.sql.execution.datasources.{FileIndex, FileStatusCache, N import org.apache.spark.sql.hudi.DataSkippingUtils.createColumnStatsIndexFilterExpr import org.apache.spark.sql.hudi.HoodieSqlUtils import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.types.{StructField, StructType} import org.apache.spark.sql.{AnalysisException, Column, SparkSession} import org.apache.spark.unsafe.types.UTF8String @@ -108,7 +106,7 @@ case class HoodieFileIndex( private lazy val _partitionSchemaFromProperties: StructType = { val tableConfig = metaClient.getTableConfig val partitionColumns = tableConfig.getPartitionFields -val nameFieldMap = schema.fields.map(filed => filed.name -> filed).toMap +val nameFieldMap = generateNameFieldMap(Right(schema)) if (partitionColumns.isPresent) { val partitionFields = partitionColumns.get().map(column => @@ -123,6 +121,25 @@ case class HoodieFileIndex( } } + /** + * This method traverses StructType recursively to build map of columnName -> StructField + * Note : If there is nesting of columns like ["a.b.c.d", "a.b.c.e"] -> final map will have keys corresponding + * only to ["a.b.c.d", "a.b.c.e"] and not for subsets like ["a.b.c", "a.b"] + * @param structField + * @return map of ( columns names -> StructField ) + */ + private def generateNameFieldMap(structField: Either[StructField, StructType]) : Map[String, StructField] = { +structField match { + case Right(field) => field.fields.map(f => generateNameFieldMap(Left(f))).flatten.toMap + case Left(field) => field.dataType match { +case struct: StructType => generateNameFieldMap(Right(struct)).map { + case (key: String, sf: StructField) => (field.name + "." 
+ key, sf) +} +case _ => Map(field.name -> field) + } +} + } + private lazy val engineContext = new HoodieSparkEngineContext(new JavaSparkContext(spark.sparkContext)) private lazy val configProperties = { diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieFileIndex.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieFileIndex.scala index 0c3918b..62f98cf 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieFileIndex.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieFileIndex.scala @@ -18,7 +18,6 @@ package org.apache.hudi import java.util.Properties - import org.apache.hudi.DataSourceWriteOptions._ import org.apache.hudi.common.config.HoodieMetadataConfig import org.apache.hudi.common.table.HoodieTableMetaClient @@ -31,6 +30,7 @@ import org