[hudi] branch master updated: [HUDI-5977] Fix Date to String column schema evolution (#8280)

2023-03-28 Thread mengtao
This is an automated email from the ASF dual-hosted git repository.

mengtao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
 new 04ec593413e [HUDI-5977] Fix Date to String column schema evolution (#8280)
04ec593413e is described below

commit 04ec593413e0be986dc021a7385b0d66d5659749
Author: voonhous 
AuthorDate: Tue Mar 28 15:23:37 2023 +0800

[HUDI-5977] Fix Date to String column schema evolution (#8280)
---
 .../org/apache/spark/sql/hudi/TestSpark3DDL.scala  | 48 ++
 .../parquet/Spark24HoodieParquetFileFormat.scala   |  6 ++-
 .../parquet/Spark31HoodieParquetFileFormat.scala   |  6 ++-
 .../Spark32PlusHoodieParquetFileFormat.scala   |  6 ++-
 4 files changed, 63 insertions(+), 3 deletions(-)

diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestSpark3DDL.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestSpark3DDL.scala
index de77b07fd83..7b1cf43b739 100644
--- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestSpark3DDL.scala
+++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestSpark3DDL.scala
@@ -735,4 +735,52 @@ class TestSpark3DDL extends HoodieSparkSqlTestBase {
   }
 }
   }
+
+  test("Test DATE to STRING conversions when vectorized reading is not 
enabled") {
+withTempDir { tmp =>
+  Seq("COPY_ON_WRITE", "MERGE_ON_READ").foreach { tableType =>
+val tableName = generateTableName
+val tablePath = s"${new Path(tmp.getCanonicalPath, tableName).toUri.toString}"
+if (HoodieSparkUtils.gteqSpark3_1) {
+  // adding a struct column to force reads to use non-vectorized readers
+  spark.sql(
+s"""
+   | create table $tableName (
+   |  id int,
+   |  name string,
+   |  price double,
+   |  struct_col struct<f0: int, f1: string>,
+   |  ts long
+   |) using hudi
+   | location '$tablePath'
+   | options (
+   |  type = '$tableType',
+   |  primaryKey = 'id',
+   |  preCombineField = 'ts'
+   | )
+   | partitioned by (ts)
+ """.stripMargin)
+  spark.sql(
+s"""
+   | insert into $tableName
+   | values (1, 'a1', 10, struct(1, 'f_1'), 1000)
+  """.stripMargin)
+  spark.sql(s"select * from $tableName")
+
+  spark.sql("set hoodie.schema.on.read.enable=true")
+  spark.sql(s"alter table $tableName add columns(date_to_string_col 
date)")
+  spark.sql(
+s"""
+   | insert into $tableName
+   | values (2, 'a2', 20, struct(2, 'f_2'), date '2023-03-22', 1001)
+  """.stripMargin)
+  spark.sql(s"alter table $tableName alter column date_to_string_col 
type string")
+
+  // struct and string (converted from date) column must be read to ensure that non-vectorized reader is used
+  // not checking results as we just need to ensure that the table can be read without any errors thrown
+  spark.sql(s"select * from $tableName")
+}
+  }
+}
+  }
 }
diff --git a/hudi-spark-datasource/hudi-spark2/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/Spark24HoodieParquetFileFormat.scala b/hudi-spark-datasource/hudi-spark2/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/Spark24HoodieParquetFileFormat.scala
index 1a8585b38aa..c168911302e 100644
--- a/hudi-spark-datasource/hudi-spark2/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/Spark24HoodieParquetFileFormat.scala
+++ b/hudi-spark-datasource/hudi-spark2/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/Spark24HoodieParquetFileFormat.scala
@@ -107,6 +107,7 @@ class Spark24HoodieParquetFileFormat(private val shouldAppendPartitionValues: Bo
 val pushDownStringStartWith = sqlConf.parquetFilterPushDownStringStartWith
 val pushDownInFilterThreshold = sqlConf.parquetFilterPushDownInFilterThreshold
 val isCaseSensitive = sqlConf.caseSensitiveAnalysis
+val timeZoneId = Option(sqlConf.sessionLocalTimeZone)
 
 (file: PartitionedFile) => {
   assert(!shouldAppendPartitionValues || file.partitionValues.numFields == partitionSchema.size)
@@ -238,7 +239,10 @@ class Spark24HoodieParquetFileFormat(private val shouldAppendPartitionValues: Bo
   }).toAttributes ++ partitionSchema.toAttributes
   val castS
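
The archive truncates the diff at this point. For context, the fix captures the session-local time zone because casting DATE to STRING in Spark is time-zone sensitive; a minimal standalone sketch of the same conversion at the DataFrame level (illustrative only, not Hudi's internal code):

import org.apache.spark.sql.SparkSession

object DateToStringSketch extends App {
  val spark = SparkSession.builder().master("local[1]").appName("date-to-string").getOrCreate()
  // The session time zone governs how the cast renders the value, which is
  // why the fix threads sqlConf.sessionLocalTimeZone into the cast it builds.
  spark.conf.set("spark.sql.session.timeZone", "UTC")
  import spark.implicits._
  val df = Seq(java.sql.Date.valueOf("2023-03-22")).toDF("date_to_string_col")
  df.selectExpr("cast(date_to_string_col as string)").show() // prints 2023-03-22
}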

[hudi] branch master updated: Fix some comments on schema change (#8084)

2023-03-06 Thread mengtao
This is an automated email from the ASF dual-hosted git repository.

mengtao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
 new dcedf139d32 Fix some comments on schema change (#8084)
dcedf139d32 is described below

commit dcedf139d32b681ef196df279f93da5c01fc5d61
Author: mwish <1506118...@qq.com>
AuthorDate: Mon Mar 6 19:29:18 2023 +0800

Fix some comments on schema change (#8084)
---
 .../org/apache/hudi/client/utils/SparkInternalSchemaConverter.java  | 4 ++--
 .../org/apache/hudi/internal/schema/utils/SchemaChangeUtils.java| 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/utils/SparkInternalSchemaConverter.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/utils/SparkInternalSchemaConverter.java
index 198b097134b..2b14bb3a066 100644
--- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/utils/SparkInternalSchemaConverter.java
+++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/utils/SparkInternalSchemaConverter.java
@@ -287,7 +287,7 @@ public class SparkInternalSchemaConverter {
 
   /**
* Convert Int/long type to other Type.
-   * Now only support int/long -> long/float/double/string
+   * Now only support int/long -> long/float/double/string/Decimal
* TODO: support more types
*/
   private static boolean convertIntLongType(WritableColumnVector oldV, WritableColumnVector newV, DataType newType, int len) {
@@ -321,7 +321,7 @@ public class SparkInternalSchemaConverter {
 
   /**
* Convert float type to other Type.
-   * Now only support float -> double/String
+   * Now only support float -> double/String/Decimal
* TODO: support more types
*/
   private static boolean convertFloatType(WritableColumnVector oldV, WritableColumnVector newV, DataType newType, int len) {
diff --git a/hudi-common/src/main/java/org/apache/hudi/internal/schema/utils/SchemaChangeUtils.java b/hudi-common/src/main/java/org/apache/hudi/internal/schema/utils/SchemaChangeUtils.java
index f768830737c..ff2ca89e98e 100644
--- a/hudi-common/src/main/java/org/apache/hudi/internal/schema/utils/SchemaChangeUtils.java
+++ b/hudi-common/src/main/java/org/apache/hudi/internal/schema/utils/SchemaChangeUtils.java
@@ -38,9 +38,9 @@ public class SchemaChangeUtils {
   /**
* Whether to allow the column type to be updated.
* now only support:
-   * int => long/float/double/string
-   * long => float/double/string
-   * float => double/String
+   * int => long/float/double/String/Decimal
+   * long => float/double/String/Decimal
+   * float => double/String/Decimal
* double => String/Decimal
* Decimal => Decimal/String
* String => date/decimal



[hudi] branch master updated: [MINOR] schema evolution relax decimal type conversion conditions (#8063)

2023-02-27 Thread mengtao
This is an automated email from the ASF dual-hosted git repository.

mengtao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
 new 30175e992ef [MINOR] schema evolution  relax decimal type conversion conditions (#8063)
30175e992ef is described below

commit 30175e992ef68592f574fc90ccf415623a29827d
Author: watermelon12138 <49849410+watermelon12...@users.noreply.github.com>
AuthorDate: Tue Feb 28 09:25:17 2023 +0800

[MINOR] schema evolution  relax decimal type conversion conditions (#8063)
---
 .../java/org/apache/hudi/internal/schema/utils/SchemaChangeUtils.java  | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/hudi-common/src/main/java/org/apache/hudi/internal/schema/utils/SchemaChangeUtils.java b/hudi-common/src/main/java/org/apache/hudi/internal/schema/utils/SchemaChangeUtils.java
index 290e3489a3e..f768830737c 100644
--- a/hudi-common/src/main/java/org/apache/hudi/internal/schema/utils/SchemaChangeUtils.java
+++ b/hudi-common/src/main/java/org/apache/hudi/internal/schema/utils/SchemaChangeUtils.java
@@ -77,6 +77,9 @@ public class SchemaChangeUtils {
   if (decimalDsr.isWiderThan(decimalSrc)) {
 return true;
   }
+  if (decimalDsr.precision() >= decimalSrc.precision() && decimalDsr.scale() == decimalSrc.scale()) {
+return true;
+  }
 } else if (dsr.typeId() == Type.TypeID.STRING) {
   return true;
 }
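
In effect, beyond the existing isWiderThan test, a decimal update is now also accepted whenever precision grows or stays equal while scale is unchanged. A worked example of just the added condition (illustrative, not Hudi code):

case class Dec(precision: Int, scale: Int)

// Mirrors the new branch: dst.precision >= src.precision && dst.scale == src.scale
def relaxedDecimalWiden(src: Dec, dst: Dec): Boolean =
  dst.precision >= src.precision && dst.scale == src.scale

assert(relaxedDecimalWiden(Dec(10, 2), Dec(12, 2)))  // decimal(10,2) -> decimal(12,2): allowed
assert(!relaxedDecimalWiden(Dec(10, 2), Dec(12, 3))) // scale change: not covered by this branch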



[hudi] branch master updated (3e12b46d717 -> 31e94abf902)

2023-02-27 Thread mengtao
This is an automated email from the ASF dual-hosted git repository.

mengtao pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


from 3e12b46d717 [MINOR] fix  the compilation failure for master branch. (#8059)
 add 31e94abf902 [HUDI-5835] After performing the update operation, the hoodie table cannot be read normally by spark (#8026)

No new revisions were added by this update.

Summary of changes:
 .../scala/org/apache/hudi/HoodieBaseRelation.scala | 28 +-
 .../org/apache/hudi/HoodieBootstrapRelation.scala  |  9 +++---
 .../apache/hudi/MergeOnReadSnapshotRelation.scala  |  2 +-
 .../apache/spark/sql/hudi/TestUpdateTable.scala| 34 ++
 4 files changed, 55 insertions(+), 18 deletions(-)



[hudi] branch master updated (1377143656a -> 252c4033010)

2023-01-30 Thread mengtao
This is an automated email from the ASF dual-hosted git repository.

mengtao pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


from 1377143656a [HUDI-5487] Reduce duplicate logs in ExternalSpillableMap (#7579)
 add 252c4033010 [MINOR] Standardise schema concepts on Flink Engine (#7761)

No new revisions were added by this update.

Summary of changes:
 .../internal/schema/utils/InternalSchemaUtils.java |  4 +-
 .../hudi/table/format/InternalSchemaManager.java   | 57 +-
 .../apache/hudi/table/format/RecordIterators.java  |  8 +--
 3 files changed, 41 insertions(+), 28 deletions(-)



[hudi] branch master updated: [HUDI-5400] Fix read issues when Hudi-FULL schema evolution is not enabled (#7480)

2022-12-23 Thread mengtao
This is an automated email from the ASF dual-hosted git repository.

mengtao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
 new 64b814ea23 [HUDI-5400] Fix read issues when Hudi-FULL schema evolution is not enabled (#7480)
64b814ea23 is described below

commit 64b814ea237bd1576af3673d04c7bb965218fdef
Author: voonhous 
AuthorDate: Sat Dec 24 15:41:59 2022 +0800

[HUDI-5400] Fix read issues when Hudi-FULL schema evolution is not enabled (#7480)
---
 .../parquet/HoodieParquetFileFormatHelper.scala|  72 ++
 .../hudi/TestAvroSchemaResolutionSupport.scala | 794 +
 ...Spark24HoodieVectorizedParquetRecordReader.java | 185 +
 .../parquet/Spark24HoodieParquetFileFormat.scala   |  62 +-
 .../parquet/Spark31HoodieParquetFileFormat.scala   |  12 +-
 .../Spark32PlusHoodieParquetFileFormat.scala   |  10 +-
 6 files changed, 1116 insertions(+), 19 deletions(-)

diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/HoodieParquetFileFormatHelper.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/HoodieParquetFileFormatHelper.scala
new file mode 100644
index 00..ce1a719cb9
--- /dev/null
+++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/HoodieParquetFileFormatHelper.scala
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources.parquet
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.parquet.hadoop.metadata.FileMetaData
+import org.apache.spark.sql.types.{ArrayType, DataType, MapType, StructField, StructType}
+
+object HoodieParquetFileFormatHelper {
+
+  def buildImplicitSchemaChangeInfo(hadoopConf: Configuration,
+parquetFileMetaData: FileMetaData,
+requiredSchema: StructType): (java.util.Map[Integer, org.apache.hudi.common.util.collection.Pair[DataType, DataType]], StructType) = {
+val implicitTypeChangeInfo: java.util.Map[Integer, org.apache.hudi.common.util.collection.Pair[DataType, DataType]] = new java.util.HashMap()
+val convert = new ParquetToSparkSchemaConverter(hadoopConf)
+val fileStruct = convert.convert(parquetFileMetaData.getSchema)
+val fileStructMap = fileStruct.fields.map(f => (f.name, f.dataType)).toMap
+val sparkRequestStructFields = requiredSchema.map(f => {
+  val requiredType = f.dataType
+  if (fileStructMap.contains(f.name) && !isDataTypeEqual(requiredType, fileStructMap(f.name))) {
+implicitTypeChangeInfo.put(new Integer(requiredSchema.fieldIndex(f.name)), org.apache.hudi.common.util.collection.Pair.of(requiredType, fileStructMap(f.name)))
+StructField(f.name, fileStructMap(f.name), f.nullable)
+  } else {
+f
+  }
+})
+(implicitTypeChangeInfo, StructType(sparkRequestStructFields))
+  }
+
+  def isDataTypeEqual(requiredType: DataType, fileType: DataType): Boolean = (requiredType, fileType) match {
+case (requiredType, fileType) if requiredType == fileType => true
+
+case (ArrayType(rt, _), ArrayType(ft, _)) =>
+  // Do not care about nullability as schema evolution require fields to be nullable
+  isDataTypeEqual(rt, ft)
+
+case (MapType(requiredKey, requiredValue, _), MapType(fileKey, fileValue, _)) =>
+  // Likewise, do not care about nullability as schema evolution require fields to be nullable
+  isDataTypeEqual(requiredKey, fileKey) && isDataTypeEqual(requiredValue, fileValue)
+
+case (StructType(requiredFields), StructType(fileFields)) =>
+  // Find fields that are in requiredFields and fileFields as they might not be the same during add column + change column operations
+  val commonFieldNames = requiredFields.map(_.name) intersect fileFields.map(_.name)
+
+  // Need to match by name instead of StructField as name will stay the same whilst type may chan
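
The listing is cut off above, but the intent of isDataTypeEqual is visible: compare the required (evolved) type against the file type structurally while ignoring nullability, since evolved fields must be nullable. A small usage sketch under that reading (the helper object is the one defined in this file):

import org.apache.spark.sql.types._

val fileType     = StructType(Seq(StructField("f0", IntegerType, nullable = false)))
val requiredType = StructType(Seq(StructField("f0", IntegerType, nullable = true)))

// Plain equality fails because nullability differs...
assert(fileType != requiredType)
// ...whereas the helper's recursive comparison should treat them as equal:
assert(HoodieParquetFileFormatHelper.isDataTypeEqual(requiredType, fileType))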

[hudi] branch master updated (74f8d94b73b -> 06c8fa5a62f)

2022-12-06 Thread mengtao
This is an automated email from the ASF dual-hosted git repository.

mengtao pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


from 74f8d94b73b [HUDI-5331] Add schema settings with stream api (#7384)
 add 06c8fa5a62f [HUDI-5294] Support type change for schema on read + reconcile schema (#7326)

No new revisions were added by this update.

Summary of changes:
 .../apache/hudi/client/BaseHoodieWriteClient.java  |   2 +-
 .../table/action/commit/HoodieMergeHelper.java |  14 ++-
 .../hudi/common/util/InternalSchemaCache.java  |  26 -
 .../schema/utils/AvroSchemaEvolutionUtils.java |  33 ---
 .../org/apache/hudi/HoodieSparkSqlWriter.scala |  35 +--
 .../org/apache/spark/sql/hudi/TestSpark3DDL.scala  | 106 -
 6 files changed, 186 insertions(+), 30 deletions(-)



[hudi] branch master updated: [HUDI-5244] Fix bugs in schema evolution client with lost operation field and not found schema (#7248)

2022-11-20 Thread mengtao
This is an automated email from the ASF dual-hosted git repository.

mengtao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
 new 4c218231bd [HUDI-5244] Fix bugs in schema evolution client with lost operation field and not found schema (#7248)
4c218231bd is described below

commit 4c218231bdb2a24310145b721cf87ba7b0f1534a
Author: Alexander Trushev 
AuthorDate: Mon Nov 21 10:54:01 2022 +0700

[HUDI-5244] Fix bugs in schema evolution client with lost operation field and not found schema (#7248)

* [HUDI-5244] Fix bugs in schema evolution client with lost operation field and not found schema
---
 .../apache/hudi/client/BaseHoodieWriteClient.java  | 21 +++--
 .../action/commit/TestSchemaEvolutionClient.java   | 96 ++
 .../java/org/apache/hudi/avro/HoodieAvroUtils.java |  4 +
 3 files changed, 114 insertions(+), 7 deletions(-)

diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseHoodieWriteClient.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseHoodieWriteClient.java
index 4a3f6bd311..133dfce9e9 100644
--- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseHoodieWriteClient.java
+++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseHoodieWriteClient.java
@@ -280,7 +280,7 @@ public abstract class BaseHoodieWriteClient
   private Pair<InternalSchema, HoodieTableMetaClient> getInternalSchemaAndMetaClient() {
 HoodieTableMetaClient metaClient = createMetaClient(true);
 TableSchemaResolver schemaUtil = new TableSchemaResolver(metaClient);
-Option<InternalSchema> internalSchemaOption = schemaUtil.getTableInternalSchemaFromCommitMetadata();
-if (!internalSchemaOption.isPresent()) {
-  throw new HoodieException(String.format("cannot find schema for current table: %s", config.getBasePath()));
-}
-return Pair.of(internalSchemaOption.get(), metaClient);
+return Pair.of(getInternalSchema(schemaUtil), metaClient);
   }
 
   private void commitTableChange(InternalSchema newSchema, HoodieTableMetaClient metaClient) {
 TableSchemaResolver schemaUtil = new TableSchemaResolver(metaClient);
-String historySchemaStr = schemaUtil.getTableHistorySchemaStrFromCommitMetadata().orElse("");
+String historySchemaStr = schemaUtil.getTableHistorySchemaStrFromCommitMetadata().orElseGet(
+() -> SerDeHelper.inheritSchemas(getInternalSchema(schemaUtil), ""));
 Schema schema = AvroInternalSchemaConverter.convert(newSchema, config.getTableName());
 String commitActionType = CommitUtils.getCommitActionType(WriteOperationType.ALTER_SCHEMA, metaClient.getTableType());
 String instantTime = HoodieActiveTimeline.createNewInstantTime();
@@ -1793,4 +1790,14 @@ public abstract class BaseHoodieWriteClient {
+  private InternalSchema getInternalSchema(TableSchemaResolver schemaUtil) {
+    return schemaUtil.getTableInternalSchemaFromCommitMetadata().orElseGet(() -> {
+  try {
+return AvroInternalSchemaConverter.convert(schemaUtil.getTableAvroSchema());
+  } catch (Exception e) {
+throw new HoodieException(String.format("cannot find schema for current table: %s", config.getBasePath()));
+  }
+});
+  }
 }
diff --git a/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/table/action/commit/TestSchemaEvolutionClient.java b/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/table/action/commit/TestSchemaEvolutionClient.java
new file mode 100644
index 00..bda4a3267d
--- /dev/null
+++ b/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/table/action/commit/TestSchemaEvolutionClient.java
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.table.action.commit;
+
+import org.apache.hudi.client.HoodieJavaWriteClient;
+import org.apache.hudi.common.engine.EngineType;
+import org.apache.hudi.common.model.HoodieAvroRecord;
+import org.apache.hudi.common.model.HoodieKey;
+import org.apache.hudi.common.table.TableSchemaResolver;
+import org.apache.hudi.common.testutils.RawTripTestPayload;
+import org.apache.hudi.config.HoodieWriteConfig;
+import org.apache.hudi.internal.schema.Types;
+import org.apache.hudi.testutils.Hoodie

[hudi] branch master updated: [HUDI-5237] Support for HoodieUnMergedLogRecordScanner with InternalSchema (#7237)

2022-11-20 Thread mengtao
This is an automated email from the ASF dual-hosted git repository.

mengtao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
 new 21bcbfd792 [HUDI-5237] Support for HoodieUnMergedLogRecordScanner with InternalSchema (#7237)
21bcbfd792 is described below

commit 21bcbfd7923c65acd7e458d6ca2a05f1ff9df109
Author: Alexander Trushev 
AuthorDate: Mon Nov 21 10:26:16 2022 +0700

[HUDI-5237] Support for HoodieUnMergedLogRecordScanner with InternalSchema (#7237)

* [HUDI-5237] Support for HoodieUnMergedLogRecordScanner with InternalSchema
---
 .../common/table/log/AbstractHoodieLogRecordReader.java   |  2 ++
 .../common/table/log/HoodieMergedLogRecordScanner.java|  3 ++-
 .../common/table/log/HoodieUnMergedLogRecordScanner.java  | 15 ---
 .../metadata/HoodieMetadataMergedLogRecordReader.java |  5 +
 4 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordReader.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordReader.java
index 88da6aa1f0..0c8d8b3f6c 100644
--- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordReader.java
+++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordReader.java
@@ -817,6 +817,8 @@ public abstract class AbstractHoodieLogRecordReader {
 
 public abstract Builder withReaderSchema(Schema schema);
 
+public abstract Builder withInternalSchema(InternalSchema internalSchema);
+
 public abstract Builder withLatestInstantTime(String latestInstantTime);
 
 public abstract Builder withReadBlocksLazily(boolean readBlocksLazily);
diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieMergedLogRecordScanner.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieMergedLogRecordScanner.java
index e846637493..708015b1c1 100644
--- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieMergedLogRecordScanner.java
+++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieMergedLogRecordScanner.java
@@ -308,8 +308,9 @@ public class HoodieMergedLogRecordScanner extends AbstractHoodieLogRecordReader
   return this;
 }
 
+@Override
 public Builder withInternalSchema(InternalSchema internalSchema) {
-  this.internalSchema = internalSchema == null ? InternalSchema.getEmptyInternalSchema() : internalSchema;
+  this.internalSchema = internalSchema;
   return this;
 }
 
diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieUnMergedLogRecordScanner.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieUnMergedLogRecordScanner.java
index 7ddb9f1236..b0d127c562 100644
--- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieUnMergedLogRecordScanner.java
+++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieUnMergedLogRecordScanner.java
@@ -40,9 +40,10 @@ public class HoodieUnMergedLogRecordScanner extends AbstractHoodieLogRecordReader
 
   private HoodieUnMergedLogRecordScanner(FileSystem fs, String basePath, List<String> logFilePaths, Schema readerSchema,
  String latestInstantTime, boolean readBlocksLazily, boolean reverseReader, int bufferSize,
- LogRecordScannerCallback callback, Option<InstantRange> instantRange, boolean useScanV2) {
+ LogRecordScannerCallback callback, Option<InstantRange> instantRange, InternalSchema internalSchema,
+ boolean useScanV2) {
 super(fs, basePath, logFilePaths, readerSchema, latestInstantTime, readBlocksLazily, reverseReader, bufferSize, instantRange,
-false, true, Option.empty(), InternalSchema.getEmptyInternalSchema(), useScanV2);
+false, true, Option.empty(), internalSchema, useScanV2);
 this.callback = callback;
   }
 
@@ -81,6 +82,7 @@ public class HoodieUnMergedLogRecordScanner extends AbstractHoodieLogRecordReader
 private String basePath;
 private List<String> logFilePaths;
 private Schema readerSchema;
+private InternalSchema internalSchema;
 private String latestInstantTime;
 private boolean readBlocksLazily;
 private boolean reverseReader;
@@ -112,6 +114,12 @@ public class HoodieUnMergedLogRecordScanner extends AbstractHoodieLogRecordReader
   return this;
 }
 
+@Override
+public Builder withInternalSchema(InternalSchema internalSchema) {
+  this.internalSchema = internalSchema;
+  return this;
+}
+
 public Builder withLatestInstantTime(String latestInstantTime) {
   this.latestInstantTime = latestInstantTime;
   return this;
@@ -151,7 +159,8 @@ public class HoodieUnMergedLogRecordScanner extends
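
The hunk is truncated by the archive. For orientation, a hedged sketch of how a caller would use the new hook (withInternalSchema comes from this diff; newBuilder and the other setters are assumed from the builder shown above and may differ in detail):

// All values (fs, basePath, logFiles, schema, internalSchema, instant) are
// placeholders the caller must supply; this is not verbatim Hudi usage.
val scanner = HoodieUnMergedLogRecordScanner.newBuilder()
  .withFileSystem(fs)
  .withBasePath(basePath)
  .withLogFilePaths(logFiles)
  .withReaderSchema(schema)
  .withInternalSchema(internalSchema) // new: schema-evolution-aware scanning
  .withLatestInstantTime(instant)
  .withLogRecordScannerCallback(record => process(record))
  .build()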

[hudi] branch master updated (c5590cdd0f -> e4e28836c2)

2022-11-17 Thread mengtao
This is an automated email from the ASF dual-hosted git repository.

mengtao pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


from c5590cdd0f [HUDI-4812] Lazy fetching partition path & file slice for HoodieFileIndex (#6680)
 add e4e28836c2 [HUDI-5233] Fix bug when InternalSchemaUtils.collectTypeChangedCols returns all columns (#7228)

No new revisions were added by this update.

Summary of changes:
 .../java/org/apache/hudi/internal/schema/Type.java | 22 ++
 .../schema/utils/TestInternalSchemaUtils.java  | 14 ++
 2 files changed, 36 insertions(+)



[hudi] branch master updated: [HUDI-5178] Add Call show_table_properties for spark sql (#7161)

2022-11-08 Thread mengtao
This is an automated email from the ASF dual-hosted git repository.

mengtao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
 new 1d1181a441 [HUDI-5178] Add Call show_table_properties for spark sql (#7161)
1d1181a441 is described below

commit 1d1181a4410154ff0615f374cfee97630b425e88
Author: ForwardXu 
AuthorDate: Wed Nov 9 10:41:03 2022 +0800

[HUDI-5178] Add Call show_table_properties for spark sql (#7161)
---
 .../hudi/command/procedures/BaseProcedure.scala|  4 +-
 .../hudi/command/procedures/HoodieProcedures.scala |  1 +
 .../procedures/ShowTablePropertiesProcedure.scala  | 71 ++
 .../TestShowTablePropertiesProcedure.scala | 45 ++
 4 files changed, 119 insertions(+), 2 deletions(-)

diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/BaseProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/BaseProcedure.scala
index d0404664f4..67930cb3ed 100644
--- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/BaseProcedure.scala
+++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/BaseProcedure.scala
@@ -22,7 +22,7 @@ import org.apache.hudi.client.SparkRDDWriteClient
 import org.apache.hudi.client.common.HoodieSparkEngineContext
 import org.apache.hudi.common.model.HoodieRecordPayload
 import org.apache.hudi.config.{HoodieIndexConfig, HoodieWriteConfig}
-import org.apache.hudi.exception.HoodieClusteringException
+import org.apache.hudi.exception.HoodieException
 import org.apache.hudi.index.HoodieIndex.IndexType
 import org.apache.spark.api.java.JavaSparkContext
 import org.apache.spark.sql.SparkSession
@@ -111,7 +111,7 @@ abstract class BaseProcedure extends Procedure {
   t => HoodieCLIUtils.getHoodieCatalogTable(sparkSession, 
t.asInstanceOf[String]).tableLocation)
   .getOrElse(
 tablePath.map(p => p.asInstanceOf[String]).getOrElse(
-  throw new HoodieClusteringException("Table name or table path must 
be given one"))
+  throw new HoodieException("Table name or table path must be given 
one"))
   )
   }
 
diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/HoodieProcedures.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/HoodieProcedures.scala
index f14d96139b..3b38925455 100644
--- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/HoodieProcedures.scala
+++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/HoodieProcedures.scala
@@ -82,6 +82,7 @@ object HoodieProcedures {
   ,(HiveSyncProcedure.NAME, HiveSyncProcedure.builder)
   ,(CopyToTempView.NAME, CopyToTempView.builder)
   ,(ShowCommitExtraMetadataProcedure.NAME, ShowCommitExtraMetadataProcedure.builder)
+  ,(ShowTablePropertiesProcedure.NAME, ShowTablePropertiesProcedure.builder)
 )
   }
 }
diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowTablePropertiesProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowTablePropertiesProcedure.scala
new file mode 100644
index 00..d75df07fc9
--- /dev/null
+++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowTablePropertiesProcedure.scala
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.hudi.command.procedures
+
+import org.apache.hudi.HoodieCLIUtils
+import org.apache.hudi.common.table.HoodieTableMetaClient
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType}
+
+import java.util
+import java.util.function.Supplier
+import scala.collection.JavaConversions._
+
+class ShowTablePropertiesProcedure(
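
The class body is truncated by the archive. For orientation, procedures like this are invoked from Spark SQL; a hedged usage sketch (the procedure name is from this commit; the table name is illustrative):

spark.sql("call show_table_properties(table => 'hudi_table')").show(false)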

[hudi] branch master updated: [HUDI-5083]Fixed a bug when schema evolution (#7045)

2022-10-28 Thread mengtao
This is an automated email from the ASF dual-hosted git repository.

mengtao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
 new b4563ddca5 [HUDI-5083]Fixed a bug when schema evolution (#7045)
b4563ddca5 is described below

commit b4563ddca566358be0897523decd29fe84ab332a
Author: 申胜利 <48829688+shenshen...@users.noreply.github.com>
AuthorDate: Sat Oct 29 09:24:16 2022 +0800

[HUDI-5083]Fixed a bug when schema evolution (#7045)
---
 .../java/org/apache/hudi/avro/HoodieAvroUtils.java |  4 +--
 .../org/apache/spark/sql/hudi/TestSpark3DDL.scala  | 38 ++
 2 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroUtils.java b/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroUtils.java
index a352e86b96..5288f7fa0c 100644
--- a/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroUtils.java
+++ b/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroUtils.java
@@ -767,14 +767,14 @@ public class HoodieAvroUtils {
   Schema.Field field = fields.get(i);
   String fieldName = field.name();
   fieldNames.push(fieldName);
-  if (oldSchema.getField(field.name()) != null) {
+  if (oldSchema.getField(field.name()) != null && !renameCols.containsKey(field.name())) {
 Schema.Field oldField = oldSchema.getField(field.name());
 newRecord.put(i, rewriteRecordWithNewSchema(indexedRecord.get(oldField.pos()), oldField.schema(), fields.get(i).schema(), renameCols, fieldNames));
   } else {
 String fieldFullName = createFullName(fieldNames);
 String fieldNameFromOldSchema = renameCols.getOrDefault(fieldFullName, "");
 // deal with rename
-if (oldSchema.getField(field.name()) == null && oldSchema.getField(fieldNameFromOldSchema) != null) {
+if (oldSchema.getField(fieldNameFromOldSchema) != null) {
   // find rename
   Schema.Field oldField = oldSchema.getField(fieldNameFromOldSchema);
   newRecord.put(i, rewriteRecordWithNewSchema(indexedRecord.get(oldField.pos()), oldField.schema(), fields.get(i).schema(), renameCols, fieldNames));
diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestSpark3DDL.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestSpark3DDL.scala
index 65357b903b..9d955cb831 100644
--- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestSpark3DDL.scala
+++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestSpark3DDL.scala
@@ -385,6 +385,44 @@ class TestSpark3DDL extends HoodieSparkSqlTestBase {
 }
   }
 
+  test("Test Alter Table multiple times") {
+withTempDir { tmp =>
+  Seq("cow", "mor").foreach { tableType =>
+val tableName = generateTableName
+val tablePath = s"${new Path(tmp.getCanonicalPath, tableName).toUri.toString}"
+if (HoodieSparkUtils.gteqSpark3_1) {
+  spark.sql("set hoodie.schema.on.read.enable=true")
+  spark.sql(
+s"""
+   |create table $tableName (
+   |  id int,
+   |  col1 string,
+   |  col2 string,
+   |  ts long
+   |) using hudi
+   | location '$tablePath'
+   | options (
+   |  type = '$tableType',
+   |  primaryKey = 'id',
+   |  preCombineField = 'ts'
+   | )
+ """.stripMargin)
+  spark.sql(s"show create table ${tableName}").show(false)
+  spark.sql(s"insert into ${tableName} values (1, 'aaa', 'bbb', 1000)")
+
+  // Rename to a previously existing column name + insert
+  spark.sql(s"alter table ${tableName} drop column col1")
+  spark.sql(s"alter table ${tableName} rename column col2 to col1")
+
+  spark.sql(s"insert into ${tableName} values (2, 'aaa', 1000)")
+  checkAnswer(spark.sql(s"select col1 from ${tableName} order by id").collect())(
+Seq("bbb"), Seq("aaa")
+  )
+}
+  }
+}
+  }
+
   test("Test Alter Table complex") {
 withTempDir { tmp =>
   Seq("cow", "mor").foreach { tableType =>



[hudi] branch master updated (c6e5fd0271 -> 48d23ff233)

2022-10-28 Thread mengtao
This is an automated email from the ASF dual-hosted git repository.

mengtao pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


from c6e5fd0271 [HUDI-3287] Remove hudi-spark dependencies from hudi-kafka-connect-bundle (#6079)
 add 48d23ff233 [HUDI-5000] Support schema evolution for Hive/presto (#6989)

No new revisions were added by this update.

Summary of changes:
 .../hudi/hadoop/HoodieParquetInputFormat.java  |   3 +
 .../apache/hudi/hadoop/SchemaEvolutionContext.java | 353 +
 .../realtime/AbstractRealtimeRecordReader.java |  29 +-
 .../realtime/RealtimeCompactedRecordReader.java|   4 +-
 .../functional/TestHiveTableSchemaEvolution.java   | 155 +
 packaging/hudi-hadoop-mr-bundle/pom.xml|   1 +
 packaging/hudi-presto-bundle/pom.xml   |   1 +
 7 files changed, 539 insertions(+), 7 deletions(-)
 create mode 100644 hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/SchemaEvolutionContext.java
 create mode 100644 hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestHiveTableSchemaEvolution.java



[hudi] branch master updated: [HUDI-5085]When a flink job has multiple sink tables, the index loading status is abnormal (#7051)

2022-10-26 Thread mengtao
This is an automated email from the ASF dual-hosted git repository.

mengtao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
 new 22ccdc1540 [HUDI-5085]When a flink job has multiple sink tables, the index loading status is abnormal (#7051)
22ccdc1540 is described below

commit 22ccdc154008e38b74a4361c2a5aedfaef56a87e
Author: YangXiao <919869...@qq.com>
AuthorDate: Wed Oct 26 16:44:26 2022 +0800

[HUDI-5085]When a flink job has multiple sink tables, the index loading status is abnormal (#7051)
---
 .../src/main/java/org/apache/hudi/sink/bootstrap/BootstrapOperator.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bootstrap/BootstrapOperator.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bootstrap/BootstrapOperator.java
index 10d46abc94..09250e3132 100644
--- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bootstrap/BootstrapOperator.java
+++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bootstrap/BootstrapOperator.java
@@ -160,7 +160,7 @@ public class BootstrapOperator<I, O extends HoodieRecord<?>>
 int readyTaskNum = 1;
 while (taskNum != readyTaskNum) {
   try {
-readyTaskNum = aggregateManager.updateGlobalAggregate(BootstrapAggFunction.NAME, taskID, new BootstrapAggFunction());
+readyTaskNum = aggregateManager.updateGlobalAggregate(BootstrapAggFunction.NAME + conf.getString(FlinkOptions.TABLE_NAME), taskID, new BootstrapAggFunction());
 LOG.info("Waiting for other bootstrap tasks to complete, taskId = {}.", taskID);
 
 TimeUnit.SECONDS.sleep(5);



[hudi] branch master updated (488f58d770 -> 22d6019559)

2022-09-15 Thread mengtao
This is an automated email from the ASF dual-hosted git repository.

mengtao pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


from 488f58d770 [HUDI-4785] Fix partition discovery in bootstrap operation (#6673)
 add 22d6019559 [HUDI-4706] Fix InternalSchemaChangeApplier#applyAddChange error to add nest type (#6486)

No new revisions were added by this update.

Summary of changes:
 .../schema/action/InternalSchemaChangeApplier.java |  3 +-
 .../internal/schema/action/TableChangesHelper.java |  5 ++
 .../internal/schema/action/TestTableChanges.java   | 86 ++
 3 files changed, 93 insertions(+), 1 deletion(-)



[hudi] branch master updated (b228a2788a -> b6b515917c)

2022-08-28 Thread mengtao
This is an automated email from the ASF dual-hosted git repository.

mengtao pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


from b228a2788a [HUDI-4399][RFC-57] Protobuf support in DeltaStreamer (#6111)
 add b6b515917c [HUDI-4703] use the historical schema to response time travel query (#6499)

No new revisions were added by this update.

Summary of changes:
 .../hudi/common/table/TableSchemaResolver.java | 29 +++
 .../scala/org/apache/hudi/HoodieBaseRelation.scala | 11 ++-
 .../hudi/functional/TestTimeTravelQuery.scala  | 96 +++---
 .../spark/sql/hudi/TestTimeTravelTable.scala   | 59 +
 4 files changed, 184 insertions(+), 11 deletions(-)
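
For orientation, the change makes a time travel read resolve the schema that was valid at the queried instant instead of the latest commit schema. A hedged read-side example ("as.of.instant" is Hudi's documented time-travel option; path and timestamp are illustrative):

val df = spark.read.format("hudi")
  .option("as.of.instant", "20220828000000000") // read the table as of this instant
  .load("/tmp/hudi_table")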



[hudi] branch master updated (4bda6afe0b -> ded197800a)

2022-07-17 Thread mengtao
This is an automated email from the ASF dual-hosted git repository.

mengtao pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


from 4bda6afe0b [HUDI-4249] Fixing in-memory `HoodieData` implementation to operate lazily  (#5855)
 add ded197800a [HUDI-4170] Make user can use hoodie.datasource.read.paths to read necessary files (#5722)

No new revisions were added by this update.

Summary of changes:
 .../main/scala/org/apache/hudi/DefaultSource.scala | 16 +++-
 .../scala/org/apache/hudi/HoodieBaseRelation.scala | 44 +--
 .../scala/org/apache/hudi/HoodieFileIndex.scala| 17 ++--
 .../apache/hudi/MergeOnReadSnapshotRelation.scala  | 20 +
 .../apache/hudi/functional/TestCOWDataSource.scala | 35 -
 .../apache/hudi/functional/TestMORDataSource.scala | 90 ++
 6 files changed, 186 insertions(+), 36 deletions(-)
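
For orientation, the option named in the commit title restricts a read to an explicit list of files or paths. A hedged usage sketch (paths are illustrative):

val df = spark.read.format("hudi")
  .option("hoodie.datasource.read.paths", "/tmp/hudi_table/part=1,/tmp/hudi_table/part=2")
  .load()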



[hudi] branch master updated (fc8d96246a -> b686c07407)

2022-07-08 Thread mengtao
This is an automated email from the ASF dual-hosted git repository.

mengtao pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


from fc8d96246a [HUDI-4335] Bug fixes in AWSGlueCatalogSyncClient post schema evolution. (#5995)
 add b686c07407 [HUDI-4276] Reconcile schema-inject null values for missing fields and add new fields (#6017)

No new revisions were added by this update.

Summary of changes:
 .../apache/hudi/client/BaseHoodieWriteClient.java  | 17 +++--
 .../table/action/commit/HoodieMergeHelper.java |  2 +-
 .../scala/org/apache/hudi/HoodieSparkUtils.scala   |  9 +--
 .../java/org/apache/hudi/avro/HoodieAvroUtils.java | 42 ++--
 .../hudi/common/config/HoodieCommonConfig.java |  7 ++
 .../table/log/AbstractHoodieLogRecordReader.java   |  4 +-
 .../schema/action/InternalSchemaMerger.java| 10 ++-
 .../schema/utils/AvroSchemaEvolutionUtils.java | 74 ++--
 .../internal/schema/utils/InternalSchemaUtils.java |  7 +-
 .../org/apache/hudi/avro/TestHoodieAvroUtils.java  | 30 +
 .../schema/utils/TestAvroSchemaEvolutionUtils.java | 78 ++
 .../scala/org/apache/hudi/DataSourceOptions.scala  |  7 +-
 .../org/apache/hudi/HoodieSparkSqlWriter.scala | 24 +--
 .../org/apache/hudi/TestHoodieSparkUtils.scala |  4 +-
 .../org/apache/spark/sql/hudi/TestSpark3DDL.scala  | 68 ++-
 15 files changed, 273 insertions(+), 110 deletions(-)
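
For orientation, the reconcile behavior this commit extends is enabled by a write-side option. A hedged write example (the option key is Hudi's documented "hoodie.datasource.write.reconcile.schema"; everything else is illustrative):

df.write.format("hudi")
  .option("hoodie.datasource.write.reconcile.schema", "true") // null-fill missing fields, add new ones
  .option("hoodie.table.name", "hudi_table")
  .mode("append")
  .save("/tmp/hudi_table")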



[hudi] branch master updated (3670e82af5 -> b18c32379f)

2022-07-05 Thread mengtao
This is an automated email from the ASF dual-hosted git repository.

mengtao pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


from 3670e82af5 [HUDI-4356] Fix the error when sync hive in CTAS (#6029)
 add b18c32379f [HUDI-4219] Merge Into when update expression "col=s.col+2" on precombine cause exception (#5828)

No new revisions were added by this update.

Summary of changes:
 .../hudi/command/MergeIntoHoodieTableCommand.scala |  40 -
 .../apache/spark/sql/hudi/TestMergeIntoTable.scala | 181 +
 2 files changed, 215 insertions(+), 6 deletions(-)



[hudi] branch master updated (e8fbd4daf4 -> 360df576a9)

2022-06-24 Thread mengtao
This is an automated email from the ASF dual-hosted git repository.

mengtao pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


from e8fbd4daf4 [TEST][DO_NOT_MERGE]fix random failed for ci (#5948)
 add 360df576a9 Revert "[TEST][DO_NOT_MERGE]fix random failed for ci (#5948)" (#5971)

No new revisions were added by this update.

Summary of changes:
 .../src/main/scala/org/apache/hudi/BaseFileOnlyRelation.scala  | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)



[hudi] branch revert-5948-testflaky created (now 5cdf95a030)

2022-06-24 Thread mengtao
This is an automated email from the ASF dual-hosted git repository.

mengtao pushed a change to branch revert-5948-testflaky
in repository https://gitbox.apache.org/repos/asf/hudi.git


  at 5cdf95a030 Revert "[TEST][DO_NOT_MERGE]fix random failed for ci (#5948)"

This branch includes the following new commits:

 new 5cdf95a030 Revert "[TEST][DO_NOT_MERGE]fix random failed for ci (#5948)"

The 1 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.




[hudi] 01/01: Revert "[TEST][DO_NOT_MERGE]fix random failed for ci (#5948)"

2022-06-24 Thread mengtao
This is an automated email from the ASF dual-hosted git repository.

mengtao pushed a commit to branch revert-5948-testflaky
in repository https://gitbox.apache.org/repos/asf/hudi.git

commit 5cdf95a03006a0e7c9f923fa468774341e41c8e3
Author: xiarixiaoyao 
AuthorDate: Sat Jun 25 10:45:00 2022 +0800

Revert "[TEST][DO_NOT_MERGE]fix random failed for ci (#5948)"

This reverts commit e8fbd4daf49802f60f800ccc92e66369d44f07f6.
---
 .../src/main/scala/org/apache/hudi/BaseFileOnlyRelation.scala  | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/BaseFileOnlyRelation.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/BaseFileOnlyRelation.scala
index d6ec645920..4160c34b0c 100644
--- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/BaseFileOnlyRelation.scala
+++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/BaseFileOnlyRelation.scala
@@ -166,9 +166,7 @@ class BaseFileOnlyRelation(sqlContext: SQLContext,
   DataSource.apply(
 sparkSession = sparkSession,
 paths = extraReadPaths,
-// Here we should specify the schema to the latest commit schema since
-// the table schema evolution.
-userSpecifiedSchema = userSchema.orElse(Some(tableStructSchema)),
+userSpecifiedSchema = userSchema,
 className = formatClassName,
 // Since we're reading the table as just collection of files we have to make sure
 // we only read the latest version of every Hudi's file-group, which might be compacted, clustered, etc.
@@ -177,7 +175,8 @@ class BaseFileOnlyRelation(sqlContext: SQLContext,
 // We rely on [[HoodieROTablePathFilter]], to do proper filtering to assure that
 options = optParams ++ Map(
   "mapreduce.input.pathFilter.class" -> 
classOf[HoodieROTablePathFilter].getName
-)
+),
+partitionColumns = partitionColumns
   )
 .resolveRelation()
 .asInstanceOf[HadoopFsRelation]



[hudi] branch master updated (eeafaeacd2 -> e8fbd4daf4)

2022-06-24 Thread mengtao
This is an automated email from the ASF dual-hosted git repository.

mengtao pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


from eeafaeacd2 [HUDI-3512] Add call procedure for StatsCommand (#5955)
 add e8fbd4daf4 [TEST][DO_NOT_MERGE]fix random failed for ci (#5948)

No new revisions were added by this update.

Summary of changes:
 .../src/main/scala/org/apache/hudi/BaseFileOnlyRelation.scala  | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)



[hudi] branch master updated: [HUDI-3508] Add call procedure for FileSystemViewCommand (#5929)

2022-06-22 Thread mengtao
This is an automated email from the ASF dual-hosted git repository.

mengtao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
 new 1bb017d396 [HUDI-3508] Add call procedure for FileSystemViewCommand (#5929)
1bb017d396 is described below

commit 1bb017d396562b891065187bac417c52edea24da
Author: jiz <31836510+microbe...@users.noreply.github.com>
AuthorDate: Wed Jun 22 17:50:20 2022 +0800

[HUDI-3508] Add call procedure for FileSystemViewCommand (#5929)

* [HUDI-3508] Add call procedure for FileSystemView

* minor

Co-authored-by: jiimmyzhan 
---
 .../hudi/command/procedures/HoodieProcedures.scala |   2 +
 .../procedures/ShowFileSystemViewProcedure.scala   | 258 +
 .../sql/hudi/procedure/TestFsViewProcedure.scala   |  95 
 3 files changed, 355 insertions(+)

diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/HoodieProcedures.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/HoodieProcedures.scala
index ff129964fa..d974e216d5 100644
--- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/HoodieProcedures.scala
+++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/HoodieProcedures.scala
@@ -48,6 +48,8 @@ object HoodieProcedures {
 mapBuilder.put(ShowRollbacksProcedure.NAME, ShowRollbacksProcedure.builder)
 mapBuilder.put(ShowRollbackDetailProcedure.NAME, ShowRollbackDetailProcedure.builder)
 mapBuilder.put(ExportInstantsProcedure.NAME, ExportInstantsProcedure.builder)
+mapBuilder.put(ShowAllFileSystemViewProcedure.NAME, ShowAllFileSystemViewProcedure.builder)
+mapBuilder.put(ShowLatestFileSystemViewProcedure.NAME, ShowLatestFileSystemViewProcedure.builder)
 mapBuilder.build
   }
 }
diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowFileSystemViewProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowFileSystemViewProcedure.scala
new file mode 100644
index 00..8c861cf0f6
--- /dev/null
+++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowFileSystemViewProcedure.scala
@@ -0,0 +1,258 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.hudi.command.procedures
+
+import com.google.common.collect.Lists
+import org.apache.hadoop.fs.{FileStatus, Path}
+import org.apache.hudi.common.fs.FSUtils
+import org.apache.hudi.common.model.{FileSlice, HoodieLogFile}
+import org.apache.hudi.common.table.HoodieTableMetaClient
+import org.apache.hudi.common.table.timeline.{HoodieDefaultTimeline, HoodieInstant, HoodieTimeline}
+import org.apache.hudi.common.table.view.HoodieTableFileSystemView
+import org.apache.hudi.common.util
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType}
+
+import java.util.function.{Function, Supplier}
+import java.util.stream.Collectors
+import scala.collection.JavaConverters.{asJavaIteratorConverter, asScalaIteratorConverter}
+
+class ShowFileSystemViewProcedure(showLatest: Boolean) extends BaseProcedure with ProcedureBuilder {
+  private val PARAMETERS_ALL: Array[ProcedureParameter] = Array[ProcedureParameter](
+ProcedureParameter.required(0, "table", DataTypes.StringType, None),
+ProcedureParameter.optional(1, "max_instant", DataTypes.StringType, ""),
+ProcedureParameter.optional(2, "include_max", DataTypes.BooleanType, false),
+ProcedureParameter.optional(3, "include_inflight", DataTypes.BooleanType, false),
+ProcedureParameter.optional(4, "exclude_compaction", DataTypes.BooleanType, false),
+ProcedureParameter.optional(5, "limit", DataTypes.IntegerType, 10),
+ProcedureParameter.optional(6, "path_regex", DataTypes.StringType, "*/*/*"
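
The listing is cut off by the archive. For orientation, a hedged invocation of the two variants this commit registers (the SQL-level names are assumed from Hudi's procedure naming; arguments are illustrative and possibly incomplete):

spark.sql("call show_fsview_all(table => 'hudi_table', limit => 10)").show(false)
spark.sql("call show_fsview_latest(table => 'hudi_table', partition_path => 'ts=1000')").show(false)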

[hudi] branch master updated: [HUDI-3507] Support export command based on Call Produce Command (#5901)

2022-06-19 Thread mengtao
This is an automated email from the ASF dual-hosted git repository.

mengtao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
 new c5c4cfec91 [HUDI-3507] Support export command based on Call Produce Command (#5901)
c5c4cfec91 is described below

commit c5c4cfec915fad198ccb57d00bc8d875a78e794b
Author: ForwardXu 
AuthorDate: Sun Jun 19 18:48:22 2022 +0800

[HUDI-3507] Support export command based on Call Produce Command (#5901)
---
 .../apache/hudi/cli/commands/ExportCommand.java|  36 ++--
 .../procedures/ExportInstantsProcedure.scala   | 239 +
 .../hudi/command/procedures/HoodieProcedures.scala |   1 +
 .../procedure/TestExportInstantsProcedure.scala|  52 +
 4 files changed, 309 insertions(+), 19 deletions(-)

diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/ExportCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/ExportCommand.java
index fa6e15b7af..95e7caa8a9 100644
--- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/ExportCommand.java
+++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/ExportCommand.java
@@ -18,6 +18,12 @@
 
 package org.apache.hudi.cli.commands;
 
+import org.apache.avro.generic.GenericRecord;
+import org.apache.avro.generic.IndexedRecord;
+import org.apache.avro.specific.SpecificData;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
 import org.apache.hudi.avro.HoodieAvroUtils;
 import org.apache.hudi.avro.model.HoodieArchivedMetaEntry;
 import org.apache.hudi.avro.model.HoodieCleanMetadata;
@@ -36,14 +42,6 @@ import org.apache.hudi.common.table.timeline.HoodieTimeline;
 import org.apache.hudi.common.table.timeline.TimelineMetadataUtils;
 import org.apache.hudi.common.util.ClosableIterator;
 import org.apache.hudi.exception.HoodieException;
-
-import org.apache.avro.generic.GenericRecord;
-import org.apache.avro.generic.IndexedRecord;
-import org.apache.avro.specific.SpecificData;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.Path;
-import org.apache.log4j.LogManager;
-import org.apache.log4j.Logger;
 import org.springframework.shell.core.CommandMarker;
 import org.springframework.shell.core.annotation.CliCommand;
 import org.springframework.shell.core.annotation.CliOption;
@@ -60,10 +58,10 @@ import java.util.stream.Collectors;
 
 /**
  * CLI commands to export various information from a HUDI dataset.
- *
+ * <p>
  * "export instants": Export Instants and their metadata from the Timeline to 
a local
- *directory specified by the parameter --localFolder
- *  The instants are exported in the json format.
+ * directory specified by the parameter --localFolder
+ * The instants are exported in the json format.
  */
 @Component
 public class ExportCommand implements CommandMarker {
@@ -83,7 +81,7 @@ public class ExportCommand implements CommandMarker {
 int numExports = limit == -1 ? Integer.MAX_VALUE : limit;
 int numCopied = 0;
 
-if (! new File(localFolder).isDirectory()) {
+if (!new File(localFolder).isDirectory()) {
   throw new HoodieException(localFolder + " is not a valid local directory");
 }
 
@@ -94,7 +92,7 @@ public class ExportCommand implements CommandMarker {
 
 // Archived instants are in the commit archive files
 FileStatus[] statuses = FSUtils.getFs(basePath, HoodieCLI.conf).globStatus(archivePath);
-List<FileStatus> archivedStatuses = Arrays.stream(statuses).sorted((f1, f2) -> (int)(f1.getModificationTime() - f2.getModificationTime())).collect(Collectors.toList());
+List<FileStatus> archivedStatuses = Arrays.stream(statuses).sorted((f1, f2) -> (int) (f1.getModificationTime() - f2.getModificationTime())).collect(Collectors.toList());
 
 if (descending) {
   Collections.reverse(nonArchivedInstants);
@@ -115,11 +113,11 @@ public class ExportCommand implements CommandMarker {
 
   private int copyArchivedInstants(List<FileStatus> statuses, Set<String> actionSet, int limit, String localFolder) throws Exception {
 int copyCount = 0;
+FileSystem fileSystem = FSUtils.getFs(HoodieCLI.getTableMetaClient().getBasePath(), HoodieCLI.conf);
 
 for (FileStatus fs : statuses) {
   // read the archived file
-  Reader reader = HoodieLogFormat.newReader(FSUtils.getFs(HoodieCLI.getTableMetaClient().getBasePath(), HoodieCLI.conf),
-  new HoodieLogFile(fs.getPath()), HoodieArchivedMetaEntry.getClassSchema());
+  Reader reader = HoodieLogFormat.newReader(fileSystem, new HoodieLogFile(fs.getPath()), HoodieArchivedMetaEntry.getClassSchema());
 
   // read the avro blocks
   while (reader.hasNext() && copyCount < limit) {
@@ -130,7 +128,7 @@ public class ExportCommand implements CommandMarker {
 // Archived instants are saved as arvo encoded HoodieArchived

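The notable change in copyArchivedInstants is that the FileSystem is now resolved once, outside the per-file loop, and reused by every HoodieLogFormat reader. Below is a minimal Scala sketch of the same listing-and-sorting step, assuming a Hadoop Configuration; the archive glob and helper name are illustrative, not taken from this commit:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}

def sortedArchiveFiles(basePath: String, conf: Configuration): Seq[Path] = {
  // Resolve the FileSystem once, mirroring the hoisted FSUtils.getFs call.
  val fs = FileSystem.get(new Path(basePath).toUri, conf)
  // globStatus returns null when nothing matches, so guard before sorting.
  Option(fs.globStatus(new Path(basePath, ".hoodie/.commits_.archive*")))
    .getOrElse(Array.empty[FileStatus])
    .sortBy(_.getModificationTime)
    .map(_.getPath)
    .toSeq
}

Sorting on the Long timestamp directly also sidesteps the (int) (t1 - t2) subtraction in the hunk above, which can overflow for widely spaced modification times.
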
[hudi] branch master updated: [HUDI-3499] Add Call Procedure for show rollbacks (#5848)

2022-06-15 Thread mengtao
This is an automated email from the ASF dual-hosted git repository.

mengtao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
 new 7b946cf351 [HUDI-3499] Add Call Procedure for show rollbacks (#5848)
7b946cf351 is described below

commit 7b946cf35142139ab90320d943c90d905722989f
Author: superche <73096722+hechao-u...@users.noreply.github.com>
AuthorDate: Wed Jun 15 16:50:15 2022 +0800

[HUDI-3499] Add Call Procedure for show rollbacks (#5848)

* Add Call Procedure for show rollbacks

* fix

* add UT for show_rollback_detail and exception handling

Co-authored-by: superche 
---
 .../hudi/command/procedures/HoodieProcedures.scala |   2 +
 .../procedures/ShowRollbacksProcedure.scala| 147 +
 .../sql/hudi/procedure/TestCallProcedure.scala | 105 +++
 3 files changed, 254 insertions(+)

diff --git 
a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/HoodieProcedures.scala
 
b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/HoodieProcedures.scala
index 2b720bb94d..7cfeaaa0b6 100644
--- 
a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/HoodieProcedures.scala
+++ 
b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/HoodieProcedures.scala
@@ -45,6 +45,8 @@ object HoodieProcedures {
 mapBuilder.put(ShowCommitsMetadataProcedure.NAME, 
ShowCommitsMetadataProcedure.builder)
 mapBuilder.put(ShowSavepointsProcedure.NAME, 
ShowSavepointsProcedure.builder)
 mapBuilder.put(DeleteMarkerProcedure.NAME, DeleteMarkerProcedure.builder)
+mapBuilder.put(ShowRollbacksProcedure.NAME, ShowRollbacksProcedure.builder)
+mapBuilder.put(ShowRollbackDetailProcedure.NAME, 
ShowRollbackDetailProcedure.builder)
 mapBuilder.build
   }
 }
diff --git 
a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowRollbacksProcedure.scala
 
b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowRollbacksProcedure.scala
new file mode 100644
index 00..e5cacdb062
--- /dev/null
+++ 
b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowRollbacksProcedure.scala
@@ -0,0 +1,147 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.hudi.command.procedures
+
+import java.io.IOException
+import java.util
+import java.util.function.Supplier
+
+import org.apache.hudi.avro.model.HoodieRollbackMetadata
+import org.apache.hudi.common.table.HoodieTableMetaClient
+import org.apache.hudi.common.table.timeline.HoodieInstant.State
+import org.apache.hudi.common.table.timeline.HoodieTimeline.ROLLBACK_ACTION
+import org.apache.hudi.common.table.timeline.{HoodieActiveTimeline, 
HoodieInstant, HoodieTimeline, TimelineMetadataUtils}
+import org.apache.hudi.common.util.CollectionUtils
+import org.apache.hudi.exception.HoodieException
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, 
StructType}
+
+import scala.collection.JavaConversions.asScalaBuffer
+import scala.collection.JavaConverters._
+
+class ShowRollbacksProcedure(showDetails: Boolean) extends BaseProcedure with 
ProcedureBuilder {
+  private val ROLLBACKS_PARAMETERS = Array[ProcedureParameter](
+ProcedureParameter.required(0, "table", DataTypes.StringType, None),
+ProcedureParameter.optional(1, "limit", DataTypes.IntegerType, 10)
+  )
+
+  private val ROLLBACK_PARAMETERS = Array[ProcedureParameter](
+ProcedureParameter.required(0, "table", DataTypes.StringType, None),
+ProcedureParameter.optional(1, "limit", DataTypes.IntegerType, 10),
+ProcedureParameter.required(2, "instant_time", DataTypes.StringType, None)
+  )
+
+  private val ROLLBACKS_OUTPUT_TYPE = new StructType(Array[StructField](
+StructField("instant", DataTypes.StringType, nullab

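The two mapBuilder.put registrations above expose the procedures through Spark SQL's call syntax. A hypothetical invocation, assuming the registered names are show_rollbacks and show_rollback_detail (the latter appears in the commit's test note) and an active SparkSession named spark with the Hudi extensions enabled:

// List the last 10 rollbacks of a table, then drill into a single instant.
// The table name and instant time are illustrative.
spark.sql("call show_rollbacks(table => 'hudi_tbl', limit => 10)").show(false)
spark.sql("call show_rollback_detail(table => 'hudi_tbl', instant_time => '20220615103000')").show(false)
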
[hudi] branch master updated: [HUDI-4192] HoodieHFileReader scan top cells after bottom cells throw NullPointerException (#5755)

2022-06-05 Thread mengtao
This is an automated email from the ASF dual-hosted git repository.

mengtao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
 new 73b0be3c96 [HUDI-4192] HoodieHFileReader scan top cells after bottom 
cells throw NullPointerException (#5755)
73b0be3c96 is described below

commit 73b0be3c962112efe541ae04fe0ea6f298558f17
Author: marchpure 
AuthorDate: Mon Jun 6 12:07:26 2022 +0800

[HUDI-4192] HoodieHFileReader scan top cells after bottom cells throw 
NullPointerException (#5755)

Seek to top cells to avoid NullPointerException
---
 .../io/storage/TestHoodieHFileReaderWriter.java| 32 ++
 .../apache/hudi/io/storage/HoodieHFileReader.java  |  6 
 2 files changed, 38 insertions(+)

diff --git 
a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieHFileReaderWriter.java
 
b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieHFileReaderWriter.java
index da6f717258..baede154c9 100644
--- 
a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieHFileReaderWriter.java
+++ 
b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieHFileReaderWriter.java
@@ -294,6 +294,38 @@ public class TestHoodieHFileReaderWriter extends 
TestHoodieReaderWriterBase {
 StreamSupport.stream(Spliterators.spliteratorUnknownSize(iterator, 
Spliterator.ORDERED), false)
 .collect(Collectors.toList());
 assertEquals(Collections.emptyList(), recordsByPrefix);
+
+// filter for "key50" and "key1" : entries from key50 and 'key10 to key19' 
should be matched.
+List expectedKey50and1s = allRecords.stream().filter(entry 
-> (entry.get("_row_key").toString()).contains("key1")
+|| 
(entry.get("_row_key").toString()).contains("key50")).collect(Collectors.toList());
+iterator =
+hfileReader.getRecordsByKeyPrefixIterator(Arrays.asList("key50", 
"key1"), avroSchema);
+recordsByPrefix =
+StreamSupport.stream(Spliterators.spliteratorUnknownSize(iterator, 
Spliterator.ORDERED), false)
+.collect(Collectors.toList());
+assertEquals(expectedKey50and1s, recordsByPrefix);
+
+// filter for "key50" and "key0" : entries from key50 and 'key00 to key09' 
should be matched.
+List expectedKey50and0s = allRecords.stream().filter(entry 
-> (entry.get("_row_key").toString()).contains("key0")
+|| 
(entry.get("_row_key").toString()).contains("key50")).collect(Collectors.toList());
+iterator =
+hfileReader.getRecordsByKeyPrefixIterator(Arrays.asList("key50", 
"key0"), avroSchema);
+recordsByPrefix =
+StreamSupport.stream(Spliterators.spliteratorUnknownSize(iterator, 
Spliterator.ORDERED), false)
+.collect(Collectors.toList());
+assertEquals(expectedKey50and0s, recordsByPrefix);
+
+// filter for "key1" and "key0" : entries from 'key10 to key19' and 'key00 
to key09' should be matched.
+List expectedKey1sand0s = expectedKey1s;
+expectedKey1sand0s.addAll(allRecords.stream()
+.filter(entry -> (entry.get("_row_key").toString()).contains("key0"))
+.collect(Collectors.toList()));
+iterator =
+hfileReader.getRecordsByKeyPrefixIterator(Arrays.asList("key1", 
"key0"), avroSchema);
+recordsByPrefix =
+StreamSupport.stream(Spliterators.spliteratorUnknownSize(iterator, 
Spliterator.ORDERED), false)
+.collect(Collectors.toList());
+assertEquals(expectedKey1sand0s, recordsByPrefix);
   }
 
   @ParameterizedTest
diff --git 
a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileReader.java 
b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileReader.java
index 899c2475da..0bf31d2a25 100644
--- 
a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileReader.java
+++ 
b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileReader.java
@@ -258,6 +258,12 @@ public class HoodieHFileReader 
implements HoodieFileRea
   if (!scanner.next()) {
 return Collections.emptyIterator();
   }
+} else if (val == -1) {
+  // If the scanner is already at the top of the hfile, avoid triggering seekTo again.
+  Option headerCell = 
Option.fromJavaOptional(scanner.getReader().getFirstKey());
+  if (headerCell.isPresent() && 
!headerCell.get().equals(scanner.getCell())) {
+scanner.seekTo();
+  }
 }
 
 class KeyPrefixIterator implements Iterator {

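The new else-if branch re-seeks to the HFile's first cell only when the scanner is not already positioned there, which is exactly what the new tests exercise by querying prefixes out of key order. A minimal sketch of that read path, with the generic signatures the mail archive stripped from the diff filled in as assumptions:

import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.hudi.io.storage.HoodieHFileReader
import scala.collection.JavaConverters._

// Asking for "key50" before "key1" forces a backward seek on the second
// prefix; before this fix that path could hit the NullPointerException.
def readOutOfOrderPrefixes(reader: HoodieHFileReader[GenericRecord],
                           schema: Schema): List[GenericRecord] =
  reader.getRecordsByKeyPrefixIterator(List("key50", "key1").asJava, schema)
    .asScala
    .toList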


[hudi] branch master updated: [HUDI-4162] Fixed some constant mapping issues. (#5700)

2022-05-26 Thread mengtao
This is an automated email from the ASF dual-hosted git repository.

mengtao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
 new 57dbe57bed [HUDI-4162] Fixed some constant mapping issues. (#5700)
57dbe57bed is described below

commit 57dbe57bed963f529b2921b6bb6253a90a1a45af
Author: watermelon12138 <49849410+watermelon12...@users.noreply.github.com>
AuthorDate: Fri May 27 14:08:54 2022 +0800

[HUDI-4162] Fixed some constant mapping issues. (#5700)

Co-authored-by: y00617041 
---
 .../src/main/scala/org/apache/hudi/DataSourceOptions.scala| 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git 
a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala
 
b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala
index ac4d0e5794..a62a402b6a 100644
--- 
a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala
+++ 
b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala
@@ -540,13 +540,13 @@ object DataSourceWriteOptions {
   val HIVE_SYNC_ENABLED_OPT_KEY = HiveSyncConfig.HIVE_SYNC_ENABLED.key()
   /** @deprecated Use {@link META_SYNC_ENABLED} and its methods instead */
   @Deprecated
-  val META_SYNC_ENABLED_OPT_KEY = 
HoodieSyncConfig.META_SYNC_DATABASE_NAME.key()
+  val META_SYNC_ENABLED_OPT_KEY = HoodieSyncConfig.META_SYNC_ENABLED.key()
   /** @deprecated Use {@link HIVE_DATABASE} and its methods instead */
   @Deprecated
   val HIVE_DATABASE_OPT_KEY = HoodieSyncConfig.META_SYNC_DATABASE_NAME.key()
   /** @deprecated Use {@link HIVE_TABLE} and its methods instead */
   @Deprecated
-  val HIVE_TABLE_OPT_KEY = HoodieSyncConfig.META_SYNC_DATABASE_NAME.key()
+  val HIVE_TABLE_OPT_KEY = HoodieSyncConfig.META_SYNC_TABLE_NAME.key()
   /** @deprecated Use {@link HIVE_BASE_FILE_FORMAT} and its methods instead */
   @Deprecated
   val HIVE_BASE_FILE_FORMAT_OPT_KEY = 
HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT.key()


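Both deprecated aliases previously pointed at META_SYNC_DATABASE_NAME, so any writer still using them silently set the Hive database name instead of enabling meta sync or naming the table. A hedged sketch of a writer relying on the corrected aliases; df stands for any DataFrame, and the path and table name are illustrative:

import org.apache.hudi.DataSourceWriteOptions

// With the fix, these aliases resolve to META_SYNC_ENABLED and
// META_SYNC_TABLE_NAME, as their deprecation notes promise.
df.write.format("hudi")
  .option(DataSourceWriteOptions.META_SYNC_ENABLED_OPT_KEY, "true")
  .option(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY, "hudi_tbl")
  .option("hoodie.table.name", "hudi_tbl")
  .mode("append")
  .save("/tmp/hudi_tbl")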

[hudi] branch master updated (924e2e96a6 -> cacbd98687)

2022-04-27 Thread mengtao
This is an automated email from the ASF dual-hosted git repository.

mengtao pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


from 924e2e96a6 Claim RFC 52 for Introduce Secondary Index to Improve HUDI 
Query Performance (#5441)
 add cacbd98687 [HUDI-3945] After the async compaction operation is 
complete, the task should exit. (#5391)

No new revisions were added by this update.

Summary of changes:
 .../src/main/java/org/apache/hudi/utilities/HoodieCompactor.java  | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)



[hudi] branch master updated: [HUDI-3781] fix spark delete sql can not delete record (#5215)

2022-04-07 Thread mengtao
This is an automated email from the ASF dual-hosted git repository.

mengtao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
 new 7a6272fba1 [HUDI-3781] fix spark delete sql can not delete record 
(#5215)
7a6272fba1 is described below

commit 7a6272fba150fa9d7acd0b57cc4041ec49019faf
Author: KnightChess <981159...@qq.com>
AuthorDate: Fri Apr 8 14:26:40 2022 +0800

[HUDI-3781] fix spark delete sql can not delete record (#5215)
---
 .../spark/sql/hudi/ProvidesHoodieConfig.scala  |  5 ++-
 .../apache/spark/sql/hudi/TestDeleteTable.scala| 47 ++
 2 files changed, 51 insertions(+), 1 deletion(-)

diff --git 
a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/ProvidesHoodieConfig.scala
 
b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/ProvidesHoodieConfig.scala
index 74255473b5..31fb0ad6cb 100644
--- 
a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/ProvidesHoodieConfig.scala
+++ 
b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/ProvidesHoodieConfig.scala
@@ -255,7 +255,10 @@ trait ProvidesHoodieConfig extends Logging {
 val hoodieProps = getHoodieProps(catalogProperties, tableConfig, 
sparkSession.sqlContext.conf)
 val hiveSyncConfig = buildHiveSyncConfig(hoodieProps, hoodieCatalogTable)
 
-withSparkConf(sparkSession, hoodieCatalogTable.catalogProperties) {
+// the write operation set in table properties must not override the delete
+val options = hoodieCatalogTable.catalogProperties.-(OPERATION.key())
+
+withSparkConf(sparkSession, options) {
   Map(
 "path" -> path,
 RECORDKEY_FIELD.key -> hoodieCatalogTable.primaryKeys.mkString(","),
diff --git 
a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestDeleteTable.scala
 
b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestDeleteTable.scala
index f005a14d7f..9c693f9626 100644
--- 
a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestDeleteTable.scala
+++ 
b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestDeleteTable.scala
@@ -151,4 +151,51 @@ class TestDeleteTable extends TestHoodieSqlBase {
   }
 }
   }
+
+  test("Test Delete Table with op upsert") {
+withTempDir { tmp =>
+  Seq("cow", "mor").foreach {tableType =>
+val tableName = generateTableName
+// create table
+spark.sql(
+  s"""
+ |create table $tableName (
+ |  id int,
+ |  name string,
+ |  price double,
+ |  ts long
+ |) using hudi
+ | location '${tmp.getCanonicalPath}/$tableName'
+ | tblproperties (
+ |  type = '$tableType',
+ |  primaryKey = 'id',
+ |  preCombineField = 'ts',
+ |  hoodie.datasource.write.operation = 'upsert'
+ | )
+   """.stripMargin)
+// insert data to table
+spark.sql(s"insert into $tableName select 1, 'a1', 10, 1000")
+checkAnswer(s"select id, name, price, ts from $tableName")(
+  Seq(1, "a1", 10.0, 1000)
+)
+
+// delete data from table
+spark.sql(s"delete from $tableName where id = 1")
+checkAnswer(s"select count(1) from $tableName") (
+  Seq(0)
+)
+
+spark.sql(s"insert into $tableName select 2, 'a2', 10, 1000")
+spark.sql(s"delete from $tableName where id = 1")
+checkAnswer(s"select id, name, price, ts from $tableName")(
+  Seq(2, "a2", 10.0, 1000)
+)
+
+spark.sql(s"delete from $tableName")
+checkAnswer(s"select count(1) from $tableName")(
+  Seq(0)
+)
+  }
+}
+  }
 }

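The test pins down the ProvidesHoodieConfig change above: with hoodie.datasource.write.operation = 'upsert' stored as a table property, the delete command has to drop that key before building its write config, otherwise the delete is issued as an upsert and the row survives. The core of the fix in isolation, with illustrative property values:

import org.apache.hudi.DataSourceWriteOptions.OPERATION

// Catalog properties as the delete command might see them.
val catalogProperties = Map(OPERATION.key() -> "upsert", "primaryKey" -> "id")

// Same removal as `catalogProperties.-(OPERATION.key())` in the diff:
// the table-level operation can no longer override DELETE.
val options = catalogProperties - OPERATION.key()
assert(!options.contains(OPERATION.key()))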


[hudi] branch master updated (b28f0d6ceb -> 3449e86989)

2022-04-04 Thread mengtao
This is an automated email from the ASF dual-hosted git repository.

mengtao pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


from b28f0d6ceb [HUDI-3290] Different file formats for the partition 
metadata file. (#5179)
 add 3449e86989 [HUDI-3780] improve drop partitions (#5178)

No new revisions were added by this update.

Summary of changes:
 .../hudi/aws/sync/AWSGlueCatalogSyncClient.java|   2 +-
 .../hudi/common/table/TableSchemaResolver.java |  19 ---
 .../hudi/metadata/HoodieTableMetadataUtil.java |  18 +++
 .../gcp/bigquery/HoodieBigQuerySyncClient.java |  12 +-
 .../sql/catalyst/catalog/HoodieCatalogTable.scala  |   5 +
 .../spark/sql/hudi/ProvidesHoodieConfig.scala  | 173 +
 .../AlterHoodieTableDropPartitionCommand.scala |  41 +
 .../hudi/command/DeleteHoodieTableCommand.scala|  44 +-
 .../hudi/command/MergeIntoHoodieTableCommand.scala |  24 +--
 .../hudi/sync/datahub/DataHubSyncClient.java   |   4 +-
 .../java/org/apache/hudi/dla/HoodieDLAClient.java  |   4 +-
 .../java/org/apache/hudi/hive/HiveSyncTool.java|   2 +-
 .../org/apache/hudi/hive/HoodieHiveClient.java |   2 +-
 .../hudi/sync/common/AbstractSyncHoodieClient.java |  26 ++--
 14 files changed, 216 insertions(+), 160 deletions(-)



[hudi] branch master updated (a048e94 -> 98b4e97)

2022-03-31 Thread mengtao
This is an automated email from the ASF dual-hosted git repository.

mengtao pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git.


from a048e94  [HUDI-3743] Support DELETE_PARTITION for metadata table 
(#5169)
 add 98b4e97  [HUDI-3406] Rollback incorrectly relying on FS listing 
instead of Com… (#4957)

No new revisions were added by this update.

Summary of changes:
 .../hudi/client/utils/MetadataConversionUtils.java |  17 +-
 .../table/action/rollback/BaseRollbackHelper.java  |   5 -
 .../rollback/ListingBasedRollbackHelper.java   | 150 --
 .../rollback/ListingBasedRollbackStrategy.java | 302 +++--
 .../hudi/table/action/rollback/RollbackUtils.java  | 167 
 .../SerializablePathFilter.java}   |   9 +-
 .../table/upgrade/ZeroToOneUpgradeHandler.java |  34 +--
 .../TestMergeOnReadRollbackActionExecutor.java |   5 +-
 .../hudi/common/model/HoodieCommitMetadata.java|  13 +
 9 files changed, 329 insertions(+), 373 deletions(-)
 delete mode 100644 
hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/ListingBasedRollbackHelper.java
 copy 
hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/{commit/Partitioner.java
 => rollback/SerializablePathFilter.java} (83%)


[hudi] branch master updated (73a21092 -> 7889c78)

2022-03-31 Thread mengtao
This is an automated email from the ASF dual-hosted git repository.

mengtao pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git.


from 73a21092 [HUDI-3732] Fixing rollback validation (#5157)
 add 7889c78  [HUDI-3729][SPARK] fixed the perf regression by enabling vectorized reader for parquet files (#5168)

No new revisions were added by this update.

Summary of changes:
 .../org/apache/hudi/BaseFileOnlyRelation.scala |   5 +
 .../scala/org/apache/hudi/HoodieBaseRelation.scala |   2 +-
 .../hudi/MergeOnReadIncrementalRelation.scala  |   5 +
 .../apache/hudi/MergeOnReadSnapshotRelation.scala  |   5 +
 .../spark/hudi/benchmark/HoodieBenchmarkBase.scala |   9 ++
 .../benchmark/CowTableReadBenchmark.scala  | 136 +
 6 files changed, 161 insertions(+), 1 deletion(-)
 create mode 100644 
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/execution/benchmark/CowTableReadBenchmark.scala


[hudi] branch master updated (d31cde2 -> 1d0f4cc)

2022-03-27 Thread mengtao
This is an automated email from the ASF dual-hosted git repository.

mengtao pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git.


from d31cde2  [MINOR] Fix call command parser use spark3.2 (#5144)
 add 1d0f4cc  [HUDI-3538] Support Compaction Command Based on Call 
Procedure Command for Spark SQL (#4945)

No new revisions were added by this update.

Summary of changes:
 .../hudi/command/CompactionHoodiePathCommand.scala |  92 +++--
 .../command/CompactionHoodieTableCommand.scala |   4 +-
 .../command/CompactionShowHoodiePathCommand.scala  |  31 ++---
 .../command/CompactionShowHoodieTableCommand.scala |   1 +
 .../command/procedures/HoodieProcedureUtils.scala  |  33 +++--
 .../hudi/command/procedures/HoodieProcedures.scala |   2 +
 .../procedures/RunCompactionProcedure.scala| 144 +
 ...ocedure.scala => ShowCompactionProcedure.scala} |  41 --
 ...ocedure.scala => TestClusteringProcedure.scala} |   3 +-
 .../TestCompactionProcedure.scala} |  68 +-
 10 files changed, 266 insertions(+), 153 deletions(-)
 copy 
hudi-common/src/main/java/org/apache/hudi/common/data/HoodieAccumulator.java => 
hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/HoodieProcedureUtils.scala
 (54%)
 create mode 100644 
hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RunCompactionProcedure.scala
 copy 
hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/{ShowClusteringProcedure.scala
 => ShowCompactionProcedure.scala} (61%)
 rename 
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/procedure/{TestRunClusteringProcedure.scala
 => TestClusteringProcedure.scala} (99%)
 copy 
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/{TestCompactionTable.scala
 => procedure/TestCompactionProcedure.scala} (54%)


[hudi] branch master updated (465d553 -> 4b75cb6)

2022-03-14 Thread mengtao
This is an automated email from the ASF dual-hosted git repository.

mengtao pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git.


from 465d553  [HUDI-3600] Tweak the default cleaning strategy to be more 
streaming friendly for flink (#5010)
 add 4b75cb6  fix NPE when running schedule using spark-sql if the commit count is < hoodie.compact.inline.max.delta.commits (#4976)

No new revisions were added by this update.

Summary of changes:
 .../spark/sql/hudi/command/CompactionHoodiePathCommand.scala   | 10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)


[hudi] branch master updated (548000b -> 8859b48)

2022-03-09 Thread mengtao
This is an automated email from the ASF dual-hosted git repository.

mengtao pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git.


from 548000b  [HUDI-3568] Introduce ChainedSchemaPostProcessor to support 
setting multi processors at once (#4968)
 add 8859b48  [HUDI-3383] Sync column comments while syncing a hive table 
(#4960)

No new revisions were added by this update.

Summary of changes:
 .../org/apache/hudi/AvroConversionUtils.scala  |  29 +--
 .../{simple-test.avsc => simple-test-doced.avsc}   |   4 +-
 .../main/java/org/apache/hudi/DataSourceUtils.java |   2 +
 .../scala/org/apache/hudi/DataSourceOptions.scala  |   5 +
 .../org/apache/hudi/HoodieSparkSqlWriter.scala |   1 +
 .../org/apache/hudi/TestAvroConversionUtils.scala  | 216 +
 .../java/org/apache/hudi/hive/HiveSyncConfig.java  |   5 +
 .../java/org/apache/hudi/hive/HiveSyncTool.java|  16 ++
 .../org/apache/hudi/hive/HoodieHiveClient.java |  51 -
 .../java/org/apache/hudi/hive/ddl/DDLExecutor.java |  10 +
 .../org/apache/hudi/hive/ddl/HMSDDLExecutor.java   |  24 ++-
 .../hudi/hive/ddl/QueryBasedDDLExecutor.java   |  19 ++
 .../org/apache/hudi/hive/TestHiveSyncTool.java |  76 
 13 files changed, 442 insertions(+), 16 deletions(-)
 copy hudi-common/src/test/resources/{simple-test.avsc => 
simple-test-doced.avsc} (86%)


[hudi] branch master updated (be9a264 -> 62f534d)

2022-03-03 Thread mengtao
This is an automated email from the ASF dual-hosted git repository.

mengtao pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git.


from be9a264  [HUDI-3548] Fix async clustering not working when the user specifies the key "hoodie.datasource.clustering.async.enable" directly (#4905)
 add 62f534d  [HUDI-3445] Support Clustering Command Based on Call 
Procedure Command for Spark SQL (#4901)

No new revisions were added by this update.

Summary of changes:
 .../apache/hudi/config/HoodieClusteringConfig.java |  12 +
 .../org/apache/hudi/config/HoodieWriteConfig.java  |   4 +
 .../cluster/ClusteringPlanPartitionFilter.java |  16 +-
 .../PartitionAwareClusteringPlanStrategy.java  |  14 +-
 .../TestPartitionAwareClusteringPlanStrategy.java  |   2 +-
 .../org/apache/hudi/BaseHoodieTableFileIndex.java  |  11 +-
 .../scala/org/apache/hudi/HoodieCommonUtils.scala  | 286 +
 .../scala/org/apache/hudi/HoodieFileIndex.scala|  11 +-
 .../apache/hudi/SparkHoodieTableFileIndex.scala| 182 +
 .../org/apache/hudi/HoodieDataSourceHelpers.java   |  17 ++
 .../hudi/command/CompactionHoodiePathCommand.scala |  20 +-
 .../hudi/command/procedures/BaseProcedure.scala|  56 +++-
 .../hudi/command/procedures/HoodieProcedures.scala |   2 +
 .../RollbackToInstantTimeProcedure.scala   |   4 +-
 .../procedures/RunClusteringProcedure.scala| 176 +
 .../procedures/ShowClusteringProcedure.scala   |  69 +
 .../command/procedures/ShowCommitsProcedure.scala  |   4 +-
 .../sql/parser/HoodieSqlCommonAstBuilder.scala |  14 +-
 .../org/apache/hudi/TestDataSkippingUtils.scala|  29 +--
 .../apache/spark/sql/hudi/TestCallProcedure.scala  | 225 
 20 files changed, 908 insertions(+), 246 deletions(-)
 create mode 100644 
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieCommonUtils.scala
 create mode 100644 
hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RunClusteringProcedure.scala
 create mode 100644 
hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowClusteringProcedure.scala


[hudi] branch master updated (e78b2f1 -> c0e8b03)

2022-01-28 Thread mengtao
This is an automated email from the ASF dual-hosted git repository.

mengtao pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git.


from e78b2f1  [HUDI-2943] Complete pending clustering before deltastreamer 
sync (#4572)
 add c0e8b03  [HUDI-1977] Fix Hudi CLI tempview query issue (#4626)

No new revisions were added by this update.

Summary of changes:
 .../hudi/cli/utils/SparkTempViewProvider.java  | 25 +-
 1 file changed, 15 insertions(+), 10 deletions(-)


[hudi] branch master updated (518488c -> 4f6cdd7)

2022-01-07 Thread mengtao
This is an automated email from the ASF dual-hosted git repository.

mengtao pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git.


from 518488c  [HUDI-3185] HoodieConfig#getBoolean should return false when 
default not set (#4536)
 add 4f6cdd7  [HUDI-3192] Spark metastore schema evolution broken (#4533)

No new revisions were added by this update.

Summary of changes:
 .../src/main/java/org/apache/hudi/hive/HiveSyncTool.java  | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)


[hudi] branch master updated (37b15ff -> a66212d)

2022-01-04 Thread mengtao
This is an automated email from the ASF dual-hosted git repository.

mengtao pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git.


from 37b15ff  [HUDI-3147] Add endpoint_url to dynamodb lock provider (#4500)
 add a66212d  [HUDI-2966] Closing LogRecordScanner in compactor (#4478)

No new revisions were added by this update.

Summary of changes:
 .../apache/hudi/hadoop/realtime/RealtimeCompactedRecordReader.java | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)


[hudi] branch master updated: [HUDI-3107]Fix HiveSyncTool drop partitions using JDBC or hivesql or hms (#4453)

2021-12-30 Thread mengtao
This is an automated email from the ASF dual-hosted git repository.

mengtao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
 new ef9923f  [HUDI-3107]Fix HiveSyncTool drop partitions using JDBC or 
hivesql or hms (#4453)
ef9923f is described below

commit ef9923fc5551851ec4ee71896a62c0615da45ee8
Author: YueZhang <69956021+zhangyue19921...@users.noreply.github.com>
AuthorDate: Fri Dec 31 15:56:33 2021 +0800

[HUDI-3107]Fix HiveSyncTool drop partitions using JDBC or hivesql or hms 
(#4453)

* constructDropPartitions when drop partitions using jdbc

* done

* done

* code style

* code review

Co-authored-by: yuezhang 
---
 .../org/apache/hudi/hive/ddl/HMSDDLExecutor.java   |  4 +-
 .../apache/hudi/hive/ddl/HiveQueryDDLExecutor.java |  4 +-
 .../org/apache/hudi/hive/ddl/JDBCExecutor.java | 51 --
 .../hudi/hive/ddl/QueryBasedDDLExecutor.java   |  4 +-
 .../apache/hudi/hive/util/HivePartitionUtil.java   | 51 ++
 .../org/apache/hudi/hive/TestHiveSyncTool.java | 42 ++
 .../apache/hudi/hive/testutils/HiveTestUtil.java   | 21 +
 7 files changed, 169 insertions(+), 8 deletions(-)

diff --git 
a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/ddl/HMSDDLExecutor.java
 
b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/ddl/HMSDDLExecutor.java
index d3efebe..c3c5226 100644
--- 
a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/ddl/HMSDDLExecutor.java
+++ 
b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/ddl/HMSDDLExecutor.java
@@ -24,6 +24,7 @@ import org.apache.hudi.common.fs.StorageSchemes;
 import org.apache.hudi.hive.HiveSyncConfig;
 import org.apache.hudi.hive.HoodieHiveSyncException;
 import org.apache.hudi.hive.PartitionValueExtractor;
+import org.apache.hudi.hive.util.HivePartitionUtil;
 import org.apache.hudi.hive.util.HiveSchemaUtil;
 
 import org.apache.hadoop.fs.FileSystem;
@@ -236,7 +237,8 @@ public class HMSDDLExecutor implements DDLExecutor {
 LOG.info("Drop partitions " + partitionsToDrop.size() + " on " + 
tableName);
 try {
   for (String dropPartition : partitionsToDrop) {
-client.dropPartition(syncConfig.databaseName, tableName, 
dropPartition, false);
+String partitionClause = 
HivePartitionUtil.getPartitionClauseForDrop(dropPartition, 
partitionValueExtractor, syncConfig);
+client.dropPartition(syncConfig.databaseName, tableName, 
partitionClause, false);
 LOG.info("Drop partition " + dropPartition + " on " + tableName);
   }
 } catch (TException e) {
diff --git 
a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/ddl/HiveQueryDDLExecutor.java
 
b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/ddl/HiveQueryDDLExecutor.java
index 7161194..a4debfb 100644
--- 
a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/ddl/HiveQueryDDLExecutor.java
+++ 
b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/ddl/HiveQueryDDLExecutor.java
@@ -34,6 +34,7 @@ import org.apache.hadoop.hive.ql.metadata.HiveException;
 import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse;
 import org.apache.hadoop.hive.ql.session.SessionState;
 import org.apache.hadoop.security.UserGroupInformation;
+import org.apache.hudi.hive.util.HivePartitionUtil;
 import org.apache.log4j.LogManager;
 import org.apache.log4j.Logger;
 
@@ -136,7 +137,8 @@ public class HiveQueryDDLExecutor extends 
QueryBasedDDLExecutor {
 LOG.info("Drop partitions " + partitionsToDrop.size() + " on " + 
tableName);
 try {
   for (String dropPartition : partitionsToDrop) {
-metaStoreClient.dropPartition(config.databaseName, tableName, 
dropPartition, false);
+String partitionClause = 
HivePartitionUtil.getPartitionClauseForDrop(dropPartition, 
partitionValueExtractor, config);
+metaStoreClient.dropPartition(config.databaseName, tableName, 
partitionClause, false);
 LOG.info("Drop partition " + dropPartition + " on " + tableName);
   }
 } catch (Exception e) {
diff --git 
a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/ddl/JDBCExecutor.java
 
b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/ddl/JDBCExecutor.java
index 493d4ee..997d6e0 100644
--- 
a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/ddl/JDBCExecutor.java
+++ 
b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/ddl/JDBCExecutor.java
@@ -18,6 +18,8 @@
 
 package org.apache.hudi.hive.ddl;
 
+import static org.apache.hudi.hive.util.HiveSchemaUtil.HIVE_ESCAPE_CHARACTER;
+
 import org.apache.hudi.hive.HiveSyncConfig;
 import org.apache.hudi.hive.HoodieHiveSyncException;
 
@@ -31,6 +33,7 @@ import java.sql.DriverManager;

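The recurring change across all three executors is that the raw partition path is now passed through HivePartitionUtil.getPartitionClauseForDrop before dropPartition is called, since the metastore expects a field='value' clause rather than a storage path. A rough, self-contained sketch of that translation; the field names, separator, and quoting are assumptions, and the real helper also consults the configured PartitionValueExtractor:

// Turn a storage partition path such as "2021/12/30" into the clause the
// metastore drop call expects, e.g. "year='2021',month='12',day='30'".
def partitionClauseForDrop(partitionPath: String,
                           partitionFields: Seq[String]): String =
  partitionFields.zip(partitionPath.split("/"))
    .map { case (field, value) => s"$field='$value'" }
    .mkString(",")
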
[hudi] branch master updated: [HUDI-3008] Fixing HoodieFileIndex partition column parsing for nested fields

2021-12-22 Thread mengtao
This is an automated email from the ASF dual-hosted git repository.

mengtao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
 new 7d046f9  [HUDI-3008] Fixing HoodieFileIndex partition column parsing 
for nested fields
 new b5890cd  Merge pull request #4308 from harsh1231/HUDI-3008
7d046f9 is described below

commit 7d046f914a059b2623d7f2a7627c44b15ccc0ddb
Author: harshal patil 
AuthorDate: Tue Dec 14 17:28:18 2021 +0530

[HUDI-3008] Fixing HoodieFileIndex partition column parsing for nested 
fields
---
 .../scala/org/apache/hudi/HoodieFileIndex.scala| 25 
 .../org/apache/hudi/TestHoodieFileIndex.scala  | 27 --
 2 files changed, 46 insertions(+), 6 deletions(-)

diff --git 
a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieFileIndex.scala
 
b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieFileIndex.scala
index f9b68cb..0ed1b48 100644
--- 
a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieFileIndex.scala
+++ 
b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieFileIndex.scala
@@ -18,7 +18,6 @@
 package org.apache.hudi
 
 import org.apache.hadoop.fs.{FileStatus, Path}
-
 import org.apache.hudi.DataSourceReadOptions.{QUERY_TYPE, 
QUERY_TYPE_SNAPSHOT_OPT_VAL}
 import org.apache.hudi.client.common.HoodieSparkEngineContext
 import org.apache.hudi.common.config.HoodieMetadataConfig
@@ -27,7 +26,6 @@ import org.apache.hudi.common.model.FileSlice
 import org.apache.hudi.common.model.HoodieTableType.MERGE_ON_READ
 import org.apache.hudi.common.table.view.{FileSystemViewStorageConfig, 
HoodieTableFileSystemView}
 import org.apache.hudi.common.table.{HoodieTableMetaClient, 
TableSchemaResolver}
-
 import org.apache.spark.api.java.JavaSparkContext
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.catalyst.expressions.{And, AttributeReference, 
BoundReference, Expression, InterpretedPredicate}
@@ -37,7 +35,7 @@ import org.apache.spark.sql.execution.datasources.{FileIndex, 
FileStatusCache, N
 import 
org.apache.spark.sql.hudi.DataSkippingUtils.createColumnStatsIndexFilterExpr
 import org.apache.spark.sql.hudi.HoodieSqlUtils
 import org.apache.spark.sql.internal.SQLConf
-import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.types.{StructField, StructType}
 import org.apache.spark.sql.{AnalysisException, Column, SparkSession}
 import org.apache.spark.unsafe.types.UTF8String
 
@@ -108,7 +106,7 @@ case class HoodieFileIndex(
   private lazy val _partitionSchemaFromProperties: StructType = {
 val tableConfig = metaClient.getTableConfig
 val partitionColumns = tableConfig.getPartitionFields
-val nameFieldMap = schema.fields.map(filed => filed.name -> filed).toMap
+val nameFieldMap = generateNameFieldMap(Right(schema))
 
 if (partitionColumns.isPresent) {
   val partitionFields = partitionColumns.get().map(column =>
@@ -123,6 +121,25 @@ case class HoodieFileIndex(
 }
   }
 
+  /**
+   * This method traverses StructType recursively to build map of columnName 
-> StructField
+   * Note : If there is nesting of columns like ["a.b.c.d", "a.b.c.e"]  -> 
final map will have keys corresponding
+   * only to ["a.b.c.d", "a.b.c.e"] and not for subsets like ["a.b.c", "a.b"]
+   * @param structField
+   * @return map of ( columns names -> StructField )
+   */
+  private def generateNameFieldMap(structField: Either[StructField, 
StructType]) : Map[String, StructField] = {
+structField match {
+  case Right(field) => field.fields.map(f => 
generateNameFieldMap(Left(f))).flatten.toMap
+  case Left(field) => field.dataType match {
+case struct: StructType => generateNameFieldMap(Right(struct)).map {
+  case (key: String, sf: StructField)  => (field.name + "." + key, sf)
+}
+case _ => Map(field.name -> field)
+  }
+}
+  }
+
   private lazy val engineContext = new HoodieSparkEngineContext(new 
JavaSparkContext(spark.sparkContext))
 
   private lazy val configProperties = {
diff --git 
a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieFileIndex.scala
 
b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieFileIndex.scala
index 0c3918b..62f98cf 100644
--- 
a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieFileIndex.scala
+++ 
b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieFileIndex.scala
@@ -18,7 +18,6 @@
 package org.apache.hudi
 
 import java.util.Properties
-
 import org.apache.hudi.DataSourceWriteOptions._
 import org.apache.hudi.common.config.HoodieMetadataConfig
 import org.apache.hudi.common.table.HoodieTableMetaClient
@@ -31,6 +30,7 @@ import org
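The recursive generateNameFieldMap above is the heart of this fix: a partition column such as a.b.c must resolve against fully qualified leaf names, which the old one-level schema.fields.map(...) could never produce. A standalone illustration of the same flattening over Spark's StructType; the helper name is mine, not the commit's:

import org.apache.spark.sql.types._

// Flatten a possibly nested schema into a map of fully qualified
// leaf name -> StructField; intermediate structs get no entry.
def flatten(schema: StructType, prefix: String = ""): Map[String, StructField] =
  schema.fields.flatMap { f =>
    val name = if (prefix.isEmpty) f.name else s"$prefix.${f.name}"
    f.dataType match {
      case nested: StructType => flatten(nested, name) // recurse into structs
      case _                  => Map(name -> f)        // keep leaves only
    }
  }.toMap

val schema = StructType(Seq(
  StructField("id", IntegerType),
  StructField("a", StructType(Seq(
    StructField("b", StructType(Seq(
      StructField("c", StringType)))))))))

// flatten(schema).keySet == Set("id", "a.b.c"); no entries for "a" or "a.b".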