[hudi] branch master updated: [HUDI-915][HUDI-5656] Rebased `HoodieBootstrapRelation` onto `HoodieBaseRelation` (#7804)

2023-02-24 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
 new 2770ff50714 [HUDI-915][HUDI-5656] Rebased `HoodieBootstrapRelation` 
onto `HoodieBaseRelation` (#7804)
2770ff50714 is described below

commit 2770ff507141f013f7500354595137b52a543e8b
Author: Alexey Kudinkin 
AuthorDate: Fri Feb 24 08:43:49 2023 -0800

[HUDI-915][HUDI-5656] Rebased `HoodieBootstrapRelation` onto 
`HoodieBaseRelation` (#7804)

Currently `HoodieBootstrapRelation` treats partitioned tables improperly,
resulting in an NPE when trying to read a bootstrapped table.

To address that, `HoodieBootstrapRelation` has been rebased onto
`HoodieBaseRelation`, sharing the core of the reading semantics with Hudi's other
file-based Relation implementations for COW and MOR (such as schema handling,
file-listing, etc.)
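
As an illustration of what proper partition handling for a bootstrapped table involves, below is a minimal, self-contained sketch (not the actual Hudi API; the object and method names are hypothetical) of deriving an `InternalRow` of partition values from a file's relative partition path, assuming Hive-style `name=value` path segments and string-typed partition columns:

```scala
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.types.StructType
import org.apache.spark.unsafe.types.UTF8String

object PartitionValuesSketch {

  // e.g. relativePath = "region=emea/date=2023-02-24" with a (region, date) partition schema
  def partitionValuesFromPath(relativePath: String, partitionSchema: StructType): InternalRow = {
    // Parse Hive-style "name=value" path segments into a lookup map
    val valuesByName: Map[String, String] = relativePath.split("/").toSeq.flatMap { segment =>
      segment.split("=", 2) match {
        case Array(name, value) => Some(name -> value)
        case _                  => None
      }
    }.toMap

    // Assumes string-typed partition columns; real code would cast to each column's data type
    val values: Array[Any] = partitionSchema.fieldNames.map { name =>
      (valuesByName.get(name).map(UTF8String.fromString).orNull: Any)
    }
    new GenericInternalRow(values)
  }
}
```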
---
 .../scala/org/apache/hudi/HoodieBaseRelation.scala |  47 ++--
 .../scala/org/apache/hudi/HoodieBootstrapRDD.scala | 103 
 .../org/apache/hudi/HoodieBootstrapRelation.scala  | 259 +++--
 .../apache/hudi/MergeOnReadSnapshotRelation.scala  |   2 +-
 .../spark/sql/hudi/HoodieSqlCommonUtils.scala  |  30 ++-
 .../functional/TestDataSourceForBootstrap.scala| 166 +++--
 .../deltastreamer/TestHoodieDeltaStreamer.java |   4 +
 7 files changed, 344 insertions(+), 267 deletions(-)

diff --git 
a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBaseRelation.scala
 
b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBaseRelation.scala
index 99b5b5c87ba..cb02c59a690 100644
--- 
a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBaseRelation.scala
+++ 
b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBaseRelation.scala
@@ -48,6 +48,7 @@ import org.apache.spark.internal.Logging
 import org.apache.spark.rdd.RDD
 import 
org.apache.spark.sql.HoodieCatalystExpressionUtils.{convertToCatalystExpression,
 generateUnsafeProjection}
 import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.analysis.Resolver
 import org.apache.spark.sql.catalyst.expressions.{Expression, 
SubqueryExpression}
 import org.apache.spark.sql.execution.FileRelation
 import org.apache.spark.sql.execution.datasources._
@@ -66,7 +67,12 @@ import scala.util.{Failure, Success, Try}
 
 trait HoodieFileSplit {}
 
-case class HoodieTableSchema(structTypeSchema: StructType, avroSchemaStr: 
String, internalSchema: Option[InternalSchema] = None)
+case class HoodieTableSchema(structTypeSchema: StructType, avroSchemaStr: 
String, internalSchema: Option[InternalSchema] = None) {
+
+  def this(structTypeSchema: StructType) =
+this(structTypeSchema, convertToAvroSchema(structTypeSchema).toString)
+
+}
 
 case class HoodieTableState(tablePath: String,
 latestCommitTimestamp: Option[String],
@@ -98,6 +104,8 @@ abstract class HoodieBaseRelation(val sqlContext: SQLContext,
 
   protected val sparkSession: SparkSession = sqlContext.sparkSession
 
+  protected lazy val resolver: Resolver = 
sparkSession.sessionState.analyzer.resolver
+
   protected lazy val conf: Configuration = new 
Configuration(sqlContext.sparkContext.hadoopConfiguration)
   protected lazy val jobConf = new JobConf(conf)
 
@@ -174,8 +182,6 @@ abstract class HoodieBaseRelation(val sqlContext: 
SQLContext,
 
   protected lazy val tableStructSchema: StructType = {
 val converted = 
AvroConversionUtils.convertAvroSchemaToStructType(tableAvroSchema)
-
-val resolver = sparkSession.sessionState.analyzer.resolver
 val metaFieldMetadata = sparkAdapter.createCatalystMetadataForMetaField
 
 // NOTE: Here we annotate meta-fields with corresponding metadata such 
that Spark (>= 3.2)
@@ -466,10 +472,15 @@ abstract class HoodieBaseRelation(val sqlContext: 
SQLContext,
* For enable hoodie.datasource.write.drop.partition.columns, need to create 
an InternalRow on partition values
* and pass this reader on parquet file. So that, we can query the partition 
columns.
*/
-  protected def getPartitionColumnsAsInternalRow(file: FileStatus): 
InternalRow = {
+
+  protected def getPartitionColumnsAsInternalRow(file: FileStatus): 
InternalRow =
+getPartitionColumnsAsInternalRowInternal(file, 
shouldExtractPartitionValuesFromPartitionPath)
+
+  protected def getPartitionColumnsAsInternalRowInternal(file: FileStatus,
+ 
extractPartitionValuesFromPartitionPath: Boolean): InternalRow = {
 try {
   val tableConfig = metaClient.getTableConfig
-  if (shouldExtractPartitionValuesFromPartitionPath) {
+  if (extractPartitionValuesFromPartitionPath) {
 val relativePath = new URI(metaClient.getBasePath).relativize(new 

[hudi] branch master updated (96032b9b768 -> 25fc4f2ab52)

2023-02-17 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


from 96032b9b768 [HUDI-5557]Avoid converting columns that are not indexed 
in CSI (#7672)
 add 25fc4f2ab52 [MINOR] De-duplicating Iterator implementations (#7752)

No new revisions were added by this update.

Summary of changes:
 .../hudi/cli/commands/ArchivedCommitsCommand.java  |  2 +-
 .../apache/hudi/cli/commands/ExportCommand.java|  2 +-
 .../hudi/cli/commands/HoodieLogFileCommand.java|  2 +-
 .../table/action/commit/HoodieMergeHelper.java |  2 +-
 .../MultipleSparkJobExecutionStrategy.java |  6 +--
 .../strategy/SingleSparkJobExecutionStrategy.java  |  6 +--
 .../hudi/io/storage/HoodieSparkParquetReader.java  |  6 +--
 .../functional/TestHoodieBackedMetadata.java   |  2 +-
 .../functional/TestHoodieBackedTableMetadata.java  |  2 +-
 .../table/log/AbstractHoodieLogRecordReader.java   | 29 +++-
 .../table/log/HoodieCDCLogRecordIterator.java  |  6 +--
 .../table/log/block/HoodieAvroDataBlock.java   |  9 ++--
 .../common/table/log/block/HoodieDataBlock.java|  2 +-
 .../table/log/block/HoodieHFileDataBlock.java  |  6 +--
 .../table/log/block/HoodieParquetDataBlock.java|  2 +-
 .../table/timeline/HoodieArchivedTimeline.java |  2 +-
 .../org/apache/hudi/common/util/BaseFileUtils.java |  1 +
 .../common/util/ClosableIteratorWithSchema.java| 55 --
 .../apache/hudi/common/util/IdentityIterator.java  | 44 -
 .../apache/hudi/common/util/MappingIterator.java   | 47 --
 .../apache/hudi/common/util/OrcReaderIterator.java |  1 +
 .../java/org/apache/hudi/common/util/OrcUtils.java |  1 +
 .../hudi/common/util/ParquetReaderIterator.java|  1 +
 .../org/apache/hudi/common/util/ParquetUtils.java  |  1 +
 .../common/util/collection/BitCaskDiskMap.java |  1 -
 .../util/{ => collection}/ClosableIterator.java| 23 -
 .../util/collection/CloseableMappingIterator.java  |  9 ++--
 .../common/util/collection/LazyFileIterable.java   |  1 -
 .../util/queue/IteratorBasedQueueProducer.java |  2 +-
 .../hudi/io/storage/HoodieAvroFileReaderBase.java  |  6 +--
 .../hudi/io/storage/HoodieAvroHFileReader.java |  8 ++--
 .../hudi/io/storage/HoodieAvroOrcReader.java   |  2 +-
 .../hudi/io/storage/HoodieAvroParquetReader.java   |  6 +--
 .../apache/hudi/io/storage/HoodieFileReader.java   |  2 +-
 .../hudi/io/storage/HoodieSeekingFileReader.java   |  2 +-
 .../hudi/metadata/HoodieBackedTableMetadata.java   |  2 +-
 .../common/functional/TestHoodieLogFormat.java |  2 +-
 .../hudi/sink/bootstrap/BootstrapOperator.java |  2 +-
 .../hudi/sink/clustering/ClusteringOperator.java   |  4 +-
 .../table/format/ParquetSplitRecordIterator.java   |  2 +-
 .../apache/hudi/table/format/RecordIterators.java  |  2 +-
 .../table/format/SchemaEvolvedRecordIterator.java  |  2 +-
 .../hudi/table/format/cdc/CdcInputFormat.java  |  2 +-
 .../table/format/cow/CopyOnWriteInputFormat.java   |  2 +-
 .../table/format/mor/MergeOnReadInputFormat.java   |  2 +-
 .../reader/DFSHoodieDatasetInputReader.java|  4 +-
 .../hudi/utilities/deltastreamer/DeltaSync.java|  6 +--
 47 files changed, 105 insertions(+), 226 deletions(-)
 delete mode 100644 
hudi-common/src/main/java/org/apache/hudi/common/util/ClosableIteratorWithSchema.java
 delete mode 100644 
hudi-common/src/main/java/org/apache/hudi/common/util/IdentityIterator.java
 delete mode 100644 
hudi-common/src/main/java/org/apache/hudi/common/util/MappingIterator.java
 rename hudi-common/src/main/java/org/apache/hudi/common/util/{ => 
collection}/ClosableIterator.java (70%)



[hudi] branch master updated: [HUDI-5557]Avoid converting columns that are not indexed in CSI (#7672)

2023-02-17 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
 new 96032b9b768 [HUDI-5557]Avoid converting columns that are not indexed 
in CSI (#7672)
96032b9b768 is described below

commit 96032b9b768cc0d4c23eb6a6afd5ce019d56826b
Author: rfyu <39233058+r...@users.noreply.github.com>
AuthorDate: Sat Feb 18 01:11:57 2023 +0800

[HUDI-5557]Avoid converting columns that are not indexed in CSI (#7672)

Avoid converting columns that are not indexed in CSI

If we directly set the statistics of non-indexed columns to null instead of
filtering those columns out, the pruned list of candidate base-files computed
from the Metadata Table's Column Statistics Index may be wrong.
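
To illustrate the fix, here is a minimal, self-contained sketch (plain Scala, not Hudi's actual `ColumnStatsIndexSupport` code; `ColStats` and `composeRow` are made up for illustration) of why only the indexed target columns are cherry-picked, in sorted order, when composing the transposed column-stats row. Appending null statistics for non-indexed columns would shift the min/max/null-count slots and misalign them with the index schema:

```scala
import scala.collection.immutable.TreeSet

final case class ColStats(min: Any, max: Any, nullCount: Long)

object ColumnStatsSketch {

  def composeRow(queryColumns: Seq[String],
                 indexedColumns: Set[String],
                 statsByColumn: Map[String, ColStats],
                 fileName: String,
                 valueCount: Long): Seq[Any] = {
    // Cherry-pick only the target columns that are actually indexed, in deterministic (sorted) order
    val targetIndexedColumns = TreeSet(queryColumns: _*).toSeq.filter(indexedColumns.contains)

    targetIndexedColumns.foldLeft(Seq[Any](fileName, valueCount)) { (acc, col) =>
      statsByColumn.get(col) match {
        // Stats present for this file: append the (min, max, nullCount) triplet
        case Some(stats) => acc ++ Seq(stats.min, stats.max, stats.nullCount)
        // Indexed column missing in this file: treat it as all-null (consistent with Parquet reads)
        case None        => acc ++ Seq(null, null, valueCount)
      }
    }
  }
}
```

For example, with queryColumns = Seq("b", "a", "c") and indexedColumns = Set("a", "c"), the row carries stat triplets only for "a" and "c" (in that order), lining up with an index schema composed from the same two columns.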

-
Co-authored-by: yrf余若凡 
---
 .../org/apache/hudi/ColumnStatsIndexSupport.scala  |  35 +++
 .../org/apache/hudi/ColumnStatsIndexHelper.java|   7 +-
 .../org/apache/hudi/TestDataSkippingUtils.scala|   2 +-
 .../hudi/functional/TestColumnStatsIndex.scala | 115 -
 4 files changed, 131 insertions(+), 28 deletions(-)

diff --git 
a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/ColumnStatsIndexSupport.scala
 
b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/ColumnStatsIndexSupport.scala
index 5cf7a5ec035..68f7a9f16c1 100644
--- 
a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/ColumnStatsIndexSupport.scala
+++ 
b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/ColumnStatsIndexSupport.scala
@@ -209,12 +209,16 @@ class ColumnStatsIndexSupport(spark: SparkSession,
 // NOTE: We're sorting the columns to make sure final index schema matches 
layout
 //   of the transposed table
 val sortedTargetColumnsSet = TreeSet(queryColumns:_*)
-val sortedTargetColumns = sortedTargetColumnsSet.toSeq
 
 // NOTE: This is a trick to avoid pulling all of 
[[ColumnStatsIndexSupport]] object into the lambdas'
 //   closures below
 val indexedColumns = this.indexedColumns
 
+// NOTE: It's crucial to maintain appropriate ordering of the columns
+//   matching table layout: hence, we cherry-pick individual columns
+//   instead of simply filtering in the ones we're interested in the 
schema
+val (indexSchema, targetIndexedColumns) = 
composeIndexSchema(sortedTargetColumnsSet.toSeq, indexedColumns, tableSchema)
+
 // Here we perform complex transformation which requires us to modify the 
layout of the rows
 // of the dataset, and therefore we rely on low-level RDD API to avoid 
incurring encoding/decoding
 // penalty of the [[Dataset]], since it's required to adhere to its schema 
at all times, while
@@ -257,7 +261,7 @@ class ColumnStatsIndexSupport(spark: SparkSession,
 // to align existing column-stats for individual file with the list of 
expected ones for the
 // whole transposed projection (a superset of all files)
 val columnRecordsMap = columnRecordsSeq.map(r => (r.getColumnName, 
r)).toMap
-val alignedColStatRecordsSeq = 
sortedTargetColumns.map(columnRecordsMap.get)
+val alignedColStatRecordsSeq = 
targetIndexedColumns.map(columnRecordsMap.get)
 
 val coalescedRowValuesSeq =
   alignedColStatRecordsSeq.foldLeft(ListBuffer[Any](fileName, 
valueCount)) {
@@ -267,31 +271,19 @@ class ColumnStatsIndexSupport(spark: SparkSession,
   acc ++= Seq(colStatRecord.getMinValue, 
colStatRecord.getMaxValue, colStatRecord.getNullCount)
 case None =>
   // NOTE: This could occur in either of the following cases:
-  //1. Column is not indexed in Column Stats Index: in 
this case we won't be returning
-  //   any statistics for such column (ie all stats will 
be null)
-  //2. Particular file does not have this particular 
column (which is indexed by Column Stats Index):
+  //1. Particular file does not have this particular 
column (which is indexed by Column Stats Index):
   //   in this case we're assuming missing column to 
essentially contain exclusively
   //   null values, we set min/max values as null and 
null-count to be equal to value-count (this
   //   behavior is consistent with reading non-existent 
columns from Parquet)
   //
   // This is a way to determine current column's index without 
explicit iteration (we're adding 3 stats / column)
-  val idx = acc.length / 3
-  val colName = sortedTargetColumns(idx)
-  val indexed = indexedColumns.contains(colName)
-
-  val nullCount = if (indexed

[hudi] branch master updated (653aa86145f -> 67f4c78c5a0)

2023-02-10 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


from 653aa86145f [MINOR] Fix wrong assertion in TestHoodieTableFactory.java 
(#7916)
 add 67f4c78c5a0 [HUDI-5758] Restoring state of `HoodieKey` to make sure 
it's binary compatible w/ its state in 0.12 (#7917)

No new revisions were added by this update.

Summary of changes:
 .../apache/spark/HoodieSparkKryoRegistrar.scala| 25 +--
 .../org/apache/hudi/common/model/DeleteRecord.java |  9 +++
 .../org/apache/hudi/common/model/HoodieKey.java| 28 --
 .../common/table/log/block/HoodieDeleteBlock.java  |  2 ++
 4 files changed, 44 insertions(+), 20 deletions(-)



[hudi] branch master updated: Cleaning up unnecessary relocation for com.google.common packages (#7900)

2023-02-09 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
 new 60dfe4d766d Cleaning up unnecessary relocation for com.google.common 
packages (#7900)
60dfe4d766d is described below

commit 60dfe4d766d6050bb811e4f7d51b88126b5d5575
Author: Alexey Kudinkin 
AuthorDate: Thu Feb 9 08:44:23 2023 -0800

Cleaning up unnecessary relocation for com.google.common packages (#7900)

Addresses an issue with the following relocation configs in the MR/Spark bundles,
stranded after the removal of Guava from the Hudi Spark and MR bundles:
```
<relocation>
  <pattern>com.google.common.</pattern>
  <shadedPattern>org.apache.hudi.com.google.common.</shadedPattern>
</relocation>
```

Such relocations entailed that all references to Guava from any class included in
the Hudi bundle would be shaded, even though Hudi isn't packaging Guava anymore,
potentially resulting in an exception when these classes try to access the Guava
provided by Spark, for example:

```
Caused by: java.lang.NoClassDefFoundError: 
org/apache/hudi/com/google/common/base/Preconditions
at org.apache.curator.ensemble.fixed.FixedEnsembleProvider.<init>(FixedEnsembleProvider.java:39)
at 
org.apache.curator.framework.CuratorFrameworkFactory$Builder.connectString(CuratorFrameworkFactory.java:193)
at 
org.apache.kyuubi.ha.client.zookeeper.ZookeeperClientProvider$.buildZookeeperClient(ZookeeperClientProvider.scala:62)
at org.apache.kyuubi.ha.client.zookeeper.ZookeeperDiscoveryClient.<init>(ZookeeperDiscoveryClient.scala:65)
... 45 more
```
---
 packaging/hudi-hadoop-mr-bundle/pom.xml | 4 
 packaging/hudi-spark-bundle/pom.xml | 4 
 2 files changed, 8 deletions(-)

diff --git a/packaging/hudi-hadoop-mr-bundle/pom.xml 
b/packaging/hudi-hadoop-mr-bundle/pom.xml
index 1df69d6ef8c..ec28d7f 100644
--- a/packaging/hudi-hadoop-mr-bundle/pom.xml
+++ b/packaging/hudi-hadoop-mr-bundle/pom.xml
@@ -117,10 +117,6 @@
   org.apache.parquet.avro.
   
org.apache.hudi.org.apache.parquet.avro.
 
-
-  com.google.common.
-  
org.apache.hudi.com.google.common.
-
 
   org.openjdk.jol.
   
org.apache.hudi.org.openjdk.jol.
diff --git a/packaging/hudi-spark-bundle/pom.xml 
b/packaging/hudi-spark-bundle/pom.xml
index 4e93f399275..32c7ee8bc90 100644
--- a/packaging/hudi-spark-bundle/pom.xml
+++ b/packaging/hudi-spark-bundle/pom.xml
@@ -189,10 +189,6 @@
   org.eclipse.jetty.
   
org.apache.hudi.org.apache.jetty.
 
-
-  com.google.common.
-  
org.apache.hudi.com.google.common.
-
 
   org.openjdk.jol.
   
org.apache.hudi.org.openjdk.jol.



[hudi] branch master updated: Make most of the Spark SQL DML operations configs overridable (#7850)

2023-02-04 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
 new 13ed95a9fd2 Make most of the Spark SQL DML operations configs 
overridable (#7850)
13ed95a9fd2 is described below

commit 13ed95a9fd2e0b330f8e16cb2b08714a432a70b4
Author: Alexey Kudinkin 
AuthorDate: Sat Feb 4 10:54:17 2023 -0800

Make most of the Spark SQL DML operations configs overridable (#7850)

This PR makes most of the Spark SQL operations' configs overridable, leaving only
a few that should not and could not be overridden by the user.
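
As a rough illustration of the intended layering (a simplified stand-in for `combineOptions`, not the actual Hudi implementation), user-provided options take precedence over the defaults, while a small set of mandatory options overrides everything else:

```scala
// Simplified stand-in for option layering: later maps win on key conflicts
object ConfigLayeringSketch {

  def combine(defaultOpts: Map[String, String],
              userOpts: Map[String, String],
              overridingOpts: Map[String, String]): Map[String, String] =
    defaultOpts ++ userOpts ++ overridingOpts

  def main(args: Array[String]): Unit = {
    val combined = combine(
      defaultOpts    = Map("hoodie.datasource.write.operation" -> "upsert"),
      userOpts       = Map("hoodie.datasource.write.operation" -> "insert"),
      overridingOpts = Map("path" -> "/tmp/hudi_table")
    )
    // The user's choice of "insert" survives; "path" cannot be overridden by the user
    println(combined)
  }
}
```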
---
 .../spark/sql/hudi/ProvidesHoodieConfig.scala  | 69 +++---
 1 file changed, 36 insertions(+), 33 deletions(-)

diff --git 
a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/ProvidesHoodieConfig.scala
 
b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/ProvidesHoodieConfig.scala
index 0c766f5135b..ffc4079824f 100644
--- 
a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/ProvidesHoodieConfig.scala
+++ 
b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/ProvidesHoodieConfig.scala
@@ -61,20 +61,10 @@ trait ProvidesHoodieConfig extends Logging {
 val hiveSyncConfig = buildHiveSyncConfig(sparkSession, hoodieCatalogTable, 
tableConfig)
 
 val defaultOpts = Map[String, String](
+  OPERATION.key -> UPSERT_OPERATION_OPT_VAL,
   KEYGENERATOR_CLASS_NAME.key -> classOf[SqlKeyGenerator].getCanonicalName,
-  OPERATION.key -> UPSERT_OPERATION_OPT_VAL
-)
-
-val overridingOpts = Map[String, String](
-  "path" -> hoodieCatalogTable.tableLocation,
-  RECORDKEY_FIELD.key -> hoodieCatalogTable.primaryKeys.mkString(","),
-  TBL_NAME.key -> hoodieCatalogTable.tableName,
-  PRECOMBINE_FIELD.key -> preCombineField,
-  HIVE_STYLE_PARTITIONING.key -> 
tableConfig.getHiveStylePartitioningEnable,
-  URL_ENCODE_PARTITIONING.key -> tableConfig.getUrlEncodePartitioning,
   SqlKeyGenerator.ORIGINAL_KEYGEN_CLASS_NAME -> 
tableConfig.getKeyGeneratorClassName,
   SqlKeyGenerator.PARTITION_SCHEMA -> 
hoodieCatalogTable.partitionSchema.toDDL,
-  PARTITIONPATH_FIELD.key -> tableConfig.getPartitionFieldProp,
   HoodieSyncConfig.META_SYNC_ENABLED.key -> 
hiveSyncConfig.getString(HoodieSyncConfig.META_SYNC_ENABLED.key),
   HiveSyncConfigHolder.HIVE_SYNC_ENABLED.key -> 
hiveSyncConfig.getString(HiveSyncConfigHolder.HIVE_SYNC_ENABLED.key),
   HiveSyncConfigHolder.HIVE_SYNC_MODE.key -> 
hiveSyncConfig.getStringOrDefault(HiveSyncConfigHolder.HIVE_SYNC_MODE, 
HiveSyncMode.HMS.name()),
@@ -85,6 +75,16 @@ trait ProvidesHoodieConfig extends Logging {
   HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE.key -> 
hiveSyncConfig.getBoolean(HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE).toString
 )
 
+val overridingOpts = Map[String, String](
+  "path" -> hoodieCatalogTable.tableLocation,
+  RECORDKEY_FIELD.key -> hoodieCatalogTable.primaryKeys.mkString(","),
+  TBL_NAME.key -> hoodieCatalogTable.tableName,
+  PRECOMBINE_FIELD.key -> preCombineField,
+  HIVE_STYLE_PARTITIONING.key -> 
tableConfig.getHiveStylePartitioningEnable,
+  URL_ENCODE_PARTITIONING.key -> tableConfig.getUrlEncodePartitioning,
+  PARTITIONPATH_FIELD.key -> tableConfig.getPartitionFieldProp
+)
+
 combineOptions(hoodieCatalogTable, tableConfig, 
sparkSession.sqlContext.conf,
   defaultOpts = defaultOpts, overridingOpts = overridingOpts)
   }
@@ -183,22 +183,10 @@ trait ProvidesHoodieConfig extends Logging {
   PAYLOAD_CLASS_NAME.key -> payloadClassName,
   // NOTE: By default insert would try to do deduplication in case that 
pre-combine column is specified
   //   for the table
-  HoodieWriteConfig.COMBINE_BEFORE_INSERT.key -> 
String.valueOf(hasPrecombineColumn)
-)
-
-val overridingOpts = extraOptions ++ Map(
-  "path" -> path,
-  TABLE_TYPE.key -> tableType,
-  TBL_NAME.key -> hoodieCatalogTable.tableName,
-  OPERATION.key -> operation,
-  HIVE_STYLE_PARTITIONING.key -> hiveStylePartitioningEnable,
-  URL_ENCODE_PARTITIONING.key -> urlEncodePartitioning,
+  HoodieWriteConfig.COMBINE_BEFORE_INSERT.key -> 
String.valueOf(hasPrecombineColumn),
   KEYGENERATOR_CLASS_NAME.key -> classOf[SqlKeyGenerator].getCanonicalName,
   SqlKeyGenerator.ORIGINAL_KEYGEN_CLASS_NAME -> keyGeneratorClassName,
   SqlKeyGenerator.PARTITION_SCHEMA -> 
hoodieCatalogTable.partitionSchema.toDDL,
-  RECORDKEY_FIELD.key -> hoodieCatalogTable.primaryKeys.mkString(","),
-  PRE

[hudi] branch master updated (3e1171cf70d -> f3ac50935bf)

2023-02-03 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


from 3e1171cf70d [HUDI-5653] Reset TestDataSource for 
TestHoodieDeltaStreamerWithMultiWriter (#7831)
 add f3ac50935bf [MINOR] Fixing CTAS configuration not propagated properly  
(#7832)

No new revisions were added by this update.

Summary of changes:
 .../spark/sql/hudi/ProvidesHoodieConfig.scala  |  2 +-
 .../apache/spark/sql/hudi/TestCreateTable.scala| 28 ++
 2 files changed, 29 insertions(+), 1 deletion(-)



[hudi] branch master updated: [HUDI-5691] Fixing `HoodiePruneFileSourcePartitions` to properly handle non-partitioned tables (#7833)

2023-02-02 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
 new 9c969380233 [HUDI-5691] Fixing `HoodiePruneFileSourcePartitions` to 
properly handle non-partitioned tables (#7833)
9c969380233 is described below

commit 9c969380233a27c6729aea75d618891e449e19bf
Author: Alexey Kudinkin 
AuthorDate: Thu Feb 2 21:39:40 2023 -0800

[HUDI-5691] Fixing `HoodiePruneFileSourcePartitions` to properly handle 
non-partitioned tables (#7833)

This change addresses the issue of the `HoodiePruneFileSourcePartitions` rule not
being applied to non-partitioned tables, resulting in their size being
incorrectly estimated by Spark
---
 .../analysis/HoodiePruneFileSourcePartitions.scala |  2 +-
 .../TestHoodiePruneFileSourcePartitions.scala  | 40 ++
 2 files changed, 27 insertions(+), 15 deletions(-)

diff --git 
a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/analysis/HoodiePruneFileSourcePartitions.scala
 
b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/analysis/HoodiePruneFileSourcePartitions.scala
index 3b86777e16e..46cb931a59b 100644
--- 
a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/analysis/HoodiePruneFileSourcePartitions.scala
+++ 
b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/analysis/HoodiePruneFileSourcePartitions.scala
@@ -41,7 +41,7 @@ case class HoodiePruneFileSourcePartitions(spark: 
SparkSession) extends Rule[Log
 
   override def apply(plan: LogicalPlan): LogicalPlan = plan transformDown {
 case op @ PhysicalOperation(projects, filters, lr @ 
LogicalRelation(HoodieRelationMatcher(fileIndex), _, _, _))
-  if sparkAdapter.isHoodieTable(lr, spark) && 
fileIndex.partitionSchema.nonEmpty && !fileIndex.hasPredicatesPushedDown =>
+  if sparkAdapter.isHoodieTable(lr, spark) && 
!fileIndex.hasPredicatesPushedDown =>
 
   val deterministicFilters = filters.filter(f => f.deterministic && 
!SubqueryExpression.hasSubquery(f))
   val normalizedFilters = exprUtils.normalizeExprs(deterministicFilters, 
lr.output)
diff --git 
a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/analysis/TestHoodiePruneFileSourcePartitions.scala
 
b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/analysis/TestHoodiePruneFileSourcePartitions.scala
index 06239697db9..aac2a4027a2 100644
--- 
a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/analysis/TestHoodiePruneFileSourcePartitions.scala
+++ 
b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/analysis/TestHoodiePruneFileSourcePartitions.scala
@@ -54,8 +54,11 @@ class TestHoodiePruneFileSourcePartitions extends 
HoodieClientTestBase with Scal
 )
 
   @ParameterizedTest
-  @CsvSource(value = Array("cow", "mor"))
-  def testPartitionFiltersPushDown(tableType: String): Unit = {
+  @CsvSource(value = Array(
+"cow,true", "cow,false",
+"mor,true", "mor,false"
+  ))
+  def testPartitionFiltersPushDown(tableType: String, partitioned: Boolean): 
Unit = {
 spark.sql(
   s"""
  |CREATE TABLE $tableName (
@@ -65,7 +68,7 @@ class TestHoodiePruneFileSourcePartitions extends 
HoodieClientTestBase with Scal
  |  ts long,
  |  partition string
  |) USING hudi
- |PARTITIONED BY (partition)
+ |${if (partitioned) "PARTITIONED BY (partition)" else ""}
  |TBLPROPERTIES (
  |  type = '$tableType',
  |  primaryKey = 'id',
@@ -103,27 +106,37 @@ class TestHoodiePruneFileSourcePartitions extends 
HoodieClientTestBase with Scal
 //  support (for partition-pruning) will only occur during 
execution phase, while file-listing
 //  actually happens during analysis stage
 case "eager" =>
-  assertEquals(1275, f.stats.sizeInBytes.longValue() / 1024)
-  assertEquals(1275, lr.stats.sizeInBytes.longValue() / 1024)
+  // NOTE: In case of partitioned table 3 files will be created, 
while in case of non-partitioned just 1
+  if (partitioned) {
+assertEquals(1275, f.stats.sizeInBytes.longValue() / 1024)
+assertEquals(1275, lr.stats.sizeInBytes.longValue() / 1024)
+  } else {
+// NOTE: We're adding 512 to make sure we always round to the 
next integer value
+assertEquals(425, (f.stats.sizeInBytes.longValue() + 512) / 
1024)
+assertEquals(425, (lr.stats.sizeInBytes.longValue() + 512) / 
1024)
+  }
 
 // Case 

[hudi] branch master updated: [HUDI-5678] Fix `deduceShuffleParallelism` in row-writing Bulk Insert helper (#7818)

2023-02-02 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
 new e3b95e88a76 [HUDI-5678] Fix `deduceShuffleParallelism` in row-writing 
Bulk Insert helper (#7818)
e3b95e88a76 is described below

commit e3b95e88a7645b3a03bf109671e69e354079fe5d
Author: Jon Vexler 
AuthorDate: Thu Feb 2 11:50:48 2023 -0500

[HUDI-5678] Fix `deduceShuffleParallelism` in row-writing Bulk Insert 
helper (#7818)

`deduceShuffleParallelism` returns 0 in some situations, which should never
occur.
---
 .../hudi/HoodieDatasetBulkInsertHelper.scala   |  4 +-
 .../org/apache/spark/sql/HoodieUnsafeUtils.scala   | 20 ++-
 .../apache/spark/sql/hudi/TestInsertTable.scala| 66 ++
 3 files changed, 85 insertions(+), 5 deletions(-)

diff --git 
a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieDatasetBulkInsertHelper.scala
 
b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieDatasetBulkInsertHelper.scala
index 7e235993e33..a6488b07b51 100644
--- 
a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieDatasetBulkInsertHelper.scala
+++ 
b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieDatasetBulkInsertHelper.scala
@@ -34,7 +34,7 @@ import 
org.apache.hudi.util.JFunction.toJavaSerializableFunctionUnchecked
 import org.apache.spark.internal.Logging
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.HoodieUnsafeRowUtils.{composeNestedFieldPath, 
getNestedInternalRowValue}
-import org.apache.spark.sql.HoodieUnsafeUtils.getOutputPartitioning
+import org.apache.spark.sql.HoodieUnsafeUtils.getNumPartitions
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.{Alias, Literal}
 import org.apache.spark.sql.catalyst.plans.logical.Project
@@ -45,7 +45,7 @@ import org.apache.spark.unsafe.types.UTF8String
 import scala.collection.JavaConverters.{asScalaBufferConverter, 
seqAsJavaListConverter}
 
 object HoodieDatasetBulkInsertHelper
-  extends ParallelismHelper[DataFrame](toJavaSerializableFunctionUnchecked(df 
=> getOutputPartitioning(df).numPartitions)) with Logging {
+  extends ParallelismHelper[DataFrame](toJavaSerializableFunctionUnchecked(df 
=> getNumPartitions(df))) with Logging {
 
   /**
* Prepares [[DataFrame]] for bulk-insert into Hudi table, taking following 
steps:
diff --git 
a/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/HoodieUnsafeUtils.scala
 
b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/HoodieUnsafeUtils.scala
index dfd416b6f52..ee22f714c9c 100644
--- 
a/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/HoodieUnsafeUtils.scala
+++ 
b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/HoodieUnsafeUtils.scala
@@ -22,7 +22,8 @@ import org.apache.hudi.HoodieUnsafeRDD
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan}
-import org.apache.spark.sql.catalyst.plans.physical.Partitioning
+import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, 
UnknownPartitioning}
+import org.apache.spark.sql.execution.LogicalRDD
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.util.MutablePair
 
@@ -38,8 +39,21 @@ object HoodieUnsafeUtils {
*   but instead will just execute Spark resolution, optimization and 
actual execution planning stages
*   returning instance of [[SparkPlan]] ready for execution
*/
-  def getOutputPartitioning(df: DataFrame): Partitioning =
-df.queryExecution.executedPlan.outputPartitioning
+  def getNumPartitions(df: DataFrame): Int = {
+// NOTE: In general we'd rely on [[outputPartitioning]] of the executable 
[[SparkPlan]] to determine
+//   number of partitions plan is going to be executed with.
+//   However in case of [[LogicalRDD]] plan's output-partitioning will 
be stubbed as [[UnknownPartitioning]]
+//   and therefore we will be falling back to determine number of 
partitions by looking at the RDD itself
+df.queryExecution.logical match {
+  case LogicalRDD(_, rdd, outputPartitioning, _, _) =>
+outputPartitioning match {
+  case _: UnknownPartitioning => rdd.getNumPartitions
+  case _ => outputPartitioning.numPartitions
+}
+
+  case _ => df.queryExecution.executedPlan.outputPartitioning.numPartitions
+}
+  }
 
   /**
* Creates [[DataFrame]] from provided [[plan]]
diff --git 
a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestInsertTable.scala
 
b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestInsertTable.scala
index b092a68e20d..b33deebdf72 100644
--- 
a/hudi-spark-datasource/h

[hudi] branch master updated (7064c380506 -> e93fbeee4ac)

2023-02-01 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


from 7064c380506 [MINOR] Restoring existing behavior for `DeltaStreamer` 
Incremental Source (#7810)
 add e93fbeee4ac [HUDI-5681] Fixing Kryo being instantiated w/ invalid 
`SparkConf` (#7821)

No new revisions were added by this update.

Summary of changes:
 .../org/apache/spark/sql/hudi/SerDeUtils.scala | 44 --
 .../hudi/command/MergeIntoHoodieTableCommand.scala |  8 ++--
 .../hudi/command/payload/ExpressionPayload.scala   | 54 --
 3 files changed, 54 insertions(+), 52 deletions(-)
 delete mode 100644 
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/SerDeUtils.scala



[hudi] branch master updated: [MINOR] Restoring existing behavior for `DeltaStreamer` Incremental Source (#7810)

2023-02-01 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
 new 7064c380506 [MINOR] Restoring existing behavior for `DeltaStreamer` 
Incremental Source (#7810)
7064c380506 is described below

commit 7064c380506814964dd85773e2ee7b7f187b88c3
Author: Alexey Kudinkin 
AuthorDate: Wed Feb 1 11:19:45 2023 -0800

[MINOR] Restoring existing behavior for `DeltaStreamer` Incremental Source 
(#7810)

This restores the existing behavior of the DeltaStreamer Incremental Source, as
the change in #7769 removed the _hoodie_partition_path field from the dataset,
making it impossible to access it from, for example, the DeltaStreamer Transformers
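
For context, here is a sketch of the kind of downstream usage this restores (illustrative only; the wiring into the actual Transformer interface is omitted and the names are hypothetical): a transformer-style function that references the _hoodie_partition_path meta field on the incoming dataset.

```scala
import org.apache.spark.sql.{DataFrame, functions => F}

object PartitionPathTransformSketch {

  def transform(incoming: DataFrame): DataFrame =
    incoming
      // Fails with an unresolved-column error if the meta field was dropped upstream
      .filter(F.col("_hoodie_partition_path").startsWith("2023"))
      .withColumn("source_partition", F.col("_hoodie_partition_path"))
}
```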
---
 .../org/apache/hudi/config/HoodieWriteConfig.java  |  2 +-
 .../apache/hudi/common/config/HoodieConfig.java|  8 
 .../org/apache/hudi/utilities/UtilHelpers.java | 13 
 .../hudi/utilities/deltastreamer/DeltaSync.java|  8 ++--
 .../hudi/utilities/sources/HoodieIncrSource.java   | 23 --
 5 files changed, 37 insertions(+), 17 deletions(-)

diff --git 
a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java
 
b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java
index e6525a2b1dc..f56defe7eac 100644
--- 
a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java
+++ 
b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java
@@ -1034,7 +1034,7 @@ public class HoodieWriteConfig extends HoodieConfig {
   }
 
   public HoodieRecordMerger getRecordMerger() {
-List mergers = 
getSplitStringsOrDefault(RECORD_MERGER_IMPLS).stream()
+List mergers = 
StringUtils.split(getStringOrDefault(RECORD_MERGER_IMPLS), ",").stream()
 .map(String::trim)
 .distinct()
 .collect(Collectors.toList());
diff --git 
a/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieConfig.java 
b/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieConfig.java
index a48e4202bf9..223b93e5744 100644
--- a/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieConfig.java
+++ b/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieConfig.java
@@ -142,14 +142,6 @@ public class HoodieConfig implements Serializable {
 return StringUtils.split(getString(configProperty), delimiter);
   }
 
-  public  List getSplitStringsOrDefault(ConfigProperty 
configProperty) {
-return getSplitStringsOrDefault(configProperty, ",");
-  }
-
-  public  List getSplitStringsOrDefault(ConfigProperty 
configProperty, String delimiter) {
-return StringUtils.split(getStringOrDefault(configProperty), delimiter);
-  }
-
   public String getString(String key) {
 return props.getProperty(key);
   }
diff --git 
a/hudi-utilities/src/main/java/org/apache/hudi/utilities/UtilHelpers.java 
b/hudi-utilities/src/main/java/org/apache/hudi/utilities/UtilHelpers.java
index d159fee0be4..45a9750c3b3 100644
--- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/UtilHelpers.java
+++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/UtilHelpers.java
@@ -29,12 +29,16 @@ import org.apache.hudi.client.WriteStatus;
 import org.apache.hudi.client.common.HoodieSparkEngineContext;
 import org.apache.hudi.common.config.DFSPropertiesConfiguration;
 import org.apache.hudi.common.config.TypedProperties;
+import org.apache.hudi.common.engine.EngineType;
 import org.apache.hudi.common.fs.FSUtils;
 import org.apache.hudi.common.model.HoodieCommitMetadata;
+import org.apache.hudi.common.model.HoodieRecordMerger;
 import org.apache.hudi.common.model.HoodieRecordPayload;
 import org.apache.hudi.common.model.HoodieWriteStat;
 import org.apache.hudi.common.table.HoodieTableMetaClient;
 import org.apache.hudi.common.table.TableSchemaResolver;
+import org.apache.hudi.common.util.ConfigUtils;
+import org.apache.hudi.common.util.HoodieRecordUtils;
 import org.apache.hudi.common.util.Option;
 import org.apache.hudi.common.util.ReflectionUtils;
 import org.apache.hudi.common.util.StringUtils;
@@ -109,6 +113,15 @@ public class UtilHelpers {
 
   private static final Logger LOG = LogManager.getLogger(UtilHelpers.class);
 
+  public static HoodieRecordMerger createRecordMerger(Properties props) {
+List recordMergerImplClasses = 
ConfigUtils.split2List(props.getProperty(HoodieWriteConfig.RECORD_MERGER_IMPLS.key(),
+HoodieWriteConfig.RECORD_MERGER_IMPLS.defaultValue()));
+HoodieRecordMerger recordMerger = 
HoodieRecordUtils.createRecordMerger(null, EngineType.SPARK, 
recordMergerImplClasses,
+props.getProperty(HoodieWriteConfig.RECORD_MERGER_STRATEGY.key(), 
HoodieWriteConfig.RECORD_MERGER_STRATEGY.defaultValue()));
+
+return recordMerger;
+  }
+
   public static Source createSourc

[hudi] branch master updated (d8576933a9b -> 628dc8cd851)

2023-01-31 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


from d8576933a9b [HUDI-5553] Prevent partition(s) from being dropped if 
there are pending… (#7669)
 add 628dc8cd851 [HUDI-5633] Fixing performance regression in 
`HoodieSparkRecord` (#7769)

No new revisions were added by this update.

Summary of changes:
 .../hudi/execution/HoodieLazyInsertIterable.java   |   4 +-
 .../org/apache/hudi/io/HoodieAppendHandle.java |  32 +-
 .../org/apache/hudi/io/HoodieBootstrapHandle.java  |  24 +-
 .../org/apache/hudi/io/HoodieCreateHandle.java |  20 +-
 .../java/org/apache/hudi/io/HoodieMergeHandle.java |  14 +-
 .../java/org/apache/hudi/io/HoodieWriteHandle.java |   2 +-
 .../table/action/commit/HoodieMergeHelper.java |  59 +-
 .../java/org/apache/hudi/util/ExecutorFactory.java |  32 +-
 .../hudi/common/model/HoodieSparkRecord.java   | 104 ++--
 .../hudi/execution/SparkLazyInsertIterable.java|  14 +-
 .../hudi/io/storage/HoodieSparkFileWriter.java |  17 -
 .../hudi/io/storage/HoodieSparkParquetReader.java  |   3 +-
 .../hudi/io/storage/HoodieSparkParquetWriter.java  |  46 +-
 .../apache/hudi/keygen/BuiltinKeyGenerator.java|   2 +-
 .../bootstrap/BaseBootstrapMetadataHandler.java|   9 +-
 .../bootstrap/OrcBootstrapMetadataHandler.java |   5 +-
 .../bootstrap/ParquetBootstrapMetadataHandler.java |  87 ++-
 .../apache/hudi/util/HoodieSparkRecordUtils.java   |  69 ---
 .../hudi/HoodieDatasetBulkInsertHelper.scala   |   2 +
 .../apache/spark/sql/HoodieInternalRowUtils.scala  | 595 -
 .../apache/spark/sql/HoodieUnsafeRowUtils.scala|  33 +-
 .../spark/sql/TestHoodieUnsafeRowUtils.scala   |  36 +-
 .../hudi/common/model/HoodieAvroIndexedRecord.java |  34 +-
 .../apache/hudi/common/model/HoodieAvroRecord.java |  42 +-
 .../hudi/common/model/HoodieEmptyRecord.java   |   9 +-
 .../org/apache/hudi/common/model/HoodieRecord.java |  20 +-
 .../apache/hudi/common/model/MetadataValues.java   |  76 ++-
 .../table/log/AbstractHoodieLogRecordReader.java   |  15 +-
 .../org/apache/hudi/HoodieSparkSqlWriter.scala | 110 ++--
 .../org/apache/hudi/HoodieStreamingSink.scala  |   6 +-
 .../apache/hudi/TestHoodieInternalRowUtils.scala   |  89 ---
 .../sql/hudi/TestHoodieInternalRowUtils.scala} |  99 +++-
 .../hudi/utilities/sources/HoodieIncrSource.java   |  37 +-
 .../TestHoodieDeltaStreamerWithMultiWriter.java|   1 +
 34 files changed, 904 insertions(+), 843 deletions(-)
 delete mode 100644 
hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/util/HoodieSparkRecordUtils.java
 delete mode 100644 
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieInternalRowUtils.scala
 rename 
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/{hudi/TestStructTypeSchemaEvolutionUtils.scala
 => spark/sql/hudi/TestHoodieInternalRowUtils.scala} (77%)



[hudi] branch master updated (b00dac54642 -> 88d8e5e96d5)

2023-01-29 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


from b00dac54642 [HUDI-5503] Optimize flink table factory option check 
(#7608)
 add 88d8e5e96d5 [MINOR] Cleaning up recently introduced configs (#7772)

No new revisions were added by this update.

Summary of changes:
 .../org/apache/hudi/config/HoodieWriteConfig.java   | 12 ++--
 .../java/org/apache/hudi/util/ExecutorFactory.java  |  2 +-
 .../execution/TestDisruptorExecutionInSpark.java|  8 +---
 .../hudi/execution/TestDisruptorMessageQueue.java   |  4 ++--
 .../scala/org/apache/hudi/DataSourceOptions.scala   | 13 +
 .../org/apache/hudi/HoodieSparkSqlWriter.scala  | 21 +
 .../hudi/command/MergeIntoHoodieTableCommand.scala  |  3 ++-
 .../TestHoodiePruneFileSourcePartitions.scala   |  4 ++--
 8 files changed, 36 insertions(+), 31 deletions(-)



[hudi] branch master updated: [HUDI-5624] Fix HoodieAvroRecordMerger to use new precombine API (#7759)

2023-01-28 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
 new 9bbd71eb67a [HUDI-5624] Fix HoodieAvroRecordMerger to use new 
precombine API (#7759)
9bbd71eb67a is described below

commit 9bbd71eb67a0b71726f9997412b4b49b73da8fc7
Author: bschell 
AuthorDate: Sat Jan 28 12:29:31 2023 -0800

[HUDI-5624] Fix HoodieAvroRecordMerger to use new precombine API (#7759)

Updates `HoodieAvroRecordMerger` to use the new precombine API instead of the
deprecated one. This fixes backwards-compatibility issues with certain payloads.
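
As a rough illustration of the precombine semantics this preserves (plain Scala, not the HoodieRecordPayload API; `Rec` is a made-up type), precombine picks between two versions of the same key by their ordering value, preferring the incoming record on ties:

```scala
final case class Rec(key: String, orderingVal: Long, payload: Map[String, Any])

object PreCombineSketch {

  // Keep the version with the higher ordering value, preferring the incoming record on ties
  def preCombine(older: Rec, newer: Rec): Rec =
    if (older.orderingVal > newer.orderingVal) older else newer
}
```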
---
 .../hudi/common/model/HoodieAvroRecordMerger.java  |  13 +--
 .../functional/TestPartialUpdateAvroPayload.scala  | 125 +
 2 files changed, 129 insertions(+), 9 deletions(-)

diff --git 
a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieAvroRecordMerger.java
 
b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieAvroRecordMerger.java
index e49d560b74c..252e11135af 100644
--- 
a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieAvroRecordMerger.java
+++ 
b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieAvroRecordMerger.java
@@ -27,7 +27,6 @@ import 
org.apache.hudi.common.model.HoodieRecord.HoodieRecordType;
 import org.apache.hudi.common.util.Option;
 import org.apache.hudi.common.util.ValidationUtils;
 import org.apache.hudi.common.util.collection.Pair;
-import org.apache.hudi.metadata.HoodieMetadataPayload;
 
 import java.io.IOException;
 import java.util.Properties;
@@ -50,7 +49,7 @@ public class HoodieAvroRecordMerger implements 
HoodieRecordMerger {
 
 switch (legacyOperatingMode) {
   case PRE_COMBINING:
-HoodieRecord res = preCombine(older, newer);
+HoodieRecord res = preCombine(older, newer, newSchema, props);
 if (res == older) {
   return Option.of(Pair.of(res, oldSchema));
 } else {
@@ -71,13 +70,9 @@ public class HoodieAvroRecordMerger implements 
HoodieRecordMerger {
 return HoodieRecordType.AVRO;
   }
 
-  private HoodieRecord preCombine(HoodieRecord older, HoodieRecord newer) {
-HoodieRecordPayload picked = unsafeCast(((HoodieAvroRecord) 
newer).getData().preCombine(((HoodieAvroRecord) older).getData()));
-if (picked instanceof HoodieMetadataPayload) {
-  // NOTE: HoodieMetadataPayload return a new payload
-  return new HoodieAvroRecord(newer.getKey(), picked, 
newer.getOperation());
-}
-return picked.equals(((HoodieAvroRecord) newer).getData()) ? newer : older;
+  private HoodieRecord preCombine(HoodieRecord older, HoodieRecord newer, 
Schema schema, Properties props) {
+HoodieRecordPayload payload = unsafeCast(((HoodieAvroRecord) 
newer).getData().preCombine(((HoodieAvroRecord) older).getData(), schema, 
props));
+return new HoodieAvroRecord(newer.getKey(), payload, newer.getOperation());
   }
 
   private Option combineAndGetUpdateValue(HoodieRecord older, 
HoodieRecord newer, Schema schema, Properties props) throws IOException {
diff --git 
a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestPartialUpdateAvroPayload.scala
 
b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestPartialUpdateAvroPayload.scala
new file mode 100644
index 000..172d0a7f945
--- /dev/null
+++ 
b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestPartialUpdateAvroPayload.scala
@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.functional
+
+import java.util.function.Consumer
+
+import org.apache.hadoop.fs.FileSystem
+import org.apache.hudi.HoodieConversionUtils.toJavaOption
+import org.apache.hudi.{DataSourceWriteOptions, QuickstartUtils}
+import org.apache.hudi.QuickstartUtils.{convertToStringList, 
getQuickstartWriteConfigs}
+import org.apache.hudi.common.model.HoodieTableType
+import org.apache.hudi.common.util
+import org.apache.hudi.config.HoodieWriteConfig
+import org.apache.hudi.testutils.HoodieCl

[hudi] branch master updated (1ecc0401eef -> ff590c6d72c)

2023-01-27 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


from 1ecc0401eef [HUDI-5623] Increase default time to wait between retries 
by lock provider client (#7758)
 add ff590c6d72c [HUDI-5023] Switching default Write Executor type to 
`SIMPLE` (#7476)

No new revisions were added by this update.

Summary of changes:
 .../org/apache/hudi/config/HoodieWriteConfig.java  | 48 +++-
 .../hudi/execution/HoodieLazyInsertIterable.java   | 30 +++--
 .../java/org/apache/hudi/util/ExecutorFactory.java | 24 +-
 .../hudi/execution/FlinkLazyInsertIterable.java|  2 +-
 .../hudi/execution/JavaLazyInsertIterable.java |  2 +-
 .../hudi/execution/SparkLazyInsertIterable.java|  2 +-
 .../TestBoundedInMemoryExecutorInSpark.java| 28 ++--
 .../hudi/execution/TestBoundedInMemoryQueue.java   | 23 ++
 .../execution/TestDisruptorExecutionInSpark.java   | 48 ++--
 .../hudi/execution/TestDisruptorMessageQueue.java  | 30 +++--
 .../hudi/execution/TestSimpleExecutionInSpark.java | 51 +-
 .../common/util/queue/BoundedInMemoryExecutor.java |  2 +-
 .../hudi/common/util/queue/DisruptorExecutor.java  | 19 +---
 .../common/util/queue/DisruptorMessageQueue.java   | 18 +---
 .../hudi/common/util/queue/ExecutorType.java   | 15 +--
 ...mpleHoodieExecutor.java => SimpleExecutor.java} | 51 ++
 .../common/util/queue/WaitStrategyFactory.java |  8 ++--
 17 files changed, 200 insertions(+), 201 deletions(-)
 rename 
hudi-common/src/main/java/org/apache/hudi/common/util/queue/{SimpleHoodieExecutor.java
 => SimpleExecutor.java} (53%)



[hudi] branch master updated: [HUDI-5534] Optimizing Bloom Index lookup when using Bloom Filters from Metadata Table (#7642)

2023-01-27 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
 new e270924c5eb [HUDI-5534] Optimizing Bloom Index lookup when using Bloom 
Filters from Metadata Table (#7642)
e270924c5eb is described below

commit e270924c5eb1548d9022fe3e52d5dd64f61fbdc1
Author: Alexey Kudinkin 
AuthorDate: Fri Jan 27 15:47:12 2023 -0800

[HUDI-5534] Optimizing Bloom Index lookup when using Bloom Filters from 
Metadata Table (#7642)

Most recently, trying to use the Metadata Table in the Bloom Index was resulting
in failures due to exhaustion of the S3 connection pool, no matter how
(reasonably) big we set the pool size (we've tested up to 3k connections).

This PR focuses on optimizing the Bloom Index lookup sequence for the case when
it's leveraging the Bloom Filter partition in the Metadata Table. The premise of
this change is based on the following observations:

Increasing the size of the batch of requests to the MT allows amortizing the cost
of processing it (the bigger the batch, the lower the per-request cost).

Having too few partitions in the Bloom Index path, however, starts to hurt
parallelism when we actually probe individual files for whether they contain the
target keys or not. The solution is to split these 2 into different stages w/
drastically different parallelism levels: constrain parallelism when reading from
the MT (10s of tasks) and keep it at the current level for probing individual
files (100s of tasks).

The current way of partitioning records (relying on Spark's default partitioner)
entailed that, with high likelihood, every Spark executor would be opening up
(and processing) every file-group of the MT Bloom Filter partition. To alleviate
that, the same hashing algorithm used by the MT should be used to partition
records into Spark's individual partitions, so that every task is limited to
opening no more than 1 file-group in the Bloom Filter partition of the MT.

To achieve that, the following changes in the Bloom Index sequence (leveraging
the MT) are implemented:

Bloom Filter probing and actual file probing are split into 2 separate operations
(so that the parallelism of each of them can be controlled individually).
Requests to the MT are replaced with calls to its batch APIs.
A custom partitioner, AffineBloomIndexFileGroupPartitioner, is introduced,
repartitioning the dataset of filenames with corresponding record keys in a way
that is affine w/ the MT Bloom Filters' partitioning (allowing us to open no more
than a single file-group per Spark task).

Additionally, this PR addresses some of the low-hanging performance optimizations
that could considerably improve performance of the Bloom Index lookup sequence,
like mapping file-comparison pairs to a PairRDD (where the key is the file name
and the value is the record key) instead of an RDD, so that we could:

Do in-partition sorting by filename (to make sure we check all records w/in a
file all at once) w/in a single Spark partition instead of a global one (reducing
shuffling as well)
Avoid re-shuffling (by re-mapping from RDD to PairRDD later)
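
As a rough sketch of the "affine" partitioning idea (schematic only; the hashing below is a stand-in, and Hudi's actual AffineBloomIndexFileGroupPartitioner differs in detail), a Spark partitioner can route each record key to the partition corresponding to the bloom-filter file-group it would hash to:

```scala
import org.apache.spark.Partitioner

class AffineFileGroupPartitionerSketch(numFileGroups: Int) extends Partitioner {

  override def numPartitions: Int = numFileGroups

  // Stand-in for the metadata table's key-to-file-group hashing
  private def fileGroupIndex(recordKey: String): Int =
    (recordKey.hashCode & Integer.MAX_VALUE) % numFileGroups

  override def getPartition(key: Any): Int = key match {
    case recordKey: String => fileGroupIndex(recordKey)
    case other             => (other.hashCode & Integer.MAX_VALUE) % numFileGroups
  }
}
```

With such a partitioner, records destined for the same bloom-filter file-group land in the same Spark partition, so each task has to open at most one file-group of the MT Bloom Filter partition.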
---
 .../org/apache/hudi/index/HoodieIndexUtils.java|   5 +-
 .../index/bloom/BaseHoodieBloomIndexHelper.java|   5 +-
 .../apache/hudi/index/bloom/HoodieBloomIndex.java  |  77 +++
 ...ion.java => HoodieBloomIndexCheckFunction.java} |  59 +++---
 .../hudi/index/bloom/HoodieGlobalBloomIndex.java   |  14 +-
 .../bloom/ListBasedHoodieBloomIndexHelper.java |  26 +--
 .../index/bloom/TestFlinkHoodieBloomIndex.java |   8 +-
 .../org/apache/hudi/data/HoodieJavaPairRDD.java|   5 +
 .../java/org/apache/hudi/data/HoodieJavaRDD.java   |  14 +-
 .../bloom/BucketizedBloomCheckPartitioner.java |  18 +-
 .../bloom/HoodieBloomFilterProbingResult.java  |  35 
 .../index/bloom/HoodieBloomIndexCheckFunction.java | 120 ---
 .../index/bloom/HoodieFileProbingFunction.java | 143 +
 .../HoodieMetadataBloomFilterProbingFunction.java  | 157 ++
 .../HoodieMetadataBloomIndexCheckFunction.java | 154 --
 .../index/bloom/SparkHoodieBloomIndexHelper.java   | 229 ++---
 .../org/apache/spark/sql/hudi/SparkAdapter.scala   |   4 +-
 .../bloom/TestBucketizedBloomCheckPartitioner.java |  43 ++--
 .../hudi/index/bloom/TestHoodieBloomIndex.java |  10 +-
 .../index/bloom/TestHoodieGlobalBloomIndex.java|  10 +-
 .../org/apache/hudi/common/data/HoodieData.java|  17 +-
 .../apache/hudi/common/data/HoodieListData.java|  10 +
 .../hudi/common/data/HoodieListPairData.java   |  23 +++
 .../apache/hudi/common/data/HoodiePairData.java|  11 +-
 .../ThrowingConsumer.java} |  31 +--
 .../hudi/common/model/HoodieAvroIndexedRecord.java |   2 +-
 .../hudi/common/table/HoodieTableMetaClient.java   |   4 +
 .../common/util/collect

[hudi] branch master updated: [HUDI-5363] Removing default value for shuffle parallelism configs (#7723)

2023-01-25 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
 new 3a08bdc3f97 [HUDI-5363] Removing default value for shuffle parallelism 
configs (#7723)
3a08bdc3f97 is described below

commit 3a08bdc3f971b3534e8fb6f34772340cfdf055a9
Author: Alexey Kudinkin 
AuthorDate: Wed Jan 25 19:29:42 2023 -0800

[HUDI-5363] Removing default value for shuffle parallelism configs (#7723)


Currently, we always override the parallelism (translating into the # of
partitions in Spark, for example) of the incoming datasets, no matter whether the
user requested that or not:

 1. If the user specified shuffle parallelism explicitly, we'd use it to
override the original one
 2. If the user did NOT specify shuffle parallelism, we'd use the default value
of 200

The second case is problematic: we're blindly overriding the parallelism of the
data deduced by Spark (determined based on the source of the data), replacing it
with a _static_ value (having nothing to do w/ the data itself).

Instead, we should only be overriding the parallelism in cases when the
corresponding configuration has been explicitly provided by the user.
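
A small sketch of the intended behavior (illustrative only; names are made up): repartition only when an explicit parallelism value was configured, otherwise keep the partitioning Spark deduced from the data source.

```scala
import org.apache.spark.sql.DataFrame

object ParallelismSketch {

  // Repartition only when the user explicitly configured a (positive) parallelism value
  def applyConfiguredParallelism(df: DataFrame, configuredParallelism: Option[Int]): DataFrame =
    configuredParallelism match {
      case Some(n) if n > 0 => df.repartition(n)
      case _                => df // keep Spark's deduced number of partitions
    }
}
```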
---
 .../org/apache/hudi/config/HoodieWriteConfig.java  | 10 +++---
 .../table/action/commit/BaseBulkInsertHelper.java  |  7 +++-
 .../hudi/table/action/commit/BaseDeleteHelper.java |  7 +++-
 .../hudi/table/action/commit/BaseWriteHelper.java  | 14 ++--
 .../table/action/commit/HoodieDeleteHelper.java| 35 ++
 .../table/action/commit/HoodieWriteHelper.java |  3 +-
 .../table/action/commit/ParallelismHelper.java | 42 ++
 .../table/action/commit/FlinkDeleteHelper.java |  1 +
 .../hudi/table/action/commit/FlinkWriteHelper.java |  3 +-
 .../table/action/commit/JavaBulkInsertHelper.java  | 12 ---
 .../hudi/table/action/commit/JavaDeleteHelper.java |  1 +
 .../hudi/table/action/commit/JavaWriteHelper.java  |  1 +
 .../table/action/commit/SparkBulkInsertHelper.java | 24 -
 .../hudi/HoodieDatasetBulkInsertHelper.scala   | 16 ++---
 .../scala/org/apache/hudi/util/JFunction.scala |  7 +++-
 .../org/apache/spark/sql/HoodieUnsafeUtils.scala   | 11 ++
 16 files changed, 148 insertions(+), 46 deletions(-)

diff --git 
a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java
 
b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java
index fcee1b4b0d6..2f36aa725e3 100644
--- 
a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java
+++ 
b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java
@@ -228,12 +228,12 @@ public class HoodieWriteConfig extends HoodieConfig {
 
   public static final ConfigProperty INSERT_PARALLELISM_VALUE = 
ConfigProperty
   .key("hoodie.insert.shuffle.parallelism")
-  .defaultValue("200")
+  .defaultValue("0")
   .withDocumentation("Parallelism for inserting records into the table. 
Inserts can shuffle data before writing to tune file sizes and optimize the 
storage layout.");
 
   public static final ConfigProperty BULKINSERT_PARALLELISM_VALUE = 
ConfigProperty
   .key("hoodie.bulkinsert.shuffle.parallelism")
-  .defaultValue("200")
+  .defaultValue("0")
   .withDocumentation("For large initial imports using bulk_insert 
operation, controls the parallelism to use for sort modes or custom 
partitioning done"
   + "before writing records to the table.");
 
@@ -252,13 +252,13 @@ public class HoodieWriteConfig extends HoodieConfig {
 
  public static final ConfigProperty<String> UPSERT_PARALLELISM_VALUE = 
ConfigProperty
   .key("hoodie.upsert.shuffle.parallelism")
-  .defaultValue("200")
+  .defaultValue("0")
   .withDocumentation("Parallelism to use for upsert operation on the 
table. Upserts can shuffle data to perform index lookups, file sizing, bin 
packing records optimally"
   + "into file groups.");
 
  public static final ConfigProperty<String> DELETE_PARALLELISM_VALUE = 
ConfigProperty
   .key("hoodie.delete.shuffle.parallelism")
-  .defaultValue("200")
+  .defaultValue("0")
   .withDocumentation("Parallelism used for “delete” operation. Delete 
operations also performs shuffles, similar to upsert operation.");
 
  public static final ConfigProperty<String> ROLLBACK_PARALLELISM_VALUE = 
ConfigProperty
@@ -1156,7 +1156,7 @@ public class HoodieWriteConfig extends HoodieConfig {
   }
 
   public int getDeleteShuffleParallelism() {
-return Math.max(getInt(DELETE_PARALLELISM_VALUE), 1);
+return getInt(DELETE_PARALLELISM_VALUE

[hudi] branch master updated (25afb357df9 -> 20969c26059)

2023-01-24 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


from 25afb357df9 [HUDI-5401] Ensure user-provided hive metastore uri is set 
in HiveConf if not already set (#7543)
 add 20969c26059 [HUDI-5392] Fixing Bootstrapping flow handling of arrays 
(#7461)

No new revisions were added by this update.

Summary of changes:
 .../apache/hudi/client/utils/MergingIterator.java  | 12 ++---
 .../table/action/commit/HoodieMergeHelper.java |  8 ++-
 .../functional/TestDataSourceForBootstrap.scala| 62 --
 3 files changed, 57 insertions(+), 25 deletions(-)



[hudi] branch master updated (4f6b831ea11 -> fc1831b22c3)

2023-01-23 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


from 4f6b831ea11 [HUDI-5235] Clustering target size should larger than 
small file limit (#7232)
 add fc1831b22c3 Fixing FS `InputStream` leaks (#7741)

No new revisions were added by this update.

Summary of changes:
 .../hudi/common/bootstrap/index/HFileBootstrapIndex.java   | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)



[hudi] branch master updated (febff4afd2a -> a70355f4457)

2023-01-21 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


from febff4afd2a [HUDI-5417] support to read avro from non-legacy map/list 
in parquet log (#7512)
 add a70355f4457 [HUDI-5579] Fixing Kryo registration to be properly wired 
into Spark sessions (#7702)

No new revisions were added by this update.

Summary of changes:
 README.md  |  4 ++-
 docker/demo/config/spark-defaults.conf |  2 ++
 .../templates/spark_command.txt.template   |  1 +
 .../hudi/cli/utils/SparkTempViewProvider.java  |  5 +--
 .../java/org/apache/hudi/cli/utils/SparkUtil.java  |  3 +-
 .../cli/functional/CLIFunctionalTestHarness.java   |  4 +--
 hudi-client/hudi-client-common/pom.xml |  6 
 hudi-client/hudi-java-client/pom.xml   |  6 
 ...ovider.scala => HoodieSparkKryoRegistrar.scala} | 36 ++
 .../hudi/testutils/FunctionalTestHarness.java  |  4 +--
 .../hudi/testutils/HoodieClientTestUtils.java  | 11 ++-
 .../SparkClientFunctionalTestHarness.java  |  4 +--
 .../hudi/testutils/providers/SparkProvider.java|  1 +
 hudi-common/pom.xml|  2 +-
 .../common/config/SerializableConfiguration.java   |  1 +
 ...rovider.java => HoodieCommonKryoRegistrar.java} | 13 +---
 .../hudi/common/util/SerializationUtils.java   |  4 +--
 hudi-examples/bin/hudi-delta-streamer  |  2 ++
 hudi-examples/hudi-examples-common/pom.xml |  6 
 hudi-examples/hudi-examples-java/pom.xml   |  6 
 .../examples/common/HoodieExampleSparkUtils.java   |  3 +-
 .../quickstart/TestHoodieSparkQuickstart.java  |  4 +--
 .../src/test/python/HoodiePySparkQuickstart.py |  2 ++
 hudi-flink-datasource/hudi-flink/pom.xml   | 24 +++
 hudi-hadoop-mr/pom.xml |  6 
 hudi-integ-test/README.md  |  6 
 hudi-kafka-connect/pom.xml |  7 +
 .../spark/sql/hudi/analysis/HoodieAnalysis.scala   | 23 ++
 .../hudi-spark/src/test/java/HoodieJavaApp.java|  7 +++--
 .../src/test/java/HoodieJavaGenerateApp.java   |  8 +++--
 .../src/test/java/HoodieJavaStreamingApp.java  | 15 ++---
 .../functional/TestHiveTableSchemaEvolution.java   | 20 +---
 .../org/apache/hudi/TestHoodieSparkSqlWriter.scala | 14 -
 .../org/apache/hudi/TestHoodieSparkUtils.scala |  4 +++
 .../hudi/TestTableSchemaResolverWithSparkSQL.scala |  6 ++--
 .../functional/TestDataSourceForBootstrap.scala| 15 +
 .../hudi/functional/TestStreamingSource.scala  |  5 ++-
 .../benchmark/BoundInMemoryExecutorBenchmark.scala |  1 +
 .../benchmark/CowTableReadBenchmark.scala  |  1 +
 .../ReadAndWriteWithoutAvroBenchmark.scala |  1 +
 .../spark/sql/hudi/HoodieSparkSqlTestBase.scala| 16 +++---
 .../hudi/procedure/TestBootstrapProcedure.scala|  3 +-
 hudi-sync/hudi-sync-common/pom.xml |  6 
 hudi-timeline-service/pom.xml  |  6 
 .../hudi/utilities/HoodieSnapshotCopier.java   |  5 +--
 .../hudi/utilities/HoodieSnapshotExporter.java |  5 +--
 .../org/apache/hudi/utilities/UtilHelpers.java |  7 +++--
 .../hudi/utilities/TestHoodieRepairTool.java   |  4 +--
 .../bundle-validation/conf/spark-defaults.conf |  1 +
 packaging/hudi-cli-bundle/pom.xml  | 33 
 packaging/hudi-flink-bundle/pom.xml| 32 +++
 packaging/hudi-hadoop-mr-bundle/pom.xml| 32 +++
 packaging/hudi-integ-test-bundle/pom.xml   | 32 +++
 packaging/hudi-kafka-connect-bundle/pom.xml| 31 +++
 packaging/hudi-presto-bundle/pom.xml   | 31 +++
 packaging/hudi-trino-bundle/pom.xml| 31 +++
 packaging/hudi-utilities-slim-bundle/README.md |  5 +++
 pom.xml| 30 ++
 58 files changed, 473 insertions(+), 130 deletions(-)
 rename 
hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/{HoodieSparkKryoProvider.scala
 => HoodieSparkKryoRegistrar.scala} (63%)
 rename 
hudi-common/src/main/java/org/apache/hudi/common/util/{HoodieCommonKryoProvider.java
 => HoodieCommonKryoRegistrar.java} (93%)



[hudi] branch master updated: [HUDI-5417] support to read avro from non-legacy map/list in parquet log (#7512)

2023-01-20 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
 new febff4afd2a [HUDI-5417] support to read avro from non-legacy map/list 
in parquet log (#7512)
febff4afd2a is described below

commit febff4afd2a71085fdba05dc08e857f8267216d3
Author: komao 
AuthorDate: Sat Jan 21 15:18:26 2023 +0800

[HUDI-5417] support to read avro from non-legacy map/list in parquet log 
(#7512)

### Change Logs

Support reading Avro from non-legacy map/list types in Parquet log-blocks when
using the `SparkRecordMerger`.

Without such support, when using the `SparkRecordMerger` to write to a table whose
schema contains a list/map type (written in the non-legacy format), reading such
records back as Avro will fail.
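
As a rough usage sketch (mirroring the diff below; the file path is hypothetical and
the imports are spelled out), the reader is now constructed through the Hudi-specific
builder so that `HoodieAvroReadSupport`, which understands both the legacy and
non-legacy Parquet list/map encodings, is picked up:

    import org.apache.avro.generic.IndexedRecord;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.parquet.avro.HoodieAvroParquetReaderBuilder;
    import org.apache.parquet.hadoop.ParquetReader;

    public class NonLegacyListReadSketch {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/some-log-block.parquet"); // hypothetical file

        // The Hudi builder wires in HoodieAvroReadSupport instead of the stock AvroReadSupport.
        try (ParquetReader<IndexedRecord> reader =
                 new HoodieAvroParquetReaderBuilder(path).withConf(conf).build()) {
          IndexedRecord record;
          while ((record = reader.read()) != null) {
            System.out.println(record);
          }
        }
      }
    }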
---
 .../hudi/io/storage/HoodieAvroParquetReader.java   |   4 +-
 .../avro/HoodieAvroParquetReaderBuilder.java   |  79 
 .../apache/parquet/avro/HoodieAvroReadSupport.java | 133 +
 .../apache/hudi/functional/TestMORDataSource.scala |  58 -
 4 files changed, 271 insertions(+), 3 deletions(-)

diff --git 
a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroParquetReader.java
 
b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroParquetReader.java
index aa57b73d0c6..3328707c9ed 100644
--- 
a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroParquetReader.java
+++ 
b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroParquetReader.java
@@ -32,9 +32,9 @@ import org.apache.avro.Schema;
 import org.apache.avro.generic.IndexedRecord;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
-import org.apache.parquet.avro.AvroParquetReader;
 import org.apache.parquet.avro.AvroReadSupport;
 import org.apache.parquet.avro.AvroSchemaConverter;
+import org.apache.parquet.avro.HoodieAvroParquetReaderBuilder;
 import org.apache.parquet.hadoop.ParquetInputFormat;
 import org.apache.parquet.hadoop.ParquetReader;
 
@@ -165,7 +165,7 @@ public class HoodieAvroParquetReader extends 
HoodieAvroFileReaderBase {
   AvroReadSupport.setAvroReadSchema(conf, requestedSchema.get());
   AvroReadSupport.setRequestedProjection(conf, requestedSchema.get());
 }
-ParquetReader reader = 
AvroParquetReader.builder(path).withConf(conf).build();
+ParquetReader reader = new 
HoodieAvroParquetReaderBuilder(path).withConf(conf).build();
 ParquetReaderIterator parquetReaderIterator = new 
ParquetReaderIterator<>(reader);
 readerIterators.add(parquetReaderIterator);
 return parquetReaderIterator;
diff --git 
a/hudi-common/src/main/java/org/apache/parquet/avro/HoodieAvroParquetReaderBuilder.java
 
b/hudi-common/src/main/java/org/apache/parquet/avro/HoodieAvroParquetReaderBuilder.java
new file mode 100644
index 000..d6179ea1aac
--- /dev/null
+++ 
b/hudi-common/src/main/java/org/apache/parquet/avro/HoodieAvroParquetReaderBuilder.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.parquet.avro;
+
+import org.apache.avro.generic.GenericData;
+import org.apache.avro.specific.SpecificData;
+import org.apache.hadoop.fs.Path;
+import org.apache.parquet.hadoop.ParquetReader;
+import org.apache.parquet.hadoop.api.ReadSupport;
+import org.apache.parquet.io.InputFile;
+
+/**
+ * Copied from org.apache.parquet.avro.AvroParquetReader.Builder.
+ * We use HoodieAvroParquetReaderBuilder to build a HoodieAvroReadSupport
+ * that supports reading Avro from non-legacy map/list types in Parquet files.
+ */
+public class HoodieAvroParquetReaderBuilder extends 
ParquetReader.Builder {
+
+  private GenericData model = null;
+  private boolean enableCompatibility = true;
+  private boolean isReflect = true;
+
+  @Deprecated
+  public HoodieAvroParquetReaderBuilder(Path path) {
+super(path);
+  }
+
+  public HoodieAvroParquetReaderBuilder(InputFile file) {
+super(file);
+  }
+
+  public HoodieAvroParquetReaderBuilder withDataModel(GenericData model) {
+this.model = model;
+
+//

[hudi] branch master updated: [HUDI-5499] Fixing Spark SQL configs not being properly propagated for CTAS and other commands (#7607)

2023-01-20 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
 new f0f8d618b30 [HUDI-5499] Fixing Spark SQL configs not being properly 
propagated for CTAS and other commands (#7607)
f0f8d618b30 is described below

commit f0f8d618b30eee73baf02806d915a8b12034edad
Author: Alexey Kudinkin 
AuthorDate: Fri Jan 20 13:53:25 2023 -0800

[HUDI-5499] Fixing Spark SQL configs not being properly propagated for CTAS 
and other commands (#7607)

### Change Logs

While following up and adding support for the BrooklynData Benchmarks, we've
discovered that CTAS isn't properly propagating configs due to a recent change in
[#5178](https://github.com/apache/hudi/pull/5178/files#diff-560283e494c8ba8da102fc217a2201220dd4db731ec23d80884e0f001a7cc0bcR117)

Unfortunately, the configuration-handling logic in `ProvidesHoodieConfig` has become
overly complicated and fragmented.

This PR takes a stab at unifying and streamlining how options from different
sources (Spark Catalog props, table properties, Spark SQL conf, overrides, etc.) are
fused, making sure that different Spark SQL operations (for example, `MERGE INTO`,
CTAS, `INSERT INTO`) handle them in much the same way; a minimal sketch of the
fusion idea follows the change list below.

Changes

 - Simplify and unify `ProvidesHoodieConfig` configuration fusion from 
different sources
 - Fixing CTAS to override "hoodie.combine.before.insert" as "false"
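
To illustrate the general idea (a minimal, self-contained sketch; the method name
and the precedence order are illustrative and not the actual `ProvidesHoodieConfig`
API), options from several sources are fused so that later, more specific sources
override earlier ones:

    import java.util.HashMap;
    import java.util.LinkedHashMap;
    import java.util.Map;

    public class ConfigFusionSketch {

      // Fuse option maps so that later sources override earlier ones
      // (e.g. catalog properties < table properties < session conf < per-statement overrides).
      @SafeVarargs
      static Map<String, String> fuse(Map<String, String>... sourcesInIncreasingPrecedence) {
        Map<String, String> fused = new LinkedHashMap<>();
        for (Map<String, String> source : sourcesInIncreasingPrecedence) {
          fused.putAll(source);
        }
        return fused;
      }

      public static void main(String[] args) {
        Map<String, String> sessionConf = new HashMap<>();
        sessionConf.put("hoodie.combine.before.insert", "true");   // e.g. set globally in the Spark SQL conf

        Map<String, String> ctasOverrides = new HashMap<>();
        ctasOverrides.put("hoodie.combine.before.insert", "false"); // per this PR, CTAS forces this off

        Map<String, String> fused = fuse(sessionConf, ctasOverrides);
        System.out.println(fused.get("hoodie.combine.before.insert")); // false: the override wins
      }
    }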
---
 .../apache/spark/sql/hudi/HoodieOptionConfig.scala |  12 --
 .../spark/sql/hudi/ProvidesHoodieConfig.scala  | 128 +++--
 .../command/CreateHoodieTableAsSelectCommand.scala |   7 +-
 .../hudi/command/MergeIntoHoodieTableCommand.scala |  16 +--
 .../command/procedures/HiveSyncProcedure.scala |   3 +-
 .../spark/sql/hudi/HoodieSparkSqlTestBase.scala|  26 -
 .../apache/spark/sql/hudi/TestCreateTable.scala|  20 +++-
 .../apache/spark/sql/hudi/TestInsertTable.scala|   9 ++
 8 files changed, 131 insertions(+), 90 deletions(-)

diff --git 
a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/HoodieOptionConfig.scala
 
b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/HoodieOptionConfig.scala
index 7da69b2c0bb..bb682cf9b5f 100644
--- 
a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/HoodieOptionConfig.scala
+++ 
b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/HoodieOptionConfig.scala
@@ -162,18 +162,6 @@ object HoodieOptionConfig {
   .toMap
   }
 
-  /**
-   * Get the primary key from the table options.
-   * @param options
-   * @return
-   */
-  def getPrimaryColumns(options: Map[String, String]): Array[String] = {
-val params = mapSqlOptionsToDataSourceWriteConfigs(options)
-params.get(DataSourceWriteOptions.RECORDKEY_FIELD.key)
-  .map(_.split(",").filter(_.nonEmpty))
-  .getOrElse(Array.empty)
-  }
-
   /**
* Get the table type from the table options.
* @param options
diff --git 
a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/ProvidesHoodieConfig.scala
 
b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/ProvidesHoodieConfig.scala
index bf6b6509b95..77ff939cf26 100644
--- 
a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/ProvidesHoodieConfig.scala
+++ 
b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/ProvidesHoodieConfig.scala
@@ -17,8 +17,10 @@
 
 package org.apache.spark.sql.hudi
 
+import org.apache.hudi.DataSourceWriteOptions
 import org.apache.hudi.DataSourceWriteOptions._
-import org.apache.hudi.common.config.TypedProperties
+import org.apache.hudi.HoodieConversionUtils.toProperties
+import org.apache.hudi.common.config.{DFSPropertiesConfiguration, 
TypedProperties}
 import org.apache.hudi.common.model.{OverwriteWithLatestAvroPayload, 
WriteOperationType}
 import org.apache.hudi.common.table.HoodieTableConfig
 import org.apache.hudi.config.HoodieWriteConfig.TBL_NAME
@@ -28,12 +30,13 @@ import org.apache.hudi.hive.{HiveSyncConfig, 
HiveSyncConfigHolder, MultiPartKeys
 import org.apache.hudi.keygen.ComplexKeyGenerator
 import org.apache.hudi.sql.InsertMode
 import org.apache.hudi.sync.common.HoodieSyncConfig
-import org.apache.hudi.{DataSourceWriteOptions, HoodieWriterUtils}
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.catalyst.catalog.HoodieCatalogTable
 import org.apache.spark.sql.hive.HiveExternalCatalog
-import org.apache.spark.sql.hudi.HoodieSqlCommonUtils.{isUsingHiveCatalog, 
withSparkConf}
+import 
org.apache.spark.sql.hudi.HoodieOptionConfig.mapSqlOptionsToDataSourceWriteConfigs
+import org.apache.spark.sql.hudi.HoodieSql

[hudi] branch master updated: [HUDI-5384] Adding optimization rule to appropriately push down filters into the `HoodieFileIndex` (#7423)

2023-01-20 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
 new b1552eff7af [HUDI-5384] Adding optimization rule to appropriately push 
down filters into the `HoodieFileIndex` (#7423)
b1552eff7af is described below

commit b1552eff7af5fee8f8b11d051a147240ef619689
Author: Alexey Kudinkin 
AuthorDate: Fri Jan 20 08:29:30 2023 -0800

[HUDI-5384] Adding optimization rule to appropriately push down filters 
into the `HoodieFileIndex` (#7423)

### Change Logs

This is a follow-up for https://github.com/apache/hudi/pull/6680

After transitioning `HoodieFileIndex` to do file-listing _lazily_ by default, the
following issue has been uncovered: due to

 - Listing now being delayed (until actual execution of the `FileSourceScanExec`
node)
 - Spark not providing a generic `Rule` to push down the predicates (it has
`PruneFileSourcePartitions`, but it's only applicable to `CatalogFileIndex`)

statistics (based on the `FileIndex`) for Hudi's relations have been incorrectly
estimated, since partition-predicate push-down to `HoodieFileIndex` is now delayed
until execution time.

To work around this, we're introducing a new `HoodiePruneFileSourcePartitions` rule
that

 - Structurally borrows from `PruneFileSourcePartitions`
 - Pushes down predicates to `HoodieFileIndex` to perform partition-pruning in
time, before the subsequent CBO stage
 - Addresses the issue of statistics for Hudi's relations being incorrectly
estimated

For more details on the impact of `HoodiePruneFileSourcePartitions`, please check
out the corresponding `TestHoodiePruneFileSourcePartitions`; a conceptual sketch of
the pruning idea follows below.
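
A conceptual Java sketch of why pruning before statistics estimation matters (the
types, sizes, and method names are hypothetical stand-ins, not the actual Catalyst
rule or the `HoodieFileIndex` API):

    import java.util.Arrays;
    import java.util.List;
    import java.util.function.Predicate;

    public class PruneSketch {

      // Hypothetical stand-in for one file tracked by a file index.
      static final class PartitionFile {
        final String partitionPath;
        final long sizeInBytes;
        PartitionFile(String partitionPath, long sizeInBytes) {
          this.partitionPath = partitionPath;
          this.sizeInBytes = sizeInBytes;
        }
      }

      // The gist of the rule: apply partition predicates to the file index *before*
      // statistics are estimated, so the optimizer sees the pruned size rather than
      // the size of the whole table.
      static long estimateSizeAfterPruning(List<PartitionFile> files, Predicate<String> partitionPredicate) {
        return files.stream()
            .filter(f -> partitionPredicate.test(f.partitionPath))
            .mapToLong(f -> f.sizeInBytes)
            .sum();
      }

      public static void main(String[] args) {
        List<PartitionFile> files = Arrays.asList(
            new PartitionFile("dt=2023-01-01", 512L * 1024 * 1024),
            new PartitionFile("dt=2023-01-02", 512L * 1024 * 1024),
            new PartitionFile("dt=2023-01-03", 512L * 1024 * 1024));

        // Without pruning, statistics would reflect all three partitions (~1.5 GB);
        // with the predicate pushed down in time, only the matching partition counts.
        System.out.println(estimateSizeAfterPruning(files, p -> p.equals("dt=2023-01-03")) + " bytes");
      }
    }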
---
 .../client/common/HoodieSparkEngineContext.java|  10 +-
 .../spark/sql/HoodieCatalystExpressionUtils.scala  |  38 ++--
 .../org/apache/spark/sql/hudi/SparkAdapter.scala   |  23 ++-
 .../hudi/testutils/HoodieClientTestHarness.java|   7 +-
 .../apache/hudi/common/util/TablePathUtils.java|   4 +
 .../scala/org/apache/hudi/HoodieBaseRelation.scala |   4 +-
 .../scala/org/apache/hudi/HoodieFileIndex.scala|  25 ++-
 .../apache/hudi/SparkHoodieTableFileIndex.scala|   2 +
 .../sql/hudi/HoodieSparkSessionExtension.scala |  15 +-
 .../spark/sql/hudi/analysis/HoodieAnalysis.scala   |  55 +++--
 .../analysis/HoodiePruneFileSourcePartitions.scala | 124 +++
 .../org/apache/hudi/TestHoodieFileIndex.scala  |  12 +-
 .../hudi/TestNestedSchemaPruningOptimization.scala |   3 +
 .../TestHoodiePruneFileSourcePartitions.scala  | 228 +
 .../sql/HoodieSpark2CatalystExpressionUtils.scala  |  54 -
 .../apache/spark/sql/adapter/Spark2Adapter.scala   |  25 ++-
 .../sql/HoodieSpark3CatalystExpressionUtils.scala  |  32 +++
 .../spark/sql/adapter/BaseSpark3Adapter.scala  |  40 ++--
 .../sql/HoodieSpark31CatalystExpressionUtils.scala |   4 +-
 .../sql/HoodieSpark32CatalystExpressionUtils.scala |   4 +-
 .../sql/HoodieSpark33CatalystExpressionUtils.scala |   2 +-
 21 files changed, 623 insertions(+), 88 deletions(-)

diff --git 
a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/common/HoodieSparkEngineContext.java
 
b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/common/HoodieSparkEngineContext.java
index d8281d1a10b..c97cb78d8c9 100644
--- 
a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/common/HoodieSparkEngineContext.java
+++ 
b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/common/HoodieSparkEngineContext.java
@@ -57,15 +57,15 @@ public class HoodieSparkEngineContext extends 
HoodieEngineContext {
 
   private static final Logger LOG = 
LogManager.getLogger(HoodieSparkEngineContext.class);
   private final JavaSparkContext javaSparkContext;
-  private SQLContext sqlContext;
+  private final SQLContext sqlContext;
 
   public HoodieSparkEngineContext(JavaSparkContext jsc) {
-super(new SerializableConfiguration(jsc.hadoopConfiguration()), new 
SparkTaskContextSupplier());
-this.javaSparkContext = jsc;
-this.sqlContext = SQLContext.getOrCreate(jsc.sc());
+this(jsc, SQLContext.getOrCreate(jsc.sc()));
   }
 
-  public void setSqlContext(SQLContext sqlContext) {
+  public HoodieSparkEngineContext(JavaSparkContext jsc, SQLContext sqlContext) 
{
+super(new SerializableConfiguration(jsc.hadoopConfiguration()), new 
SparkTaskContextSupplier());
+this.javaSparkContext = jsc;
 this.sqlContext = sqlContext;
   }
 
diff --git 
a/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/HoodieCatalystExpressionUtils.scala
 
b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/HoodieCatalystExpressionUtils.scala
index 24e50613f50..8a609d7d532 100644
--- 
a/hudi-client/hudi-spark-client/src/main/scala/org

[hudi] branch master updated: [HUDI-4911][HUDI-3301] Fixing `HoodieMetadataLogRecordReader` to avoid flushing cache for every lookup (#6782)

2023-01-19 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
 new f8028a400eb [HUDI-4911][HUDI-3301] Fixing 
`HoodieMetadataLogRecordReader` to avoid flushing cache for every lookup (#6782)
f8028a400eb is described below

commit f8028a400ebe7a64d7745fae90e769306a8a07a7
Author: Alexey Kudinkin 
AuthorDate: Thu Jan 19 09:10:13 2023 -0800

[HUDI-4911][HUDI-3301] Fixing `HoodieMetadataLogRecordReader` to avoid 
flushing cache for every lookup (#6782)

Currently, HoodieMetadataLogRecordReader flushes its cache for every lookup it
does, which has led to multiple occasions of poor performance, since the metadata
table (MT) had to be re-scanned over and over again even though nothing had changed.

This PR rectifies that; additionally it

 - Makes sure LogRecordScanner is not extended for the MT (there's no reason for
that; instead its configuration is simply expanded to accommodate the MT use-case)
 - Refines and cleans up the API provided by HoodieMetadataLogRecordReader
 - Avoids unnecessary locking

A minimal sketch of the scan-once-and-cache idea follows below.
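
The sketch below uses hypothetical record keys and values standing in for the merged
metadata-table records (it is not the actual `HoodieMetadataLogRecordReader` API):
scan the log files once, keep the merged records cached, and serve subsequent
lookups from the cache instead of flushing it.

    import java.util.Arrays;
    import java.util.HashMap;
    import java.util.Map;

    public class MetadataLookupCacheSketch {

      // Hypothetical stand-in for merged metadata-table records, keyed by file name.
      private final Map<String, String> mergedRecords = new HashMap<>();
      private boolean scanned = false;

      // Simulates an expensive scan of the metadata table's log files; it now runs
      // at most once instead of being repeated after every cache flush.
      private void scanIfNeeded() {
        if (scanned) {
          return;
        }
        mergedRecords.put("partition1/file1.parquet", "stats-1");
        mergedRecords.put("partition1/file2.parquet", "stats-2");
        scanned = true;
      }

      // Lookups are served from the cached records; nothing is flushed per lookup.
      public String lookup(String key) {
        scanIfNeeded();
        return mergedRecords.getOrDefault(key, "<missing>");
      }

      public static void main(String[] args) {
        MetadataLookupCacheSketch reader = new MetadataLookupCacheSketch();
        for (String key : Arrays.asList("partition1/file1.parquet", "partition1/file2.parquet")) {
          System.out.println(key + " -> " + reader.lookup(key));
        }
      }
    }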
---
 .../cli/commands/TestHoodieLogFileCommand.java |   3 +-
 .../hudi/cli/integ/ITTestRepairsCommand.java   |   5 +-
 .../common/table/log/HoodieFileSliceReader.java|   2 +-
 .../java/org/apache/hudi/io/HoodieMergeHandle.java |   6 +-
 .../apache/hudi/io/HoodieSortedMergeHandle.java|   3 +-
 .../HoodieLogCompactionPlanGenerator.java  |   3 +-
 .../functional/TestHoodieBackedMetadata.java   |  15 +-
 .../functional/TestHoodieBackedTableMetadata.java  |  15 +-
 .../TestHoodieClientOnMergeOnReadStorage.java  |   4 +-
 .../hudi/testutils/HoodieClientTestHarness.java|   2 +-
 .../org/apache/hudi/BaseHoodieTableFileIndex.java  |   3 +-
 .../table/log/AbstractHoodieLogRecordReader.java   | 204 +++--
 .../table/log/HoodieMergedLogRecordScanner.java| 206 ++
 .../table/log/HoodieUnMergedLogRecordScanner.java  |  13 +-
 .../apache/hudi/common/util/CollectionUtils.java   |   4 +-
 .../util/collection/ExternalSpillableMap.java  |   2 +
 .../hudi/metadata/HoodieBackedTableMetadata.java   |  75 +++--
 .../metadata/HoodieMetadataLogRecordReader.java| 238 
 .../HoodieMetadataMergedLogRecordReader.java   | 254 -
 .../common/functional/TestHoodieLogFormat.java | 315 ++---
 .../apache/hudi/common/model/TestHoodieRecord.java |   3 +-
 .../hudi/common/testutils/SchemaTestUtil.java  |  25 +-
 .../common/util/collection/TestBitCaskDiskMap.java |  26 +-
 .../util/collection/TestExternalSpillableMap.java  |  27 +-
 .../util/collection/TestRocksDbBasedMap.java   |   3 +-
 .../common/util/collection/TestRocksDbDiskMap.java |  14 +-
 .../src/main/scala/org/apache/hudi/Iterators.scala |  55 ++--
 .../sql/hudi/procedure/TestRepairsProcedure.scala  |   5 +-
 28 files changed, 950 insertions(+), 580 deletions(-)

diff --git 
a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestHoodieLogFileCommand.java
 
b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestHoodieLogFileCommand.java
index 21e6218dbe2..aff12422f6a 100644
--- 
a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestHoodieLogFileCommand.java
+++ 
b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestHoodieLogFileCommand.java
@@ -194,7 +194,8 @@ public class TestHoodieLogFileCommand extends 
CLIFunctionalTestHarness {
   .withFileExtension(HoodieLogFile.DELTA_EXTENSION)
   
.withFileId("test-log-fileid1").overBaseCommit(INSTANT_TIME).withFs(fs).withSizeThreshold(500).build();
 
-  List records1 = 
SchemaTestUtil.generateHoodieTestRecords(0, 
100).stream().map(HoodieAvroIndexedRecord::new).collect(Collectors.toList());
+  SchemaTestUtil testUtil = new SchemaTestUtil();
+  List records1 = testUtil.generateHoodieTestRecords(0, 
100).stream().map(HoodieAvroIndexedRecord::new).collect(Collectors.toList());
   Map header = new HashMap<>();
   header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, INSTANT_TIME);
   header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
diff --git 
a/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestRepairsCommand.java 
b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestRepairsCommand.java
index 69db47136e9..a95ed9ff778 100644
--- a/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestRepairsCommand.java
+++ b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestRepairsCommand.java
@@ -86,8 +86,9 @@ public class ITTestRepairsCommand extends 
HoodieCLIIntegrationTestBase {
 Schema schema = 
HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema());
 
 // generate 200 records
-HoodieRecord[] hoodieRecords1 = 
SchemaTestUtil.generateHoodieTestRecords(0, 100, schema).toArray(new 
HoodieRecord[100]);
-

[hudi] branch master updated (27a8866a1e1 -> 124ab5fda66)

2023-01-18 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


from 27a8866a1e1 [HUDI-5433] Fix the way we deduce the pending instants for 
MDT writes (#7544)
 add 124ab5fda66 [HUDI-5488] Make sure the Disruptor queue starts first, then 
insert records (#7582)

No new revisions were added by this update.

Summary of changes:
 .../apache/hudi/execution/TestDisruptorMessageQueue.java|  2 +-
 .../common/util/queue/BaseHoodieQueueBasedExecutor.java |  3 +++
 .../apache/hudi/common/util/queue/DisruptorExecutor.java| 13 -
 .../hudi/common/util/queue/DisruptorMessageQueue.java   | 13 -
 4 files changed, 24 insertions(+), 7 deletions(-)



[hudi] branch master updated: [MINOR] Unify naming for record merger (#7660)

2023-01-18 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
 new ec5022b4fdd [MINOR] Unify naming for record merger (#7660)
ec5022b4fdd is described below

commit ec5022b4fdd94c106bf243038aadf781f31b5be9
Author: Jon Vexler 
AuthorDate: Wed Jan 18 12:30:52 2023 -0500

[MINOR] Unify naming for record merger (#7660)

Addressing inconsistent usage of "record merger impls"/"record merger 
strategy" and "merger impls"/"merger strategy"
---
 .../org/apache/hudi/config/HoodieWriteConfig.java  | 31 +-
 .../functional/TestHoodieBackedMetadata.java   |  6 ++--
 .../hudi/common/model/HoodieAvroRecordMerger.java  |  2 +-
 .../hudi/common/table/HoodieTableConfig.java   | 20 ++--
 .../hudi/common/table/HoodieTableMetaClient.java   | 38 +++---
 .../apache/hudi/common/util/HoodieRecordUtils.java |  4 +--
 .../apache/hudi/streamer/FlinkStreamerConfig.java  | 15 +
 .../apache/hudi/hadoop/TestInputPathHandler.java   |  2 +-
 .../scala/org/apache/hudi/DataSourceOptions.scala  |  4 +--
 .../scala/org/apache/hudi/HoodieBaseRelation.scala | 14 
 .../scala/org/apache/hudi/HoodieWriterUtils.scala  |  4 +--
 .../src/main/scala/org/apache/hudi/Iterators.scala |  4 +--
 .../scala/org/apache/hudi/cdc/HoodieCDCRDD.scala   |  4 +--
 .../apache/spark/sql/hudi/HoodieOptionConfig.scala | 10 +++---
 .../spark/sql/hudi/ProvidesHoodieConfig.scala  |  4 +--
 .../hudi/command/MergeIntoHoodieTableCommand.scala |  2 +-
 .../apache/hudi/functional/TestCOWDataSource.scala |  2 +-
 .../apache/hudi/functional/TestMORDataSource.scala |  2 +-
 .../ReadAndWriteWithoutAvroBenchmark.scala |  6 ++--
 .../spark/sql/hudi/HoodieSparkSqlTestBase.scala|  4 +--
 .../spark/sql/hudi/TestHoodieOptionConfig.scala|  4 +--
 .../apache/spark/sql/hudi/TestInsertTable.scala|  2 +-
 .../org/apache/spark/sql/hudi/TestSpark3DDL.scala  |  2 +-
 .../org/apache/spark/sql/hudi/TestSqlConf.scala|  2 +-
 .../hudi/utilities/deltastreamer/DeltaSync.java|  6 ++--
 .../deltastreamer/TestHoodieDeltaStreamer.java |  2 +-
 rfc/rfc-46/rfc-46.md   |  2 +-
 27 files changed, 100 insertions(+), 98 deletions(-)

diff --git 
a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java
 
b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java
index 879136206e2..7367826e50d 100644
--- 
a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java
+++ 
b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java
@@ -135,17 +135,17 @@ public class HoodieWriteConfig extends HoodieConfig {
   .withDocumentation("Payload class used. Override this, if you like to 
roll your own merge logic, when upserting/inserting. "
   + "This will render any value set for PRECOMBINE_FIELD_OPT_VAL 
in-effective");
 
-  public static final ConfigProperty<String> MERGER_IMPLS = ConfigProperty
-  .key("hoodie.datasource.write.merger.impls")
+  public static final ConfigProperty<String> RECORD_MERGER_IMPLS = 
ConfigProperty
+  .key("hoodie.datasource.write.record.merger.impls")
   .defaultValue(HoodieAvroRecordMerger.class.getName())
   .withDocumentation("List of HoodieMerger implementations constituting 
Hudi's merging strategy -- based on the engine used. "
-  + "These merger impls will filter by 
hoodie.datasource.write.merger.strategy "
+  + "These merger impls will filter by 
hoodie.datasource.write.record.merger.strategy "
   + "Hudi will pick most efficient implementation to perform 
merging/combining of the records (during update, reading MOR table, etc)");
 
-  public static final ConfigProperty<String> MERGER_STRATEGY = ConfigProperty
-  .key("hoodie.datasource.write.merger.strategy")
+  public static final ConfigProperty<String> RECORD_MERGER_STRATEGY = 
ConfigProperty
+  .key("hoodie.datasource.write.record.merger.strategy")
   .defaultValue(HoodieRecordMerger.DEFAULT_MERGER_STRATEGY_UUID)
-  .withDocumentation("Id of merger strategy. Hudi will pick 
HoodieRecordMerger implementations in hoodie.datasource.write.merger.impls 
which has the same merger strategy id");
+  .withDocumentation("Id of merger strategy. Hudi will pick 
HoodieRecordMerger implementations in 
hoodie.datasource.write.record.merger.impls which has the same merger strategy 
id");
 
  public static final ConfigProperty<String> KEYGENERATOR_CLASS_NAME = 
ConfigProperty
   .key("hoodie.datasource.write.keygenerator.class")
@@ -971,12 +971,12 @@ public class HoodieWriteC

[hudi] branch master updated (dc0f880c1b2 -> 672751975d0)

2023-01-09 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


from dc0f880c1b2 [HUDI-5504] Fix concurrency conflict for flink async 
compaction with latency marker (#7609)
 add 672751975d0 [HUDI-5484] Avoid using `GenericRecord` in 
`HoodieColumnStatMetadata` (#7573)

No new revisions were added by this update.

Summary of changes:
 .../hudi/metadata/HoodieMetadataPayload.java   |  7 --
 .../java/org/apache/hudi/TestDataSourceUtils.java  | 29 ++
 2 files changed, 34 insertions(+), 2 deletions(-)



[hudi] branch release-0.12.2-blockers-candidate updated (6fa192aac75 -> 492f7d737ec)

2022-12-15 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a change to branch release-0.12.2-blockers-candidate
in repository https://gitbox.apache.org/repos/asf/hudi.git


from 6fa192aac75 [HUDI-4871][HUDI-4411] Upgrade to spark 3.3.1 & 3.2.2 
(#7155)
 add 492f7d737ec [HUDI-5097] Fix partition reading without partition fields 
table config (#7069)

No new revisions were added by this update.

Summary of changes:
 .../java/org/apache/hudi/BaseHoodieTableFileIndex.java | 14 +-
 1 file changed, 1 insertion(+), 13 deletions(-)



[hudi] branch release-0.12.2-blockers-candidate updated (0542f56733e -> 6fa192aac75)

2022-12-15 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a change to branch release-0.12.2-blockers-candidate
in repository https://gitbox.apache.org/repos/asf/hudi.git


from 0542f56733e [HUDI-5223] Partial failover for flink (#7208)
 add 6fa192aac75 [HUDI-4871][HUDI-4411] Upgrade to spark 3.3.1 & 3.2.2 
(#7155)

No new revisions were added by this update.

Summary of changes:
 README.md  |  4 ++--
 .../hudi/client/utils/SparkInternalSchemaConverter.java|  2 +-
 packaging/bundle-validation/ci_run.sh  |  8 
 pom.xml| 14 +++---
 4 files changed, 14 insertions(+), 14 deletions(-)



[hudi] branch master updated (8d13a7e383c -> 3a568033f14)

2022-12-15 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


from 8d13a7e383c [HUDI-5023] Consuming records from Iterator directly 
instead of using inner message queue (#7174)
 add 3a568033f14 [HUDI-4871][HUDI-4411] Upgrade to spark 3.3.1 & 3.2.2 
(#7155)

No new revisions were added by this update.

Summary of changes:
 README.md  |  4 ++--
 .../hudi/client/utils/SparkInternalSchemaConverter.java|  2 +-
 packaging/bundle-validation/ci_run.sh  |  8 
 pom.xml| 14 +++---
 4 files changed, 14 insertions(+), 14 deletions(-)



[hudi] branch master updated (972408b55fc -> 8d13a7e383c)

2022-12-15 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


from 972408b55fc [HUDI-5251] Split GitHub actions CI by spark and flink 
(#7265)
 add 8d13a7e383c [HUDI-5023] Consuming records from Iterator directly 
instead of using inner message queue (#7174)

No new revisions were added by this update.

Summary of changes:
 .../org/apache/hudi/config/HoodieWriteConfig.java  |   8 +-
 .../table/action/commit/HoodieMergeHelper.java |   6 +-
 ...edExecutorFactory.java => ExecutorFactory.java} |   9 +-
 .../hudi/execution/FlinkLazyInsertIterable.java|   7 +-
 .../hudi/execution/JavaLazyInsertIterable.java |   6 +-
 .../hudi/execution/SparkLazyInsertIterable.java|   6 +-
 .../hudi/execution/TestSimpleExecutionInSpark.java | 242 +
 .../util/queue/BaseHoodieQueueBasedExecutor.java   |   4 +-
 .../hudi/common/util/queue/ExecutorType.java   |  10 +-
 .../hudi/common/util/queue/HoodieExecutor.java |   6 +-
 .../common/util/queue/SimpleHoodieExecutor.java|  92 
 .../benchmark/BoundInMemoryExecutorBenchmark.scala |  41 +++-
 12 files changed, 404 insertions(+), 33 deletions(-)
 rename 
hudi-client/hudi-client-common/src/main/java/org/apache/hudi/util/{QueueBasedExecutorFactory.java
 => ExecutorFactory.java} (88%)
 create mode 100644 
hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/execution/TestSimpleExecutionInSpark.java
 create mode 100644 
hudi-common/src/main/java/org/apache/hudi/common/util/queue/SimpleHoodieExecutor.java



[hudi] branch release-0.12.2-blockers-candidate updated (292630b4808 -> ee8c9dfe97b)

2022-12-14 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a change to branch release-0.12.2-blockers-candidate
in repository https://gitbox.apache.org/repos/asf/hudi.git


from 292630b4808 Avoiding costly lookups into the schema cache in 
`SqlTypedRecord`
 add ee8c9dfe97b Fixing schemas used for bootstrap reader

No new revisions were added by this update.

Summary of changes:
 .../org/apache/hudi/table/action/commit/HoodieMergeHelper.java | 10 --
 .../org/apache/hudi/table/action/commit/FlinkMergeHelper.java  |  9 -
 .../org/apache/hudi/table/action/commit/JavaMergeHelper.java   |  9 -
 3 files changed, 24 insertions(+), 4 deletions(-)



[hudi] branch release-0.12.2-blockers-candidate updated (4e205d20595 -> 292630b4808)

2022-12-14 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a change to branch release-0.12.2-blockers-candidate
in repository https://gitbox.apache.org/repos/asf/hudi.git


omit 4e205d20595 Disable some metadata bootstrap tests
omit 51f15f500b3 [HUDI-5348] Cache file slices in HoodieBackedTableMetadata 
(#7436)
omit 70e4615c26a [HUDI-5296] Allow disable schema on read after enabling 
(#7421)
omit 4085f27cfb0 [HUDI-5078] Fixing isTableService for replace commits 
(#7037)
omit 6e6940fc59e [HUDI-5353] Close file readers (#7412)
omit 68361fae88e [MINOR] Fix Out of Bounds Exception for 
DayBasedCompactionStrategy (#7360)
omit 39031d3c9ef [HUDI-5372] Fix NPE caused by alter table add column. 
(#7236)
omit 438f3ab6ae3 [HUDI-5347] Cleaned up transient state from 
`ExpressionPayload` making it non-serializable (#7424)
omit 08b414dd15c [HUDI-5336] Fixing parsing of log files while building 
file groups (#7393)
omit 7b8a7208602 [HUDI-5338] Adjust coalesce behavior within NONE sort mode 
for bulk insert (#7396)
omit bca85f376c1 [HUDI-5342] Add new bulk insert sort modes repartitioning 
data by partition path (#7402)
omit 172c438d64b [HUDI-5358] Fix flaky tests in 
TestCleanerInsertAndCleanByCommits (#7420)
omit f1d643e8f9e [HUDI-5350] Fix oom cause compaction event lost problem 
(#7408)
omit d156989 [HUDI-5346][HUDI-5320] Fixing Create Table as Select 
(CTAS) performance gaps (#7370)
omit 725a9b210a1 [HUDI-5291] Fixing NPE in MOR column stats accounting 
(#7349)
omit 4a28b8389f9 [HUDI-5345] Avoid fs.exists calls for metadata table in 
HFileBootstrapIndex (#7404)
omit 8510aacba8e [HUDI-5347] FIxing performance traps in Spark SQL `MERGE 
INTO` implementation (#7395)
 add ee60a6a23e7 [HUDI-5345] Avoid fs.exists calls for metadata table in 
HFileBootstrapIndex (#7404)
 add 6fa7ff9bddb [HUDI-5291] Fixing NPE in MOR column stats accounting 
(#7349)
 add 0e17e18c4a0 [HUDI-5346][HUDI-5320] Fixing Create Table as Select 
(CTAS) performance gaps (#7370)
 add 1a2ffddd064 [HUDI-5350] Fix oom cause compaction event lost problem 
(#7408)
 add 5d4f59c734c [HUDI-5358] Fix flaky tests in 
TestCleanerInsertAndCleanByCommits (#7420)
 add fb552aee6f1 [HUDI-5342] Add new bulk insert sort modes repartitioning 
data by partition path (#7402)
 add bbde3a93e8b [HUDI-5338] Adjust coalesce behavior within NONE sort mode 
for bulk insert (#7396)
 add 01ba6502aac [HUDI-5336] Fixing parsing of log files while building 
file groups (#7393)
 add 5e312233f5d [HUDI-5372] Fix NPE caused by alter table add column. 
(#7236)
 add 8f5723b9f53 [MINOR] Fix Out of Bounds Exception for 
DayBasedCompactionStrategy (#7360)
 add 106d216af2e [HUDI-5353] Close file readers (#7412)
 add 1743d059b6c [HUDI-5078] Fixing isTableService for replace commits 
(#7037)
 add 8bc9fa5ca95 [HUDI-5296] Allow disable schema on read after enabling 
(#7421)
 add 4fd25ca804f [HUDI-5348] Cache file slices in HoodieBackedTableMetadata 
(#7436)
 add 292630b4808 Avoiding costly lookups into the schema cache in 
`SqlTypedRecord`

This update added new revisions after undoing existing revisions.
That is to say, some revisions that were in the old version of the
branch are not in the new version.  This situation occurs
when a user --force pushes a change and generates a repository
containing something like this:

 * -- * -- B -- O -- O -- O   (4e205d20595)
\
 N -- N -- N   refs/heads/release-0.12.2-blockers-candidate 
(292630b4808)

You should already have received notification emails for all of the O
revisions, and so the following emails describe only the N revisions
from the common base, B.

Any revisions marked "omit" are not gone; other references still
refer to them.  Any revisions marked "discard" are gone forever.

No new revisions were added by this update.

Summary of changes:
 .../org/apache/hudi/AvroConversionUtils.scala  |   3 -
 .../org/apache/hudi/sql/IExpressionEvaluator.java} |  23 +-
 .../org/apache/hudi/HoodieSparkSqlWriter.scala |  59 +---
 .../sql/hudi/command/payload/SqlTypedRecord.scala  |  73 +
 .../hudi/command/payload/ExpressionCodeGen.scala   | 193 +++
 .../hudi/command/payload/ExpressionPayload.scala   | 352 -
 .../functional/TestDataSourceForBootstrap.scala|   6 +-
 .../sql/catalyst/expressions/SafeProjection.scala  |  75 -
 8 files changed, 411 insertions(+), 373 deletions(-)
 copy 
hudi-spark-datasource/hudi-spark-common/src/main/{scala/org/apache/spark/sql/catalyst/trees/HoodieLeafLike.scala
 => java/org/apache/hudi/sql/IExpressionEvaluator.java} (64%)
 create mode 100644 
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/payload/SqlTypedRecord.scala
 create mode 100644 
hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/payload/Expressi

[hudi] branch release-0.12.2-blockers-candidate updated (51af3e5f943 -> 51f15f500b3)

2022-12-13 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a change to branch release-0.12.2-blockers-candidate
in repository https://gitbox.apache.org/repos/asf/hudi.git


 discard 51af3e5f943 [HUDI-5348] Cache file slices in HoodieBackedTableMetadata 
(#7436)
 discard 5a6b4de0e04 [HUDI-5296] Allow disable schema on read after enabling 
(#7421)
 discard 7538e7e1512 [HUDI-5078] Fixing isTableService for replace commits 
(#7037)
 discard 1c0c379df92 [HUDI-5353] Close file readers (#7412)
 discard f11edd53c78 [MINOR] Fix Out of Bounds Exception for 
DayBasedCompactionStrategy (#7360)
 discard 4bb8f31d64b [HUDI-5372] Fix NPE caused by alter table add column. 
(#7236)
 discard 25571aa03d0 [HUDI-5347] Cleaned up transient state from 
`ExpressionPayload` making it non-serializable (#7424)
 discard b09e361723d [HUDI-5336] Fixing parsing of log files while building 
file groups (#7393)
 discard e647096286d [HUDI-5338] Adjust coalesce behavior within NONE sort mode 
for bulk insert (#7396)
 discard f33a430b054 [HUDI-5342] Add new bulk insert sort modes repartitioning 
data by partition path (#7402)
 discard ff817d9009c [HUDI-5358] Fix flaky tests in 
TestCleanerInsertAndCleanByCommits (#7420)
 discard c7c74e127d7 [HUDI-5350] Fix oom cause compaction event lost problem 
(#7408)
 discard 34537d29375 [HUDI-5346][HUDI-5320] Fixing Create Table as Select 
(CTAS) performance gaps (#7370)
 discard aecfb40a99a [HUDI-5291] Fixing NPE in MOR column stats accounting 
(#7349)
 discard e56631f34c0 [HUDI-5345] Avoid fs.exists calls for metadata table in 
HFileBootstrapIndex (#7404)
 discard 7ccdbaedb45 [HUDI-5347] FIxing performance traps in Spark SQL `MERGE 
INTO` implementation (#7395)
 discard 025a8db3f1d [HUDI-5344] Fix CVE - upgrade protobuf-java (#6960)
 discard 94860a41dfd [HUDI-5163] Fix failure handling with spark datasource 
write (#7140)
 discard fa2fd8e97ed [HUDI-5344] Fix CVE - upgrade protobuf-java to 3.18.2 
(#6957)
 discard 14004c83f63 [HUDI-5151] Fix bug with broken flink data skipping caused 
by ClassNotFoundException of InLineFileSystem (#7124)
 discard 0c963205084 [HUDI-5253] HoodieMergeOnReadTableInputFormat could have 
duplicate records issue if it contains delta files while still splittable 
(#7264)
 discard 7e3451269a8 [HUDI-5242] Do not fail Meta sync in Deltastreamer when 
inline table service fails (#7243)
 discard 6948ab10020 [HUDI-5277] Close HoodieWriteClient before exiting 
RunClusteringProcedure (#7300)
 discard 44bbfef9a3a [HUDI-5260] Fix insert into sql command with strict sql 
insert mode (#7269)
 discard 7614443d518 [HUDI-5252] ClusteringCommitSink supports to rollback 
clustering (#7263)
 add d4ec501f755 [HUDI-5260] Fix insert into sql command with strict sql 
insert mode (#7269)
 add 5230a11f15d [HUDI-5277] Close HoodieWriteClient before exiting 
RunClusteringProcedure (#7300)
 add a78cb091f94 [HUDI-5242] Do not fail Meta sync in Deltastreamer when 
inline table service fails (#7243)
 add 4ccee729d29 [HUDI-5253] HoodieMergeOnReadTableInputFormat could have 
duplicate records issue if it contains delta files while still splittable 
(#7264)
 add 64a359b5bd8 [HUDI-5151] Fix bug with broken flink data skipping caused 
by ClassNotFoundException of InLineFileSystem (#7124)
 add ab80838fd35 [HUDI-5344] Fix CVE - upgrade protobuf-java to 3.18.2 
(#6957)
 add e3c956284ed [HUDI-5163] Fix failure handling with spark datasource 
write (#7140)
 add 8b294b05639 [HUDI-5344] Fix CVE - upgrade protobuf-java (#6960)
 add 8510aacba8e [HUDI-5347] FIxing performance traps in Spark SQL `MERGE 
INTO` implementation (#7395)
 add 4a28b8389f9 [HUDI-5345] Avoid fs.exists calls for metadata table in 
HFileBootstrapIndex (#7404)
 add 725a9b210a1 [HUDI-5291] Fixing NPE in MOR column stats accounting 
(#7349)
 add d156989 [HUDI-5346][HUDI-5320] Fixing Create Table as Select 
(CTAS) performance gaps (#7370)
 add f1d643e8f9e [HUDI-5350] Fix oom cause compaction event lost problem 
(#7408)
 add 172c438d64b [HUDI-5358] Fix flaky tests in 
TestCleanerInsertAndCleanByCommits (#7420)
 add bca85f376c1 [HUDI-5342] Add new bulk insert sort modes repartitioning 
data by partition path (#7402)
 add 7b8a7208602 [HUDI-5338] Adjust coalesce behavior within NONE sort mode 
for bulk insert (#7396)
 add 08b414dd15c [HUDI-5336] Fixing parsing of log files while building 
file groups (#7393)
 add 438f3ab6ae3 [HUDI-5347] Cleaned up transient state from 
`ExpressionPayload` making it non-serializable (#7424)
 add 39031d3c9ef [HUDI-5372] Fix NPE caused by alter table add column. 
(#7236)
 add 68361fae88e [MINOR] Fix Out of Bounds Exception for 
DayBasedCompactionStrategy (#7360)
 add 6e6940fc59e [HUDI-5353] Close file readers (#7412)
 add 4085f27cfb0 [HUDI-5078] Fixing isTableService for replace commits 
(#7037)
 add 70e4615c26a [HUDI-5296] Allow disable schema on read after enabling 
(#7421)
 add 51f15f500b3 [HUDI

[hudi] branch release-0.12.2-blockers-candidate created (now 51af3e5f943)

2022-12-13 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a change to branch release-0.12.2-blockers-candidate
in repository https://gitbox.apache.org/repos/asf/hudi.git


  at 51af3e5f943 [HUDI-5348] Cache file slices in HoodieBackedTableMetadata 
(#7436)

No new revisions were added by this update.



[hudi] branch release-0.12.2-shadow updated: [HUDI-5347] Cleaned up transient state from `ExpressionPayload` making it non-serializable (#7424)

2022-12-13 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a commit to branch release-0.12.2-shadow
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/release-0.12.2-shadow by this 
push:
 new 4d0a27a90eb [HUDI-5347] Cleaned up transient state from 
`ExpressionPayload` making it non-serializable (#7424)
4d0a27a90eb is described below

commit 4d0a27a90eb9eeab989f39fe8c78ebf44b6c0d8c
Author: Alexey Kudinkin 
AuthorDate: Sun Dec 11 19:25:19 2022 -0800

[HUDI-5347] Cleaned up transient state from `ExpressionPayload` making it 
non-serializable (#7424)

- Internal state (cached records, writer schemas) is removed to make
   sure that the `ExpressionPayload` object is serializable at all times.
- `ExpressionPayload` caches are scoped down to `ThreadLocal`, since
   some of the re-used components (AvroSerializer, AvroDeserializer, SafeProjection)
   have internal mutable state and are therefore not thread-safe; a sketch of this
   pattern follows below.
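
A minimal sketch of the `ThreadLocal` scoping pattern, using `SimpleDateFormat` as a
stand-in for a cheap-to-reuse but non-thread-safe component (such as an Avro
serializer/deserializer or a projection); it is not the actual `ExpressionPayload`
cache code:

    import java.text.SimpleDateFormat;
    import java.util.Date;

    public class ThreadLocalCacheSketch {

      // Instead of sharing one instance across threads, every thread lazily gets
      // and then reuses its own copy of the non-thread-safe component.
      private static final ThreadLocal<SimpleDateFormat> FORMAT_CACHE =
          ThreadLocal.withInitial(() -> new SimpleDateFormat("yyyy-MM-dd"));

      static String format(Date date) {
        return FORMAT_CACHE.get().format(date);
      }

      public static void main(String[] args) throws InterruptedException {
        Runnable task = () ->
            System.out.println(Thread.currentThread().getName() + " -> " + format(new Date()));
        Thread t1 = new Thread(task);
        Thread t2 = new Thread(task);
        t1.start();
        t2.start();
        t1.join();
        t2.join();
      }
    }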
---
 .../hudi/command/payload/ExpressionPayload.scala   | 174 -
 .../sql/catalyst/expressions/SafeProjection.scala  |  75 +
 2 files changed, 172 insertions(+), 77 deletions(-)

diff --git 
a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/payload/ExpressionPayload.scala
 
b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/payload/ExpressionPayload.scala
index 80c795c7d63..8b2cc23aecc 100644
--- 
a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/payload/ExpressionPayload.scala
+++ 
b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/payload/ExpressionPayload.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.sql.hudi.command.payload
 
-import com.github.benmanes.caffeine.cache.Caffeine
+import com.github.benmanes.caffeine.cache.{Cache, Caffeine}
 import org.apache.avro.Schema
 import org.apache.avro.generic.{GenericData, GenericRecord, IndexedRecord}
 import org.apache.hudi.AvroConversionUtils.convertAvroSchemaToStructType
@@ -35,13 +35,12 @@ import org.apache.hudi.io.HoodieWriteHandle
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.avro.{HoodieAvroDeserializer, HoodieAvroSerializer}
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.codegen.GenerateSafeProjection
-import org.apache.spark.sql.catalyst.expressions.{Expression, Projection}
+import org.apache.spark.sql.catalyst.expressions.{Expression, Projection, 
SafeProjection}
 import org.apache.spark.sql.hudi.SerDeUtils
 import org.apache.spark.sql.hudi.command.payload.ExpressionPayload._
 import org.apache.spark.sql.types.BooleanType
 
-import java.util.function.Function
+import java.util.function.{Function, Supplier}
 import java.util.{Base64, Properties}
 import scala.collection.JavaConverters._
 import scala.collection.mutable.ArrayBuffer
@@ -61,26 +60,12 @@ import scala.collection.mutable.ArrayBuffer
  */
 class ExpressionPayload(@transient record: GenericRecord,
 @transient orderingVal: Comparable[_])
-  extends DefaultHoodieRecordPayload(record, orderingVal) {
+  extends DefaultHoodieRecordPayload(record, orderingVal) with Logging {
 
   def this(recordOpt: HOption[GenericRecord]) {
 this(recordOpt.orElse(null), 0)
   }
 
-  /**
-   * Target schema used for writing records into the table
-   */
-  private var writeSchema: Schema = _
-
-  /**
-   * Original record's schema
-   *
-   * NOTE: To avoid excessive overhead of serializing original record's Avro 
schema along
-   *   w/ _every_ record, we instead make it to be provided along with 
every request
-   *   requiring this record to be deserialized
-   */
-  private var recordSchema: Schema = _
-
   override def combineAndGetUpdateValue(currentValue: IndexedRecord,
 schema: Schema): 
HOption[IndexedRecord] = {
 throw new IllegalStateException(s"Should not call this method for 
${getClass.getCanonicalName}")
@@ -93,7 +78,7 @@ class ExpressionPayload(@transient record: GenericRecord,
   override def combineAndGetUpdateValue(targetRecord: IndexedRecord,
 schema: Schema,
 properties: Properties): 
HOption[IndexedRecord] = {
-init(properties)
+val recordSchema = getRecordSchema(properties)
 
 val sourceRecord = bytesToAvro(recordBytes, recordSchema)
 val joinedRecord = joinRecord(sourceRecord, targetRecord)
@@ -136,8 +121,9 @@ class ExpressionPayload(@transient record: GenericRecord,
   // If the update condition matched  then execute assignment expression
   // to compute final record to update. We will return the first matched 
record.
   if (conditionEvalResult) {
+val writerSchema = getWriterSchema(properties)
 val re

[hudi] branch release-0.12.2-shadow updated (12a0897eac9 -> abb86ca5920)

2022-12-13 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a change to branch release-0.12.2-shadow
in repository https://gitbox.apache.org/repos/asf/hudi.git


from 12a0897eac9 [HUDI-5295] One meta sync failure should not prevent other 
meta sync from occurring (#7367)
 add abb86ca5920 [HUDI-5347] FIxing performance traps in Spark SQL `MERGE 
INTO` implementation (#7395)

No new revisions were added by this update.

Summary of changes:
 .../org/apache/hudi/AvroConversionUtils.scala  |   3 +
 .../java/org/apache/hudi/avro/AvroSchemaUtils.java |  13 +
 .../org/apache/hudi/sql/IExpressionEvaluator.java  |  37 ---
 .../sql/hudi/command/payload/SqlTypedRecord.scala  |  72 -
 .../hudi/command/payload/ExpressionCodeGen.scala   | 193 --
 .../hudi/command/payload/ExpressionPayload.scala   | 292 +
 6 files changed, 203 insertions(+), 407 deletions(-)
 delete mode 100644 
hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/sql/IExpressionEvaluator.java
 delete mode 100644 
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/payload/SqlTypedRecord.scala
 delete mode 100644 
hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/payload/ExpressionCodeGen.scala



[hudi] branch master updated: [HUDI-5296] Allow disable schema on read after enabling (#7421)

2022-12-12 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
 new aacfe6de806 [HUDI-5296] Allow disable schema on read after enabling 
(#7421)
aacfe6de806 is described below

commit aacfe6de806e045a4a02f5f4a15910fb60d40fa7
Author: Sivabalan Narayanan 
AuthorDate: Mon Dec 12 11:39:18 2022 -0800

[HUDI-5296] Allow disable schema on read after enabling (#7421)

If someone has enabled schema on read by mistake and never actually renamed or
dropped a column, it should be feasible to disable schema on read again. This patch
fixes that: essentially, on both the read and the write path, if the
"hoodie.schema.on.read.enable" config is not set, Hudi will fall back to the regular
code path. It might fail, or users might miss data, if they have performed any
irrevocable changes like renames; but for the rest, this should work. A sketch of
the fallback check follows below.
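
A minimal sketch of that check, assuming the flag can come either from the
per-operation options or from the session configuration (the map names are
illustrative; the actual implementation is `isSchemaEvolutionEnabledOnRead` in the
diff below):

    import java.util.HashMap;
    import java.util.Map;

    public class SchemaOnReadFallbackSketch {

      static final String SCHEMA_ON_READ_ENABLE = "hoodie.schema.on.read.enable";

      // Schema on read is only used when explicitly enabled, either per operation
      // or in the session configuration; otherwise the regular path is taken.
      static boolean schemaOnReadEnabled(Map<String, String> optParams, Map<String, String> sessionConf) {
        return Boolean.parseBoolean(optParams.getOrDefault(SCHEMA_ON_READ_ENABLE, "false"))
            || Boolean.parseBoolean(sessionConf.getOrDefault(SCHEMA_ON_READ_ENABLE, "false"));
      }

      public static void main(String[] args) {
        Map<String, String> optParams = new HashMap<>();   // nothing set for this read/write
        Map<String, String> sessionConf = new HashMap<>(); // nothing set in the session either

        if (schemaOnReadEnabled(optParams, sessionConf)) {
          System.out.println("resolve the InternalSchema and evolve the schema on read");
        } else {
          System.out.println("fall back to the regular (non schema-on-read) code path");
        }
      }
    }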
---
 .../scala/org/apache/hudi/HoodieBaseRelation.scala | 20 ++---
 .../org/apache/hudi/HoodieSparkSqlWriter.scala | 90 +-
 .../org/apache/hudi/IncrementalRelation.scala  |  6 +-
 .../org/apache/spark/sql/hudi/TestSpark3DDL.scala  | 54 -
 4 files changed, 105 insertions(+), 65 deletions(-)

diff --git 
a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBaseRelation.scala
 
b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBaseRelation.scala
index afc0781eb1b..9c984b96fb2 100644
--- 
a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBaseRelation.scala
+++ 
b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBaseRelation.scala
@@ -140,7 +140,7 @@ abstract class HoodieBaseRelation(val sqlContext: 
SQLContext,
*/
   protected lazy val (tableAvroSchema: Schema, internalSchemaOpt: 
Option[InternalSchema]) = {
 val schemaResolver = new TableSchemaResolver(metaClient)
-val internalSchemaOpt = if (!isSchemaEvolutionEnabled) {
+val internalSchemaOpt = if (!isSchemaEvolutionEnabledOnRead(optParams, 
sparkSession)) {
   None
 } else {
   Try {
@@ -639,15 +639,6 @@ abstract class HoodieBaseRelation(val sqlContext: 
SQLContext,
 
   private def prunePartitionColumns(dataStructSchema: StructType): StructType =
 StructType(dataStructSchema.filterNot(f => 
partitionColumns.contains(f.name)))
-
-  private def isSchemaEvolutionEnabled = {
-// NOTE: Schema evolution could be configured both t/h optional parameters 
vehicle as well as
-//   t/h Spark Session configuration (for ex, for Spark SQL)
-optParams.getOrElse(DataSourceReadOptions.SCHEMA_EVOLUTION_ENABLED.key,
-  
DataSourceReadOptions.SCHEMA_EVOLUTION_ENABLED.defaultValue.toString).toBoolean 
||
-  sparkSession.conf.get(DataSourceReadOptions.SCHEMA_EVOLUTION_ENABLED.key,
-
DataSourceReadOptions.SCHEMA_EVOLUTION_ENABLED.defaultValue.toString).toBoolean
-  }
 }
 
 object HoodieBaseRelation extends SparkAdapterSupport {
@@ -749,4 +740,13 @@ object HoodieBaseRelation extends SparkAdapterSupport {
 })
 }
   }
+
+  def isSchemaEvolutionEnabledOnRead(optParams: Map[String, String], 
sparkSession: SparkSession): Boolean = {
+// NOTE: Schema evolution could be configured both t/h optional parameters 
vehicle as well as
+//   t/h Spark Session configuration (for ex, for Spark SQL)
+optParams.getOrElse(DataSourceReadOptions.SCHEMA_EVOLUTION_ENABLED.key,
+  
DataSourceReadOptions.SCHEMA_EVOLUTION_ENABLED.defaultValue.toString).toBoolean 
||
+  sparkSession.conf.get(DataSourceReadOptions.SCHEMA_EVOLUTION_ENABLED.key,
+
DataSourceReadOptions.SCHEMA_EVOLUTION_ENABLED.defaultValue.toString).toBoolean
+  }
 }
diff --git 
a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala
 
b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala
index 1427d0c7c9e..f0ede2b2b82 100644
--- 
a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala
+++ 
b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala
@@ -150,14 +150,17 @@ object HoodieSparkSqlWriter {
   // Handle various save modes
   handleSaveModes(sqlContext.sparkSession, mode, basePath, tableConfig, 
tblName, operation, fs)
   val partitionColumns = 
SparkKeyGenUtils.getPartitionColumns(keyGenerator, toProperties(parameters))
-  // Create the table if not present
-  if (!tableExists) {
+  val tableMetaClient = if (tableExists) {
+HoodieTableMetaClient.builder
+  .setConf(sparkContext.hadoopConfiguration)
+  .setBasePath(path)
+  .build()
+  } else {
 val baseFileFormat = 
hoodieConfig.getStringOrDefault(HoodieTableConfig.BASE_FILE_FORMAT)

[hudi] branch master updated: [HUDI-5342] Add new bulk insert sort modes repartitioning data by partition path (#7402)

2022-12-09 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
 new cad739 [HUDI-5342] Add new bulk insert sort modes repartitioning 
data by partition path (#7402)
cad739 is described below

commit cad739ffce1723b5615d8751414b86211c8c
Author: Y Ethan Guo 
AuthorDate: Fri Dec 9 19:04:44 2022 -0800

[HUDI-5342] Add new bulk insert sort modes repartitioning data by partition 
path (#7402)

This PR adds two new bulk insert sort modes, PARTITION_PATH_REPARTITION and
PARTITION_PATH_REPARTITION_AND_SORT, which do the following:

* For a physically partitioned table, repartition the input records by
partition path, limiting the shuffle parallelism to the specified
outputSparkPartitions. For PARTITION_PATH_REPARTITION_AND_SORT, an additional
step sorts the records by partition path within each Spark partition.
* For a physically non-partitioned table, simply coalesce the input rows to
outputSparkPartitions.

New unit tests are added to verify the added functionality.
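
A hedged configuration sketch of how a writer might opt into the new modes (assuming a
DataFrame `df`, a target path `basePath`, and the remaining mandatory write options such
as record key and precombine field are already configured; `hoodie.bulkinsert.sort.mode`
is the standard sort-mode key):

```
// Sketch only: df and basePath are assumed; other required Hudi write options omitted.
df.write
  .format("hudi")
  .option("hoodie.datasource.write.operation", "bulk_insert")
  // or PARTITION_PATH_REPARTITION_AND_SORT for the additional within-partition sort
  .option("hoodie.bulkinsert.sort.mode", "PARTITION_PATH_REPARTITION")
  // bounds the shuffle parallelism (outputSparkPartitions) mentioned above
  .option("hoodie.bulkinsert.shuffle.parallelism", "200")
  .mode("append")
  .save(basePath)
```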
---
 .../org/apache/hudi/config/HoodieWriteConfig.java  | 11 
 .../execution/bulkinsert/BulkInsertSortMode.java   |  8 ++-
 .../java/org/apache/hudi/table/HoodieTable.java|  7 +++
 .../MultipleSparkJobExecutionStrategy.java | 13 ++--
 .../BulkInsertInternalPartitionerFactory.java  |  8 ++-
 ...lkInsertInternalPartitionerWithRowsFactory.java |  7 ++-
 .../bulkinsert/PartitionPathRDDPartitioner.java| 52 
 ...PartitionPathRepartitionAndSortPartitioner.java | 71 ++
 ...nPathRepartitionAndSortPartitionerWithRows.java | 62 +++
 .../PartitionPathRepartitionPartitioner.java   | 70 +
 ...artitionPathRepartitionPartitionerWithRows.java | 60 ++
 .../TestBulkInsertInternalPartitioner.java | 20 +++---
 .../TestBulkInsertInternalPartitionerForRows.java  | 24 +---
 .../function/SerializableFunctionUnchecked.java| 34 +++
 .../hudi/common/table/HoodieTableConfig.java   |  4 ++
 .../org/apache/hudi/HoodieSparkSqlWriter.scala |  8 ++-
 16 files changed, 430 insertions(+), 29 deletions(-)

diff --git 
a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java
 
b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java
index 466bfeaeba..03eb44b001 100644
--- 
a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java
+++ 
b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java
@@ -344,6 +344,17 @@ public class HoodieWriteConfig extends HoodieConfig {
   + "GLOBAL_SORT: this ensures best file sizes, with lowest memory 
overhead at cost of sorting. "
   + "PARTITION_SORT: Strikes a balance by only sorting within a 
partition, still keeping the memory overhead of writing "
   + "lowest and best effort file sizing. "
+  + "PARTITION_PATH_REPARTITION: this ensures that the data for a 
single physical partition in the table is written "
+  + "by the same Spark executor, best for input data evenly 
distributed across different partition paths. "
+  + "This can cause imbalance among Spark executors if the input data 
is skewed, i.e., most records are intended for "
+  + "a handful of partition paths among all. "
+  + "PARTITION_PATH_REPARTITION_AND_SORT: this ensures that the data 
for a single physical partition in the table is written "
+  + "by the same Spark executor, best for input data evenly 
distributed across different partition paths. "
+  + "Compared to PARTITION_PATH_REPARTITION, this sort mode does an 
additional step of sorting the records "
+  + "based on the partition path within a single Spark partition, 
given that data for multiple physical partitions "
+  + "can be sent to the same Spark partition and executor. "
+  + "This can cause imbalance among Spark executors if the input data 
is skewed, i.e., most records are intended for "
+  + "a handful of partition paths among all. "
   + "NONE: No sorting. Fastest and matches `spark.write.parquet()` in 
terms of number of files, overheads");
 
   public static final ConfigProperty EMBEDDED_TIMELINE_SERVER_ENABLE = 
ConfigProperty
diff --git 
a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/execution/bulkinsert/BulkInsertSortMode.java
 
b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/execution/bulkinsert/BulkInsertSortMode.java
index d171b8cd72..6fd06545ae 100644
--- 
a/hudi-cl

[hudi] branch master updated (5498821085 -> c2c0fa0d57)

2022-12-09 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


from 5498821085 [HUDI-4881] Push down filters if possible when syncing 
partitions to Hive (#6725)
 add c2c0fa0d57 [HUDI-5358] Fix flaky tests in 
TestCleanerInsertAndCleanByCommits (#7420)

No new revisions were added by this update.

Summary of changes:
 .../clean/TestCleanerInsertAndCleanByCommits.java  | 136 -
 1 file changed, 104 insertions(+), 32 deletions(-)



[hudi] branch master updated: [HUDI-5346][HUDI-5320] Fixing Create Table as Select (CTAS) performance gaps (#7370)

2022-12-08 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
 new 8de53571e0 [HUDI-5346][HUDI-5320] Fixing Create Table as Select (CTAS) 
performance gaps (#7370)
8de53571e0 is described below

commit 8de53571e0e1e7ea63c7ecf7b646da64eceaef8c
Author: Alexey Kudinkin 
AuthorDate: Thu Dec 8 18:52:47 2022 -0800

[HUDI-5346][HUDI-5320] Fixing Create Table as Select (CTAS) performance 
gaps (#7370)

This PR addresses some of the performance traps detected while
stress-testing Spark SQL's Create Table as Select command:

* Avoids reordering of the columns within CTAS (there is no need for it, since
InsertIntoTableCommand resolves the columns anyway)
* Fixes the validation sequence within InsertIntoTableCommand to first resolve
the columns and then run validation (currently it is done the other way around)
* Propagates properties specified in CTAS to the HoodieSparkSqlWriter (for
example, currently there is no way to disable the metadata table when using
CTAS, precisely because these properties are not propagated)

Additionally, the following improvement to HoodieBulkInsertHelper was made:

* If meta-fields are disabled, the incoming Dataset is no longer dereferenced
into an RDD; instead, stubbed-out meta-fields are added through an additional
Projection
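
A rough sketch of the property propagation in action (assuming a SparkSession with
Hudi's SQL extensions enabled and a `source_table` to select from; table and column
names are illustrative): since CTAS now forwards TBLPROPERTIES to HoodieSparkSqlWriter,
a setting such as `hoodie.metadata.enable = false` takes effect for the CTAS write itself.

```
// Sketch only: table and column names are placeholders.
spark.sql(
  """
    |CREATE TABLE hudi_ctas_target
    |USING hudi
    |TBLPROPERTIES (
    |  type = 'cow',
    |  primaryKey = 'id',
    |  'hoodie.metadata.enable' = 'false'
    |)
    |AS SELECT id, name, ts FROM source_table
    |""".stripMargin)
```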
---
 .../hudi/HoodieDatasetBulkInsertHelper.scala   | 68 ++-
 .../org/apache/spark/sql/HoodieUnsafeUtils.scala   | 12 +++-
 .../org/apache/hudi/HoodieSparkSqlWriter.scala |  7 +-
 .../command/CreateHoodieTableAsSelectCommand.scala | 76 +-
 .../command/InsertIntoHoodieTableCommand.scala |  3 +-
 .../spark/sql/hudi/HoodieSparkSqlTestBase.scala|  4 +-
 .../apache/spark/sql/hudi/TestInsertTable.scala| 22 ---
 .../spark/sql/HoodieSpark2CatalystPlanUtils.scala  | 11 +++-
 8 files changed, 106 insertions(+), 97 deletions(-)

diff --git 
a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieDatasetBulkInsertHelper.scala
 
b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieDatasetBulkInsertHelper.scala
index 296abaf4f5..79fa67acdb 100644
--- 
a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieDatasetBulkInsertHelper.scala
+++ 
b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieDatasetBulkInsertHelper.scala
@@ -33,7 +33,9 @@ import org.apache.spark.internal.Logging
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.HoodieUnsafeRowUtils.{composeNestedFieldPath, 
getNestedInternalRowValue}
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.types.{StringType, StructField, StructType}
+import org.apache.spark.sql.catalyst.expressions.{Alias, Literal}
+import org.apache.spark.sql.catalyst.plans.logical.Project
+import org.apache.spark.sql.types.{DataType, StringType, StructField, 
StructType}
 import org.apache.spark.sql.{DataFrame, Dataset, HoodieUnsafeUtils, Row}
 import org.apache.spark.unsafe.types.UTF8String
 
@@ -58,31 +60,6 @@ object HoodieDatasetBulkInsertHelper extends Logging {
 val populateMetaFields = config.populateMetaFields()
 val schema = df.schema
 
-val keyGeneratorClassName = 
config.getStringOrThrow(HoodieWriteConfig.KEYGENERATOR_CLASS_NAME,
-  "Key-generator class name is required")
-
-val prependedRdd: RDD[InternalRow] =
-  df.queryExecution.toRdd.mapPartitions { iter =>
-val keyGenerator =
-  ReflectionUtils.loadClass(keyGeneratorClassName, new 
TypedProperties(config.getProps))
-.asInstanceOf[SparkKeyGeneratorInterface]
-
-iter.map { row =>
-  val (recordKey, partitionPath) =
-if (populateMetaFields) {
-  (keyGenerator.getRecordKey(row, schema), 
keyGenerator.getPartitionPath(row, schema))
-} else {
-  (UTF8String.EMPTY_UTF8, UTF8String.EMPTY_UTF8)
-}
-  val commitTimestamp = UTF8String.EMPTY_UTF8
-  val commitSeqNo = UTF8String.EMPTY_UTF8
-  val filename = UTF8String.EMPTY_UTF8
-
-  // TODO use mutable row, avoid re-allocating
-  new HoodieInternalRow(commitTimestamp, commitSeqNo, recordKey, 
partitionPath, filename, row, false)
-}
-  }
-
 val metaFields = Seq(
   StructField(HoodieRecord.COMMIT_TIME_METADATA_FIELD, StringType),
   StructField(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD, StringType),
@@ -92,11 +69,44 @@ object HoodieDatasetBulkInsertHelper extends Logging {
 
 val updatedSchema = StructType(metaFields ++ schema.fields)
 
-val updatedDF = if (populateMetaFields && 
config.shouldCombineBeforeInsert) {
-  val dedupedRdd = dedupeRows(prependedRdd, updatedSchema, 
config.getPreCombineField, SparkHoodieIndexFactory.isGlobalIndex(config))
+val updatedDF 

[hudi] branch master updated (da9fef6046 -> 2da69d35d6)

2022-12-08 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


from da9fef6046 [HUDI-5345] Avoid fs.exists calls for metadata table in 
HFileBootstrapIndex (#7404)
 add 2da69d35d6 [HUDI-5291] Fixing NPE in MOR column stats accounting 
(#7349)

No new revisions were added by this update.

Summary of changes:
 .../java/org/apache/hudi/avro/HoodieAvroUtils.java | 18 
 .../cow-updated2-column-stats-index-table.json |  4 ++--
 .../mor-updated2-column-stats-index-table.json |  4 ++--
 ...-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json |  2 +-
 .../hudi/functional/TestColumnStatsIndex.scala | 24 --
 5 files changed, 27 insertions(+), 25 deletions(-)



[hudi] branch master updated (3dc76af95d8 -> 4a825dcdc4e)

2022-12-02 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


from 3dc76af95d8 [HUDI-5302] Fix: compute hash key from recordKey failed 
when recordKeyValue contains ',' (#7342)
 add 4a825dcdc4e [MINOR] Disable the `SparkSqlCoreFlow` tests (#7368)

No new revisions were added by this update.

Summary of changes:
 hudi-spark-datasource/hudi-spark/pom.xml   |  3 +++
 .../org/apache/hudi/functional/SparkSQLCoreFlow.java   | 18 ++
 .../apache/hudi/functional/TestSparkSqlCoreFlow.scala  |  3 +--
 3 files changed, 14 insertions(+), 10 deletions(-)
 copy 
hudi-client/hudi-client-common/src/test/java/org/apache/hudi/testutils/providers/HoodieWriteClientProvider.java
 => 
hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/SparkSQLCoreFlow.java
 (68%)



[hudi] branch master updated: Rebased MOR iterators onto a `CachingIterator` (to be idempotent) (#7334)

2022-11-30 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
 new d7a7285b29 Rebased MOR iterators onto a `CachingIterator` (to be 
idempotent) (#7334)
d7a7285b29 is described below

commit d7a7285b2994453708c060bd7fb5138710ae65bf
Author: Alexey Kudinkin 
AuthorDate: Wed Nov 30 13:31:33 2022 -0800

Rebased MOR iterators onto a `CachingIterator` (to be idempotent) (#7334)

Addresses the invalid semantics of MOR iterators not actually being
idempotent: calling `hasNext` multiple times was advancing the iterator,
potentially skipping elements, for example in cases like:

```
// [[isEmpty]] will invoke [[hasNext]] to check whether the Iterator has any elements
if (iter.isEmpty) {
  // ...
} else {
  // Here [[map]] will invoke [[hasNext]] again, therefore skipping elements
  iter.map { /* ... */ }
}
```
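
A simplified sketch of the caching approach (illustrative only, not the actual Hudi
`CachingIterator`): `hasNext` fetches at most one element ahead and caches it, so
repeated calls no longer advance the underlying source.

```
// Illustrative sketch of an iterator with an idempotent hasNext.
abstract class CachingIteratorSketch[T] extends Iterator[T] {
  private var cached: Option[T] = None

  // Implementations fetch the next record from the underlying source,
  // returning None once it is exhausted.
  protected def fetchNext(): Option[T]

  override def hasNext: Boolean = {
    if (cached.isEmpty) {
      cached = fetchNext() // advance the source only when nothing is cached
    }
    cached.isDefined       // repeated calls are now side-effect free
  }

  override def next(): T = {
    if (!hasNext) throw new NoSuchElementException("next on empty iterator")
    val record = cached.get
    cached = None          // release the cache so the following hasNext fetches again
    record
  }
}
```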
---
 .../scala/org/apache/hudi/HoodieSparkUtils.scala   |  2 +-
 .../{LogFileIterator.scala => Iterators.scala} | 29 ++
 .../org/apache/hudi/util/CachingIterator.scala | 44 ++
 3 files changed, 56 insertions(+), 19 deletions(-)

diff --git 
a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieSparkUtils.scala
 
b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieSparkUtils.scala
index 404d8d9309..bd7d3647b2 100644
--- 
a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieSparkUtils.scala
+++ 
b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieSparkUtils.scala
@@ -93,7 +93,7 @@ object HoodieSparkUtils extends SparkAdapterSupport with 
SparkVersionsSupport {
 
 // NOTE: We're accessing toRdd here directly to avoid [[InternalRow]] to 
[[Row]] conversion
 //   Additionally, we have to explicitly wrap around resulting [[RDD]] 
into the one
-//   injecting [[SQLConf]], which by default isn't propgated by Spark 
to the executor(s).
+//   injecting [[SQLConf]], which by default isn't propagated by Spark 
to the executor(s).
 //   [[SQLConf]] is required by [[AvroSerializer]]
 injectSQLConf(df.queryExecution.toRdd.mapPartitions { rows =>
   if (rows.isEmpty) {
diff --git 
a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/LogFileIterator.scala
 
b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/Iterators.scala
similarity index 95%
rename from 
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/LogFileIterator.scala
rename to 
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/Iterators.scala
index 07a0ce7f23..68b25fafe0 100644
--- 
a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/LogFileIterator.scala
+++ 
b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/Iterators.scala
@@ -34,20 +34,17 @@ import org.apache.hudi.hadoop.config.HoodieRealtimeConfig
 import org.apache.hudi.internal.schema.InternalSchema
 import org.apache.hudi.metadata.{HoodieBackedTableMetadata, 
HoodieTableMetadata}
 import 
org.apache.hudi.metadata.HoodieTableMetadata.getDataTableBasePathFromMetadataTable
-
 import org.apache.avro.Schema
 import org.apache.avro.generic.{GenericRecord, GenericRecordBuilder, 
IndexedRecord}
-
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapred.JobConf
-
+import org.apache.hudi.util.CachingIterator
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.types.StructType
 
 import java.io.Closeable
 import java.util.Properties
-
 import scala.annotation.tailrec
 import scala.collection.JavaConverters._
 import scala.util.Try
@@ -61,7 +58,7 @@ class LogFileIterator(split: HoodieMergeOnReadFileSplit,
   requiredSchema: HoodieTableSchema,
   tableState: HoodieTableState,
   config: Configuration)
-  extends Iterator[InternalRow] with Closeable with AvroDeserializerSupport {
+  extends CachingIterator[InternalRow] with Closeable with 
AvroDeserializerSupport {
 
   protected val maxCompactionMemoryInBytes: Long = 
getMaxCompactionMemoryInBytes(new JobConf(config))
 
@@ -78,8 +75,6 @@ class LogFileIterator(split: HoodieMergeOnReadFileSplit,
 
   protected val logFileReaderAvroSchema: Schema = new 
Schema.Parser().parse(tableSchema.avroSchemaStr)
 
-  protected var recordToLoad: InternalRow = _
-
   private val requiredSchemaSafeAvroProjection = 
SafeAvroProjection.create(logFileReaderAvroSchema, avroSchema)
 
   // TODO: now logScanner with internalSchema support column project, we may 
no need projectAvroUnsafe
@@ -107,7 +102,7 @@ class LogFileIterator(split: HoodieMergeOnReadFileSplit,
   protected def removeLogRecord(key: String): 

[hudi] branch release-feature-rfc46 updated: [Minor] fix multi deser avro payload (#7021)

2022-11-28 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a commit to branch release-feature-rfc46
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/release-feature-rfc46 by this 
push:
 new df83709f51 [Minor] fix multi deser avro payload (#7021)
df83709f51 is described below

commit df83709f51b7df32ec15e69b97fa6631f3883ce5
Author: komao 
AuthorDate: Tue Nov 29 07:54:51 2022 +0800

[Minor] fix multi deser avro payload (#7021)

In HoodieAvroRecord, we call isDelete and shouldIgnore before writing the
record to the file, and each of these methods deserializes the HoodiePayload.
So we add a deserialization method to HoodieRecord and call it once before
calling isDelete or shouldIgnore.
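
A rough sketch of the idea (names here are hypothetical, not the actual HoodieRecord
API): cache the deserialized record so that both checks reuse a single deserialization.

```
import org.apache.avro.generic.IndexedRecord

// Hypothetical helper: deserializes the payload lazily, at most once,
// no matter how many of the checks below are invoked.
final class DeserializedOnce(payloadBytes: Array[Byte],
                             deserialize: Array[Byte] => Option[IndexedRecord]) {
  private lazy val record: Option[IndexedRecord] = deserialize(payloadBytes)

  def isDelete: Boolean = record.isEmpty                  // empty payload treated as a delete
  def shouldIgnore(sentinel: IndexedRecord): Boolean = record.contains(sentinel)
}
```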

Co-authored-by: wangzixuan.wzxuan 
Co-authored-by: Alexey Kudinkin 
Co-authored-by: Alexey Kudinkin 
---
 .../org/apache/hudi/io/HoodieAppendHandle.java | 26 ++--
 .../java/org/apache/hudi/io/HoodieMergeHandle.java |  1 -
 .../hudi/commmon/model/HoodieSparkRecord.java  | 30 +-
 .../apache/hudi/common/model/BaseAvroPayload.java  | 46 +-
 .../common/model/DefaultHoodieRecordPayload.java   |  8 ++--
 .../hudi/common/model/EventTimeAvroPayload.java|  9 ++---
 .../apache/hudi/common/model/HoodieAvroRecord.java | 37 +++--
 .../org/apache/hudi/common/model/HoodieRecord.java | 17 +++-
 .../model/OverwriteWithLatestAvroPayload.java  | 24 +--
 .../common/model/PartialUpdateAvroPayload.java |  1 -
 .../apache/hudi/sink/utils/PayloadCreation.java|  2 +-
 .../main/java/org/apache/hudi/QuickstartUtils.java | 12 +++---
 .../hudi/command/payload/ExpressionPayload.scala   | 23 ---
 .../model/TestHoodieRecordSerialization.scala  |  2 +-
 14 files changed, 154 insertions(+), 84 deletions(-)

diff --git 
a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieAppendHandle.java
 
b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieAppendHandle.java
index 564d63ba77..2ef02b1dae 100644
--- 
a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieAppendHandle.java
+++ 
b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieAppendHandle.java
@@ -35,6 +35,7 @@ import org.apache.hudi.common.model.HoodieOperation;
 import org.apache.hudi.common.model.HoodiePartitionMetadata;
 import org.apache.hudi.common.model.HoodiePayloadProps;
 import org.apache.hudi.common.model.HoodieRecord;
+import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType;
 import org.apache.hudi.common.model.HoodieRecordLocation;
 import org.apache.hudi.common.model.HoodieWriteStat.RuntimeStats;
 import org.apache.hudi.common.model.IOType;
@@ -215,18 +216,16 @@ public class HoodieAppendHandle extends 
HoodieWriteHandle finalRecord = Option.empty();
-  if (!nullifyPayload && !hoodieRecord.isDelete(tableSchema, 
recordProperties)) {
-if (hoodieRecord.shouldIgnore(tableSchema, recordProperties)) {
-  return Option.of(hoodieRecord);
+  Option finalRecord = nullifyPayload ? Option.empty() : 
Option.of(hoodieRecord);
+  // Check for delete
+  if (finalRecord.isPresent() && !finalRecord.get().isDelete(tableSchema, 
recordProperties)) {
+// Check for ignore ExpressionPayload
+if (finalRecord.get().shouldIgnore(tableSchema, recordProperties)) {
+  return finalRecord;
 }
 // Convert GenericRecord to GenericRecord with hoodie commit metadata 
in schema
-HoodieRecord rewrittenRecord;
-if (schemaOnReadEnabled) {
-  rewrittenRecord = 
hoodieRecord.rewriteRecordWithNewSchema(tableSchema, recordProperties, 
writeSchemaWithMetaFields);
-} else {
-  rewrittenRecord = hoodieRecord.rewriteRecord(tableSchema, 
recordProperties, writeSchemaWithMetaFields);
-}
+HoodieRecord rewrittenRecord = schemaOnReadEnabled ? 
finalRecord.get().rewriteRecordWithNewSchema(tableSchema, recordProperties, 
writeSchemaWithMetaFields)
+: finalRecord.get().rewriteRecord(tableSchema, recordProperties, 
writeSchemaWithMetaFields);
 HoodieRecord populatedRecord = populateMetadataFields(rewrittenRecord, 
writeSchemaWithMetaFields, recordProperties);
 finalRecord = Option.of(populatedRecord);
 if (isUpdateRecord) {
@@ -236,6 +235,7 @@ public class HoodieAppendHandle extends 
HoodieWriteHandle extends 
HoodieWriteHandlehttps://issues.apache.org/jira/browse/HUDI-5249
+if (config.isMetadataColumnStatsIndexEnabled() && 
recordMerger.getRecordType() == HoodieRecordType.AVRO) {
   final List fieldsToIndex;
   // If column stats index is enabled but columns not configured then we 
assume that
   // all columns should be indexed
@@ -511,7 +513,7 @@ public class HoodieAppendHandle extends 
HoodieWriteHandle orderingVal = record.getOrderingValue(table

[hudi] branch master updated (65ce431d0c -> c5590cdd0f)

2022-11-16 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


from 65ce431d0c [HUDI-5203] Handle null fields in debezium avro payloads 
(#7193)
 add c5590cdd0f [HUDI-4812] Lazy fetching partition path & file slice for 
HoodieFileIndex (#6680)

No new revisions were added by this update.

Summary of changes:
 .../apache/hudi/keygen/BuiltinKeyGenerator.java| 178 +++---
 .../apache/hudi/keygen/ComplexKeyGenerator.java|   4 +-
 .../hudi/keygen/PartitionPathFormatterBase.java| 112 +++
 .../org/apache/hudi/keygen/SimpleKeyGenerator.java |  23 +-
 .../hudi/keygen/StringPartitionPathFormatter.java  |  72 
 .../keygen/UTF8StringPartitionPathFormatter.java   |  80 +
 .../scala/org/apache/hudi/util/JFunction.scala |   7 +-
 .../org/apache/hudi/BaseHoodieTableFileIndex.java  | 370 -
 .../apache/hudi/common/util/CollectionUtils.java   |   4 +-
 .../java/org/apache/hudi/hadoop/CachingPath.java   |  50 ++-
 .../apache/hudi/metadata/BaseTableMetadata.java|   3 +-
 .../metadata/FileSystemBackedTableMetadata.java|  25 +-
 .../hudi/metadata/HoodieBackedTableMetadata.java   |   7 +
 .../hudi/metadata/HoodieMetadataPayload.java   |   4 +-
 .../apache/hudi/metadata/HoodieTableMetadata.java  |  13 +-
 .../hudi/hadoop/HiveHoodieTableFileIndex.java  |  18 +-
 .../HoodieMergeOnReadTableInputFormat.java |   2 +-
 .../scala/org/apache/hudi/DataSourceOptions.scala  |  26 ++
 .../scala/org/apache/hudi/HoodieFileIndex.scala|  90 ++---
 .../apache/hudi/SparkHoodieTableFileIndex.scala| 262 ---
 .../apache/hudi/keygen/TestSimpleKeyGenerator.java |   5 +-
 .../org/apache/hudi/ScalaAssertionSupport.scala|   2 +-
 .../org/apache/hudi/TestHoodieFileIndex.scala  | 265 +++
 .../apache/hudi/functional/TestCOWDataSource.scala |  36 +-
 .../apache/hudi/functional/TestMORDataSource.scala |   2 +-
 .../apache/spark/sql/hudi/TestCreateTable.scala|  18 +-
 .../parquet/Spark31HoodieParquetFileFormat.scala   |   2 +-
 27 files changed, 1164 insertions(+), 516 deletions(-)
 create mode 100644 
hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/PartitionPathFormatterBase.java
 create mode 100644 
hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/StringPartitionPathFormatter.java
 create mode 100644 
hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/UTF8StringPartitionPathFormatter.java



[hudi] branch master updated (866ebf91ac -> 57961c0e37)

2022-11-16 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


from 866ebf91ac [HUDI-5227] Bump Javalin to 4.6.7 and Jetty to 9.4.48 
(#7211)
 add 57961c0e37 Use as.of.instant for IncrementalRelation (#6921)

No new revisions were added by this update.

Summary of changes:
 .../src/main/scala/org/apache/hudi/IncrementalRelation.scala   | 7 ---
 .../org/apache/hudi/functional/TestCOWDataSourceStorage.scala  | 3 +++
 2 files changed, 7 insertions(+), 3 deletions(-)



[hudi] branch master updated (3b8df4bb20 -> 6b0b03b12b)

2022-11-14 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


from 3b8df4bb20 [HUDI-5206] RowColumnReader should not return null value 
for certain null child columns (#7194)
 add 6b0b03b12b [MINOR] Make sure Dictionary Encoding in Parquet enabled by 
default (#7052)

No new revisions were added by this update.

Summary of changes:
 .../scala/org/apache/hudi/cli/SparkHelpers.scala| 11 ++-
 .../java/org/apache/hudi/io/HoodieAppendHandle.java |  8 +++-
 .../hudi/avro/TestHoodieAvroParquetWriter.java  |  2 +-
 .../hudi/testutils/HoodieWriteableTestTable.java|  2 +-
 .../storage/row/HoodieRowDataFileWriterFactory.java |  3 ++-
 .../table/log/block/HoodieParquetDataBlock.java | 21 ++---
 .../apache/hudi/io/storage/HoodieParquetConfig.java |  5 -
 .../hudi/common/functional/TestHoodieLogFormat.java |  4 ++--
 .../hudi/hadoop/testutils/InputFormatTestUtil.java  |  2 +-
 .../org/apache/spark/sql/hudi/SparkHelpers.scala| 11 ++-
 10 files changed, 48 insertions(+), 21 deletions(-)



[hudi] branch master updated (49c5fcbf49 -> a06b1c0c4e)

2022-11-10 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


from 49c5fcbf49 [HUDI-5176] Fix incremental source to consider inflight 
commits before completed commits (#7160)
 add a06b1c0c4e [MINOR] Update RFC-46 doc (#7050)

No new revisions were added by this update.

Summary of changes:
 rfc/rfc-46/rfc-46.md | 168 ---
 1 file changed, 146 insertions(+), 22 deletions(-)



[hudi] branch release-feature-rfc46 updated: [MINOR] Properly registering target classes w/ Kryo (#7026)

2022-10-27 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a commit to branch release-feature-rfc46
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/release-feature-rfc46 by this 
push:
 new 4282d890d8 [MINOR] Properly registering target classes w/ Kryo (#7026)
4282d890d8 is described below

commit 4282d890d877030d8e883ee576628962df2b629e
Author: Alexey Kudinkin 
AuthorDate: Thu Oct 27 18:03:54 2022 -0700

[MINOR] Properly registering target classes w/ Kryo (#7026)

* Added `HoodieKryoRegistrar` registering the necessary Hudi classes w/ Kryo
to make their serialization more efficient (by serializing just the class id
in lieu of the fully qualified class name)

* Redirected Kryo registration to `HoodieKryoRegistrar`

* Registered additional classes likely to be serialized by Kryo

* Updated tests

* Fixed serialization of Avro's `Utf8` to serialize just the bytes

* Added tests

* Added custom `AvroUtf8Serializer`;
Tidying up

* Extracted `HoodieCommonKryoRegistrar` to leverage in `SerializationUtils`

* `HoodieKryoRegistrar` > `HoodieSparkKryoRegistrar`;
Rebased `HoodieSparkKryoRegistrar` onto `HoodieCommonKryoRegistrar`

* `lint`

* Fixing compilation for Spark 2.x

* Disabling flaky test
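
A short wiring sketch based on the usage visible in the diff below (assumes a Spark
application building its own `SparkConf`; `HoodieSparkKryoRegistrar.register` is the
Scala counterpart of the Java call `HoodieSparkKryoRegistrar$.MODULE$.register(sparkConf)`
shown in `SparkUtil`):

```
import org.apache.spark.{HoodieSparkKryoRegistrar, SparkConf}

// Sketch only: registers Hudi's classes with Kryo so that serialized payloads
// carry a class id instead of the fully qualified class name.
val sparkConf = new SparkConf()
  .setAppName("hudi-kryo-example")
  .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")

HoodieSparkKryoRegistrar.register(sparkConf)
```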
---
 .../java/org/apache/hudi/cli/utils/SparkUtil.java  |  4 +-
 .../cli/functional/CLIFunctionalTestHarness.java   |  4 +-
 .../apache/hudi/client/SparkRDDWriteClient.java| 12 ---
 .../apache/spark/HoodieSparkKryoRegistrar.scala| 69 +
 .../java/org/apache/hudi/table/TestCleaner.java|  4 +
 .../hudi/testutils/FunctionalTestHarness.java  |  3 +-
 .../SparkClientFunctionalTestHarness.java  |  3 +-
 .../common/util/HoodieCommonKryoRegistrar.java | 89 ++
 .../hudi/common/util/SerializationUtils.java   | 39 +++---
 .../hudi/common/util/TestSerializationUtils.java   | 29 ++-
 .../quickstart/TestHoodieSparkQuickstart.java  | 14 +---
 .../model/TestHoodieRecordSerialization.scala  | 12 +--
 .../org/apache/hudi/utilities/UtilHelpers.java |  7 +-
 .../apache/hudi/utilities/TestHoodieIndexer.java   |  3 +-
 .../hudi/utilities/TestHoodieRepairTool.java   |  4 +-
 15 files changed, 245 insertions(+), 51 deletions(-)

diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/utils/SparkUtil.java 
b/hudi-cli/src/main/java/org/apache/hudi/cli/utils/SparkUtil.java
index bcccb66b37..e333a6167a 100644
--- a/hudi-cli/src/main/java/org/apache/hudi/cli/utils/SparkUtil.java
+++ b/hudi-cli/src/main/java/org/apache/hudi/cli/utils/SparkUtil.java
@@ -21,11 +21,11 @@ package org.apache.hudi.cli.utils;
 import org.apache.hudi.cli.HoodieCliSparkConfig;
 import org.apache.hudi.cli.commands.SparkEnvCommand;
 import org.apache.hudi.cli.commands.SparkMain;
-import org.apache.hudi.client.SparkRDDWriteClient;
 import org.apache.hudi.common.fs.FSUtils;
 import org.apache.hudi.common.util.Option;
 import org.apache.hudi.common.util.StringUtils;
 
+import org.apache.spark.HoodieSparkKryoRegistrar$;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.launcher.SparkLauncher;
@@ -116,7 +116,7 @@ public class SparkUtil {
   }
 
   public static JavaSparkContext initJavaSparkContext(SparkConf sparkConf) {
-SparkRDDWriteClient.registerClasses(sparkConf);
+HoodieSparkKryoRegistrar$.MODULE$.register(sparkConf);
 JavaSparkContext jsc = new JavaSparkContext(sparkConf);
 
jsc.hadoopConfiguration().setBoolean(HoodieCliSparkConfig.CLI_PARQUET_ENABLE_SUMMARY_METADATA,
 false);
 FSUtils.prepareHadoopConf(jsc.hadoopConfiguration());
diff --git 
a/hudi-cli/src/test/java/org/apache/hudi/cli/functional/CLIFunctionalTestHarness.java
 
b/hudi-cli/src/test/java/org/apache/hudi/cli/functional/CLIFunctionalTestHarness.java
index 04f77df549..6d6335ab0f 100644
--- 
a/hudi-cli/src/test/java/org/apache/hudi/cli/functional/CLIFunctionalTestHarness.java
+++ 
b/hudi-cli/src/test/java/org/apache/hudi/cli/functional/CLIFunctionalTestHarness.java
@@ -20,7 +20,6 @@
 package org.apache.hudi.cli.functional;
 
 import org.apache.hudi.client.SparkRDDReadClient;
-import org.apache.hudi.client.SparkRDDWriteClient;
 import org.apache.hudi.client.common.HoodieSparkEngineContext;
 import org.apache.hudi.common.table.view.FileSystemViewStorageConfig;
 import org.apache.hudi.testutils.HoodieClientTestUtils;
@@ -28,6 +27,7 @@ import org.apache.hudi.testutils.providers.SparkProvider;
 import org.apache.hudi.timeline.service.TimelineService;
 
 import org.apache.hadoop.conf.Configuration;
+import org.apache.spark.HoodieSparkKryoRegistrar$;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.sql.SQLContext;
@@ -100,7 +100,7 @@ public class CLIFunctionalTestHarness impleme

[hudi] branch master updated (20e8ec79c8 -> 7accc476d3)

2022-10-26 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


from 20e8ec79c8 [HUDI-5058] Fix flink catalog read spark table error : 
primary key col can not be nullable (#7009)
 add 7accc476d3 [HUDI-5087] Fix incorrect merging sequence for Column Stats 
Record in `HoodieMetadataPayload` (#7053)

No new revisions were added by this update.

Summary of changes:
 .../hudi/metadata/HoodieMetadataPayload.java   |  32 -
 .../hudi/metadata/TestHoodieMetadataPayload.java   | 153 +
 2 files changed, 182 insertions(+), 3 deletions(-)
 create mode 100644 
hudi-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataPayload.java



[hudi] 02/02: Rebased `ByteBuffer` cloning onto the new utility

2022-10-20 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a commit to branch HUDI-4971-cancel-relocation
in repository https://gitbox.apache.org/repos/asf/hudi.git

commit a3017d2773c14e1400380e92d415a183518604a0
Author: Alexey Kudinkin 
AuthorDate: Thu Oct 20 13:01:13 2022 -0700

Rebased `ByteBuffer` cloning onto the new utility
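
For context, a minimal sketch of what such a byte-extraction helper does (the actual
utility is the Java-side `BinaryUtils.toBytes` introduced in the companion commit; the
`duplicate()` call is a defensive choice of this sketch):

```
import java.nio.ByteBuffer

// Sketch only: copy the buffer's remaining bytes into a fresh array
// without disturbing the caller's position/limit.
def toBytes(buffer: ByteBuffer): Array[Byte] = {
  val view = buffer.duplicate()          // independent position/limit, shared content
  val bytes = new Array[Byte](view.remaining())
  view.get(bytes)
  bytes
}
```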
---
 .../src/main/java/org/apache/hudi/common/util/AvroOrcUtils.java | 4 ++--
 hudi-common/src/main/java/org/apache/hudi/common/util/OrcUtils.java | 5 +++--
 .../src/main/scala/org/apache/hudi/ColumnStatsIndexSupport.scala| 6 ++
 .../src/main/scala/org/apache/spark/sql/hudi/SerDeUtils.scala   | 6 ++
 4 files changed, 9 insertions(+), 12 deletions(-)

diff --git 
a/hudi-common/src/main/java/org/apache/hudi/common/util/AvroOrcUtils.java 
b/hudi-common/src/main/java/org/apache/hudi/common/util/AvroOrcUtils.java
index ca59c301c8..c83ec68976 100644
--- a/hudi-common/src/main/java/org/apache/hudi/common/util/AvroOrcUtils.java
+++ b/hudi-common/src/main/java/org/apache/hudi/common/util/AvroOrcUtils.java
@@ -54,6 +54,7 @@ import org.apache.hudi.exception.HoodieIOException;
 import org.apache.orc.TypeDescription;
 
 import static org.apache.avro.JsonProperties.NULL_VALUE;
+import static org.apache.hudi.common.util.BinaryUtils.toBytes;
 
 /**
  * Methods including addToVector, addUnionValue, createOrcSchema are 
originally from
@@ -221,8 +222,7 @@ public class AvroOrcUtils {
   binaryBytes = ((GenericData.Fixed)value).bytes();
 } else if (value instanceof ByteBuffer) {
   final ByteBuffer byteBuffer = (ByteBuffer) value;
-  binaryBytes = new byte[byteBuffer.remaining()];
-  byteBuffer.get(binaryBytes);
+  binaryBytes = toBytes(byteBuffer);
 } else if (value instanceof byte[]) {
   binaryBytes = (byte[]) value;
 } else {
diff --git 
a/hudi-common/src/main/java/org/apache/hudi/common/util/OrcUtils.java 
b/hudi-common/src/main/java/org/apache/hudi/common/util/OrcUtils.java
index 0cc4059197..4cb55f3790 100644
--- a/hudi-common/src/main/java/org/apache/hudi/common/util/OrcUtils.java
+++ b/hudi-common/src/main/java/org/apache/hudi/common/util/OrcUtils.java
@@ -52,6 +52,8 @@ import java.util.Map;
 import java.util.Set;
 import java.util.stream.Collectors;
 
+import static org.apache.hudi.common.util.BinaryUtils.toBytes;
+
 /**
  * Utility functions for ORC files.
  */
@@ -238,8 +240,7 @@ public class OrcUtils extends BaseFileUtils {
 try (Reader reader = OrcFile.createReader(orcFilePath, 
OrcFile.readerOptions(conf))) {
   if (reader.hasMetadataValue("orc.avro.schema")) {
 ByteBuffer metadataValue = reader.getMetadataValue("orc.avro.schema");
-byte[] bytes = new byte[metadataValue.remaining()];
-metadataValue.get(bytes);
+byte[] bytes = toBytes(metadataValue);
 return new Schema.Parser().parse(new String(bytes));
   } else {
 TypeDescription orcSchema = reader.getSchema();
diff --git 
a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/ColumnStatsIndexSupport.scala
 
b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/ColumnStatsIndexSupport.scala
index 58511f791e..dc413afff1 100644
--- 
a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/ColumnStatsIndexSupport.scala
+++ 
b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/ColumnStatsIndexSupport.scala
@@ -29,6 +29,7 @@ import org.apache.hudi.common.data.HoodieData
 import org.apache.hudi.common.model.HoodieRecord
 import org.apache.hudi.common.table.HoodieTableMetaClient
 import org.apache.hudi.common.table.view.FileSystemViewStorageConfig
+import org.apache.hudi.common.util.BinaryUtils.toBytes
 import org.apache.hudi.common.util.ValidationUtils.checkState
 import org.apache.hudi.common.util.collection
 import org.apache.hudi.common.util.hash.ColumnIndexID
@@ -469,10 +470,7 @@ object ColumnStatsIndexSupport {
 }
   case BinaryType =>
 value match {
-  case b: ByteBuffer =>
-val bytes = new Array[Byte](b.remaining)
-b.get(bytes)
-bytes
+  case b: ByteBuffer => toBytes(b)
   case other => other
 }
 
diff --git 
a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/SerDeUtils.scala
 
b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/SerDeUtils.scala
index 19d0a0a98b..294d282e3d 100644
--- 
a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/SerDeUtils.scala
+++ 
b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/SerDeUtils.scala
@@ -33,10 +33,8 @@ object SerDeUtils {
   }
 
   def toBytes(o: Any): Array[Byte] = {
-val bb: ByteBuffer = SERIALIZER_THREAD_LOCAL.get.serialize(o)
-val bytes = new Array[Byte](bb.capacity())
-bb.ge

[hudi] branch HUDI-4971-cancel-relocation created (now a3017d2773)

2022-10-20 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a change to branch HUDI-4971-cancel-relocation
in repository https://gitbox.apache.org/repos/asf/hudi.git


  at a3017d2773 Rebased `ByteBuffer` cloning onto the new utility

This branch includes the following new commits:

 new 82d78409a3 `BinaryUtil` > `BinaryUtils`; Added utility to extract 
bytes from `ByteBuffer`
 new a3017d2773 Rebased `ByteBuffer` cloning onto the new utility

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.




[hudi] 01/02: `BinaryUtil` > `BinaryUtils`; Added utility to extract bytes from `ByteBuffer`

2022-10-20 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a commit to branch HUDI-4971-cancel-relocation
in repository https://gitbox.apache.org/repos/asf/hudi.git

commit 82d78409a375e80de73002744be85229e1ecfc8a
Author: Alexey Kudinkin 
AuthorDate: Thu Oct 20 12:59:38 2022 -0700

`BinaryUtil` > `BinaryUtils`;
Added utility to extract bytes from `ByteBuffer`
---
 .../apache/hudi/sort/SpaceCurveSortingHelper.java  | 34 +++---
 .../spark/sql/hudi/execution/RangeSample.scala | 10 +++
 .../hudi/common/table/HoodieTableConfig.java   |  4 +--
 .../util/{BinaryUtil.java => BinaryUtils.java} | 12 +++-
 .../apache/hudi/common/util/SpillableMapUtils.java |  2 +-
 .../common/util/collection/BitCaskDiskMap.java |  2 +-
 .../{TestBinaryUtil.java => TestBinaryUtils.java}  | 22 +++---
 7 files changed, 48 insertions(+), 38 deletions(-)

diff --git 
a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/sort/SpaceCurveSortingHelper.java
 
b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/sort/SpaceCurveSortingHelper.java
index 496168e844..1ff54773c4 100644
--- 
a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/sort/SpaceCurveSortingHelper.java
+++ 
b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/sort/SpaceCurveSortingHelper.java
@@ -18,7 +18,7 @@
 
 package org.apache.hudi.sort;
 
-import org.apache.hudi.common.util.BinaryUtil;
+import org.apache.hudi.common.util.BinaryUtils;
 import org.apache.hudi.common.util.CollectionUtils;
 import org.apache.hudi.config.HoodieClusteringConfig;
 import org.apache.hudi.optimize.HilbertCurveUtils;
@@ -158,7 +158,7 @@ public class SpaceCurveSortingHelper {
 .toArray(byte[][]::new);
 
   // Interleave received bytes to produce Z-curve ordinal
-  byte[] zOrdinalBytes = BinaryUtil.interleaving(zBytes, 8);
+  byte[] zOrdinalBytes = BinaryUtils.interleaving(zBytes, 8);
   return appendToRow(row, zOrdinalBytes);
 })
   .sortBy(f -> new ByteArraySorting((byte[]) f.get(fieldNum)), true, 
fileNum);
@@ -206,30 +206,30 @@ public class SpaceCurveSortingHelper {
   @Nonnull
   private static byte[] mapColumnValueTo8Bytes(Row row, int index, DataType 
dataType) {
 if (dataType instanceof LongType) {
-  return BinaryUtil.longTo8Byte(row.isNullAt(index) ? Long.MAX_VALUE : 
row.getLong(index));
+  return BinaryUtils.longTo8Byte(row.isNullAt(index) ? Long.MAX_VALUE : 
row.getLong(index));
 } else if (dataType instanceof DoubleType) {
-  return BinaryUtil.doubleTo8Byte(row.isNullAt(index) ? Double.MAX_VALUE : 
row.getDouble(index));
+  return BinaryUtils.doubleTo8Byte(row.isNullAt(index) ? Double.MAX_VALUE 
: row.getDouble(index));
 } else if (dataType instanceof IntegerType) {
-  return BinaryUtil.intTo8Byte(row.isNullAt(index) ? Integer.MAX_VALUE : 
row.getInt(index));
+  return BinaryUtils.intTo8Byte(row.isNullAt(index) ? Integer.MAX_VALUE : 
row.getInt(index));
 } else if (dataType instanceof FloatType) {
-  return BinaryUtil.doubleTo8Byte(row.isNullAt(index) ? Float.MAX_VALUE : 
row.getFloat(index));
+  return BinaryUtils.doubleTo8Byte(row.isNullAt(index) ? Float.MAX_VALUE : 
row.getFloat(index));
 } else if (dataType instanceof StringType) {
-  return BinaryUtil.utf8To8Byte(row.isNullAt(index) ? "" : 
row.getString(index));
+  return BinaryUtils.utf8To8Byte(row.isNullAt(index) ? "" : 
row.getString(index));
 } else if (dataType instanceof DateType) {
-  return BinaryUtil.longTo8Byte(row.isNullAt(index) ? Long.MAX_VALUE : 
row.getDate(index).getTime());
+  return BinaryUtils.longTo8Byte(row.isNullAt(index) ? Long.MAX_VALUE : 
row.getDate(index).getTime());
 } else if (dataType instanceof TimestampType) {
-  return BinaryUtil.longTo8Byte(row.isNullAt(index) ? Long.MAX_VALUE : 
row.getTimestamp(index).getTime());
+  return BinaryUtils.longTo8Byte(row.isNullAt(index) ? Long.MAX_VALUE : 
row.getTimestamp(index).getTime());
 } else if (dataType instanceof ByteType) {
-  return BinaryUtil.byteTo8Byte(row.isNullAt(index) ? Byte.MAX_VALUE : 
row.getByte(index));
+  return BinaryUtils.byteTo8Byte(row.isNullAt(index) ? Byte.MAX_VALUE : 
row.getByte(index));
 } else if (dataType instanceof ShortType) {
-  return BinaryUtil.intTo8Byte(row.isNullAt(index) ? Short.MAX_VALUE : 
row.getShort(index));
+  return BinaryUtils.intTo8Byte(row.isNullAt(index) ? Short.MAX_VALUE : 
row.getShort(index));
 } else if (dataType instanceof DecimalType) {
-  return BinaryUtil.longTo8Byte(row.isNullAt(index) ? Long.MAX_VALUE : 
row.getDecimal(index).longValue());
+  return BinaryUtils.longTo8Byte(row.isNullAt(index) ? Long.MAX_VALUE : 
row.getDecimal(index).longValue());
 } else if (dataType instanceof BooleanType) {
   boolean value = row.isNullAt(index) ? false : row.getBoolean(index);
-  return Bin

[hudi] branch release-feature-rfc46 updated: [MINOR] Make sure all `HoodieRecord`s are appropriately serializable by Kryo (#6977)

2022-10-19 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a commit to branch release-feature-rfc46
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/release-feature-rfc46 by this 
push:
 new 9a1aa0a683 [MINOR] Make sure all `HoodieRecord`s are appropriately 
serializable by Kryo (#6977)
9a1aa0a683 is described below

commit 9a1aa0a6830b79761a9afa1c469772875d713aa1
Author: Alexey Kudinkin 
AuthorDate: Wed Oct 19 21:16:13 2022 -0700

[MINOR] Make sure all `HoodieRecord`s are appropriately serializable by 
Kryo (#6977)

* Make sure `HoodieRecord`, `HoodieKey`, `HoodieRecordLocation` are all 
`KryoSerializable`

* Revisited `HoodieRecord` serialization hooks to make sure they a) cannot be
overridden and b) provide hooks to properly serialize the record's payload;
implemented serialization hooks for `HoodieAvroIndexedRecord`,
`HoodieEmptyRecord` and `HoodieAvroRecord`

* Revisited `HoodieSparkRecord` to transiently hold on to the schema so 
that it could project row

* Implemented serialization hooks for `HoodieSparkRecord`

* Added `TestHoodieSparkRecord`

* Added tests for Avro-based records

* Added test for `HoodieEmptyRecord`

* Fixed sealing/unsealing for `HoodieRecord` in 
`HoodieBackedTableMetadataWriter`

* Properly handle deflated records

* Fixing `Row`s encoding

* Fixed `HoodieRecord` to be properly sealed/unsealed

* Fixed serialization of the `HoodieRecordGlobalLocation`
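
An illustrative sketch of the shape of such hooks (not the actual HoodieRecord code;
class and method names below are made up): the base class implements Kryo's read/write
once and delegates payload handling to subclass-specific hooks that cannot be bypassed.

```
import com.esotericsoftware.kryo.{Kryo, KryoSerializable}
import com.esotericsoftware.kryo.io.{Input, Output}

// Sketch only: common fields are (de)serialized by the final base-class methods,
// while payload handling is left to the concrete record types.
abstract class KryoFriendlyRecord(var key: String) extends KryoSerializable {

  protected def writePayload(kryo: Kryo, output: Output): Unit
  protected def readPayload(kryo: Kryo, input: Input): Unit

  final override def write(kryo: Kryo, output: Output): Unit = {
    output.writeString(key)       // common field handled once, in the base class
    writePayload(kryo, output)
  }

  final override def read(kryo: Kryo, input: Input): Unit = {
    key = input.readString()
    readPayload(kryo, input)
  }
}
```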
---
 .../metadata/HoodieBackedTableMetadataWriter.java  |   2 +
 .../hudi/commmon/model/HoodieSparkRecord.java  | 105 ---
 .../bulkinsert/RDDSpatialCurveSortPartitioner.java |   2 +-
 .../hudi/common/model/HoodieAvroIndexedRecord.java |  30 
 .../apache/hudi/common/model/HoodieAvroRecord.java |  16 ++
 .../hudi/common/model/HoodieEmptyRecord.java   |  34 +++-
 .../org/apache/hudi/common/model/HoodieKey.java|  23 ++-
 .../org/apache/hudi/common/model/HoodieRecord.java |  53 +-
 .../common/model/HoodieRecordGlobalLocation.java   |  23 ++-
 .../hudi/common/model/HoodieRecordLocation.java|  19 +-
 .../sink/partitioner/BucketAssignFunction.java |   7 +
 .../org/apache/hudi/HoodieSparkSqlWriter.scala |   2 +-
 .../scala/org/apache/hudi/LogFileIterator.scala|   2 +-
 .../SparkFullBootstrapDataProviderBase.java|   2 +-
 .../model/TestHoodieRecordSerialization.scala  | 195 +
 15 files changed, 473 insertions(+), 42 deletions(-)

diff --git 
a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java
 
b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java
index 962875fb92..eda28a5286 100644
--- 
a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java
+++ 
b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java
@@ -983,7 +983,9 @@ public abstract class HoodieBackedTableMetadataWriter 
implements HoodieTableMeta
   HoodieData rddSinglePartitionRecords = records.map(r -> {
 FileSlice slice = 
finalFileSlices.get(HoodieTableMetadataUtil.mapRecordKeyToFileGroupIndex(r.getRecordKey(),
 fileGroupCount));
+r.unseal();
 r.setCurrentLocation(new 
HoodieRecordLocation(slice.getBaseInstantTime(), slice.getFileId()));
+r.seal();
 return r;
   });
 
diff --git 
a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/commmon/model/HoodieSparkRecord.java
 
b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/commmon/model/HoodieSparkRecord.java
index 9cdccbe407..43000d1964 100644
--- 
a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/commmon/model/HoodieSparkRecord.java
+++ 
b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/commmon/model/HoodieSparkRecord.java
@@ -18,6 +18,10 @@
 
 package org.apache.hudi.commmon.model;
 
+import com.esotericsoftware.kryo.Kryo;
+import com.esotericsoftware.kryo.KryoSerializable;
+import com.esotericsoftware.kryo.io.Input;
+import com.esotericsoftware.kryo.io.Output;
 import org.apache.avro.Schema;
 import org.apache.hudi.HoodieInternalRowUtils;
 import org.apache.hudi.SparkAdapterSupport$;
@@ -70,9 +74,8 @@ import static org.apache.spark.sql.types.DataTypes.StringType;
  *   need to be updated (ie serving as an overlay layer on top of 
[[UnsafeRow]])
  * 
  *
-
  */
-pub

[hudi] branch asf-site updated: [DOCS] Add more blogs to Hudi website (#6973)

2022-10-18 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/asf-site by this push:
 new 2a200530be [DOCS] Add more blogs to Hudi website (#6973)
2a200530be is described below

commit 2a200530bea882502b95112066624c1e8df9942b
Author: Bhavani Sudha Saktheeswaran <2179254+bhasu...@users.noreply.github.com>
AuthorDate: Tue Oct 18 12:47:24 2022 -0700

[DOCS] Add more blogs to Hudi website (#6973)
---
 ...lding-Streaming-Data-Lakes-with-Hudi-and-MinIO.mdx} |  0
 ...09-28-Data-processing-with-Spark-time-traveling.mdx | 15 +++
 ...to-Apache-Hudi-using-AWS-Glue-and-DeltaStreamer.mdx | 18 ++
 3 files changed, 33 insertions(+)

diff --git 
a/website/blog/2022-09-20-Data-Lake-Lakehouse-Guide-Powered-by-Data-Lake-Table-Formats-Delta-Lake-Iceberg-Hudi.mdx
 b/website/blog/2022-09-20-Building-Streaming-Data-Lakes-with-Hudi-and-MinIO.mdx
similarity index 100%
rename from 
website/blog/2022-09-20-Data-Lake-Lakehouse-Guide-Powered-by-Data-Lake-Table-Formats-Delta-Lake-Iceberg-Hudi.mdx
rename to 
website/blog/2022-09-20-Building-Streaming-Data-Lakes-with-Hudi-and-MinIO.mdx
diff --git 
a/website/blog/2022-09-28-Data-processing-with-Spark-time-traveling.mdx 
b/website/blog/2022-09-28-Data-processing-with-Spark-time-traveling.mdx
new file mode 100644
index 00..cba7cb2ba9
--- /dev/null
+++ b/website/blog/2022-09-28-Data-processing-with-Spark-time-traveling.mdx
@@ -0,0 +1,15 @@
+---
+title: "Data processing with Spark: time traveling"
+authors:
+- name: Petrica Leuca
+category: blog
+image: 
/assets/images/blog/2022-09-28_Data_processing_with_Spark_time_traveling.png
+tags:
+- how-to
+- time-travel
+- devgenius
+---
+
+import Redirect from '@site/src/components/Redirect';
+
+https://blog.devgenius.io/data-processing-with-spark-time-traveling-55905f765694;>Redirecting...
 please wait!! 
diff --git 
a/website/blog/2022-10-06-Ingest-streaming-data-to-Apache-Hudi-using-AWS-Glue-and-DeltaStreamer.mdx
 
b/website/blog/2022-10-06-Ingest-streaming-data-to-Apache-Hudi-using-AWS-Glue-and-DeltaStreamer.mdx
new file mode 100644
index 00..bb923aa034
--- /dev/null
+++ 
b/website/blog/2022-10-06-Ingest-streaming-data-to-Apache-Hudi-using-AWS-Glue-and-DeltaStreamer.mdx
@@ -0,0 +1,18 @@
+---
+title: "Ingest streaming data to Apache Hudi tables using AWS Glue and Apache 
Hudi DeltaStreamer"
+authors:
+- name: Vishal Pathak
+- name: Anand Prakash
+- name: Noritaka Sekiyama
+category: blog
+image: 
/assets/images/blog/2022-10-06_Ingest_streaming_data_to_Apache_Hudi_tables_using_AWS_Glue_and_DeltaStreamer.png
+tags:
+- how-to
+- streaming ingestion
+- deltastreamer
+- amazon
+---
+
+import Redirect from '@site/src/components/Redirect';
+
+https://aws.amazon.com/blogs/big-data/ingest-streaming-data-to-apache-hudi-tables-using-aws-glue-and-apache-hudi-deltastreamer/;>Redirecting...
 please wait!! 



[hudi] branch master updated (a989d32f40 -> 42dcfc43c4)

2022-10-12 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


from a989d32f40 [MINOR] Moved readme from  .github to the workflows folder 
(#6932)
 add 42dcfc43c4 [HUDI-4952] Fixing reading from metadata table when there 
are no inflight commits (#6836)

No new revisions were added by this update.

Summary of changes:
 .../functional/TestHoodieBackedMetadata.java   | 35 ++
 .../hudi/metadata/HoodieTableMetadataUtil.java |  8 +++--
 .../scala/org/apache/hudi/HoodieFileIndex.scala| 26 ++--
 .../scala/org/apache/hudi/cdc/HoodieCDCRDD.scala   |  2 +-
 .../org/apache/hudi/TestHoodieFileIndex.scala  |  1 +
 5 files changed, 54 insertions(+), 18 deletions(-)



[hudi] 01/01: [HUDI-4905] Improve type handling in proto schema conversion

2022-10-07 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git

commit 5d2c2853ea37ca8268f0d049460bee026216
Merge: 06d924137b 7d5b9dc0a9
Author: Alexey Kudinkin 
AuthorDate: Fri Oct 7 15:32:38 2022 -0700

[HUDI-4905] Improve type handling in proto schema conversion

 hudi-utilities/pom.xml |   1 -
 .../schema/ProtoClassBasedSchemaProvider.java  |  19 +-
 .../sources/helpers/ProtoConversionUtil.java   | 242 ++---
 .../schema/TestProtoClassBasedSchemaProvider.java  |  21 +-
 .../utilities/sources/TestProtoKafkaSource.java|   2 +-
 .../sources/helpers/TestProtoConversionUtil.java   | 100 +++--
 .../schema-provider/proto/oneof_schema.avsc|  42 
 .../resources/schema-provider/proto/sample.proto   |   8 +
 ..._flattened.avsc => sample_schema_defaults.avsc} |  31 ++-
 ...le_schema_wrapped_and_timestamp_as_record.avsc} |  16 +-
 10 files changed, 357 insertions(+), 125 deletions(-)



[hudi] branch master updated (06d924137b -> 5d2c2853ea)

2022-10-07 Thread akudinkin
This is an automated email from the ASF dual-hosted git repository.

akudinkin pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


from 06d924137b [HUDI-2786] Docker demo on mac aarch64 (#6859)
 add 9c1fa14fd6 add support for unraveling proto schemas
 add 510d525e15 fix some compile issues
 add aad9ec1320 naming and style updates
 add 889927 make test data random, reuse code
 add a922a5beca add test for 2 different recursion depths, fix schema cache 
key
 add 3b37dc95d9 add unsigned long support
 add 706291d4f3 better handle other types
 add c28e874fca rebase on 4904
 add 190cc16381 get all tests working
 add f18fff886e fix oneof expected schema, update tests after rebase
 add ff5baa8706 revert scala binary change
 add 0069da2d1a try a different method to avoid avro version
 add 71a39bf488 Merge remote-tracking branch 'origin/master' into HUDI-4905
 add c5dff63375 delete unused file
 add f53d47ea3b address PR feedback, update decimal precision
 add 1831639e39 fix isNullable issue, check if class is Int64value
 add eca2992d65 checkstyle fix
 add 423da6f7bb change wrapper descriptor set initialization
 add fb2d9f0030 add in testing for unsigned long to BigInteger conversion
 add f03f9610cf shade protobuf dependency
 add 57f8b81194 Merge remote-tracking branch 'origin/master' into HUDI-4905
 add 7d5b9dc0a9 Revert "shade protobuf dependency"
 new 5d2c2853ea [HUDI-4905] Improve type handling in proto schema conversion

The 1 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 hudi-utilities/pom.xml |   1 -
 .../schema/ProtoClassBasedSchemaProvider.java  |  19 +-
 .../sources/helpers/ProtoConversionUtil.java   | 242 ++---
 .../schema/TestProtoClassBasedSchemaProvider.java  |  21 +-
 .../utilities/sources/TestProtoKafkaSource.java|   2 +-
 .../sources/helpers/TestProtoConversionUtil.java   | 100 +++--
 .../schema-provider/proto/oneof_schema.avsc|  42 
 .../resources/schema-provider/proto/sample.proto   |   8 +
 ..._flattened.avsc => sample_schema_defaults.avsc} |  31 ++-
 ...le_schema_wrapped_and_timestamp_as_record.avsc} |  16 +-
 10 files changed, 357 insertions(+), 125 deletions(-)
 create mode 100644 
hudi-utilities/src/test/resources/schema-provider/proto/oneof_schema.avsc
 rename 
hudi-utilities/src/test/resources/schema-provider/proto/{sample_schema_flattened.avsc
 => sample_schema_defaults.avsc} (92%)
 rename 
hudi-utilities/src/test/resources/schema-provider/proto/{sample_schema_nested.avsc
 => sample_schema_wrapped_and_timestamp_as_record.avsc} (95%)