[GitHub] [hudi] xiarixiaoyao commented on pull request #8026: [HUDI-5835] After performing the update operation, the hoodie table cannot be read normally by spark
xiarixiaoyao commented on PR #8026: URL: https://github.com/apache/hudi/pull/8026#issuecomment-1445023168 @hudi-bot run azure -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] simonjobs closed issue #8020: [SUPPORT] org.apache.avro.AvroTypeException: Cannot encode decimal with precision 4 as max precision 2
simonjobs closed issue #8020: [SUPPORT] org.apache.avro.AvroTypeException: Cannot encode decimal with precision 4 as max precision 2 URL: https://github.com/apache/hudi/issues/8020 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] simonjobs commented on issue #8020: [SUPPORT] org.apache.avro.AvroTypeException: Cannot encode decimal with precision 4 as max precision 2
simonjobs commented on issue #8020: URL: https://github.com/apache/hudi/issues/8020#issuecomment-1445005134 Thanks again, looking forward to the next release. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #8026: [HUDI-5835] After performing the update operation, the hoodie table cannot be read normally by spark
hudi-bot commented on PR #8026: URL: https://github.com/apache/hudi/pull/8026#issuecomment-1444996300 ## CI report: * 0b0ca829bb52ee630882391a5525ac85a69ecb7c UNKNOWN * edc3bd93fb3e6441e1bb3c6ec7cfad22495d13de Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=15393) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] nsivabalan commented on a diff in pull request #8044: [WIP] Adding standalone restore tool
nsivabalan commented on code in PR #8044: URL: https://github.com/apache/hudi/pull/8044#discussion_r1117863047 ## hudi-utilities/src/main/java/org/apache/hudi/utilities/MORRestoreTool.java: ## @@ -0,0 +1,329 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.utilities; + +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.view.FileSystemViewManager; +import org.apache.hudi.common.table.view.HoodieTableFileSystemView; +import org.apache.hudi.exception.HoodieException; + +import com.beust.jcommander.JCommander; +import com.beust.jcommander.Parameter; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; + +import java.io.IOException; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.stream.Collectors; + +/** + * + */ +public class MORRestoreTool implements Serializable { + + private static final Logger LOG = LogManager.getLogger(HoodieMetadataTableValidator.class); + + // Spark context + private transient JavaSparkContext jsc; + // config + private Config cfg; + // Properties with source, hoodie client, key generator etc. + private TypedProperties props; + + private final HoodieTableMetaClient metaClient; + + public MORRestoreTool(HoodieTableMetaClient metaClient) { +this.metaClient = metaClient; + } + + public MORRestoreTool(JavaSparkContext jsc, Config cfg) { +this.jsc = jsc; +this.cfg = cfg; + +this.props = cfg.propsFilePath == null +? 
UtilHelpers.buildProperties(cfg.configs) +: readConfigFromFileSystem(jsc, cfg); + +this.metaClient = HoodieTableMetaClient.builder() +.setConf(jsc.hadoopConfiguration()).setBasePath(cfg.basePath) +.build(); + } + + /** + * Reads config from the file system. + * + * @param jsc {@link JavaSparkContext} instance. + * @param cfg {@link Config} instance. + * @return the {@link TypedProperties} instance. + */ + private TypedProperties readConfigFromFileSystem(JavaSparkContext jsc, Config cfg) { +return UtilHelpers.readConfig(jsc.hadoopConfiguration(), new Path(cfg.propsFilePath), cfg.configs) +.getProps(true); + } + + public static class Config implements Serializable { +@Parameter(names = {"--base-path", "-sp"}, description = "Base path for the table", required = true) +public String basePath = null; + +@Parameter(names = {"--parallelism", "-pl"}, description = "Parallelism for valuation", required = false) +public int parallelism = 200; + +@Parameter(names = {"--commitTime", "-c"}, description = "Instant Time to restore to", required = true) +public String commitTime = ""; + +@Parameter(names = {"--dryRun"}, description = "Dry run without deleting any files", required = false) +public boolean dryRun = true; + +@Parameter(names = {"--spark-master", "-ms"}, description = "Spark master", required = false) +public String sparkMaster = null; + +@Parameter(names = {"--spark-memory", "-sm"}, description = "spark memory to use", required = false) +public String sparkMemory = "1g"; + +@Parameter(names = {"--assume-date-partitioning"}, description = "Should HoodieWriteClient assume the data is partitioned by dates, i.e three levels from base path." ++ "This is a stop-gap to support tables created by versions < 0.3.1. Will be removed eventually", required = false) +public Boolean assumeDatePartitioning = false; + +@Parameter(names = {"--props"}, description = "path to properties file on localfs or dfs, with configurations for " +
[GitHub] [hudi] nsivabalan opened a new pull request, #8044: [WIP] Adding standalone restore tool
nsivabalan opened a new pull request, #8044: URL: https://github.com/apache/hudi/pull/8044 ### Change Logs For MOR tables, restoring to a very old delta commit is very time consuming, since internally we roll back one commit at a time. This standalone tool takes a stab at improving the performance of restore. You can choose a delta commit just before a compaction commit, and this tool will directly delete files for newer file slices after the delta commit chosen. Caution: Metadata has to be disabled. Also, this tool takes an unconventional route of not going via rollback. This tool directly lists the files and deletes them, and also deletes the timeline files if necessary. ### Impact _Describe any public API or user-facing feature change or any performance impact._ ### Risk level (write none, low medium or high below) _If medium or high, explain what verification was done to mitigate the risks._ ### Documentation Update _Describe any necessary documentation update if there is any new feature, config, or user-facing change_ - _The config description must be updated if new configs are added or the default value of the configs are changed_ - _Any new feature or user-facing change requires updating the Hudi website. Please create a Jira ticket, attach the ticket number here and follow the [instruction](https://hudi.apache.org/contribute/developer-setup#website) to make changes to the website._ ### Contributor's checklist - [ ] Read through [contributor's guide](https://hudi.apache.org/contribute/how-to-contribute) - [ ] Change Logs and Impact were stated clearly - [ ] Adequate tests were added if applicable - [ ] CI passed -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] danny0405 commented on pull request #8029: [HUDI-5832] add relocated prefix for hbase classes in hbase-site.xml
danny0405 commented on PR #8029: URL: https://github.com/apache/hudi/pull/8029#issuecomment-1444980126 Reasonable, looks good from my side. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] danny0405 commented on a diff in pull request #7997: [HUDI-5822] Fix FileId not found exception when FileId is passed to HoodieMergeHa...
danny0405 commented on code in PR #7997: URL: https://github.com/apache/hudi/pull/7997#discussion_r1117860492 ## hudi-common/src/test/java/org/apache/hudi/common/testutils/FileCreateUtils.java: ## @@ -101,7 +101,11 @@ public static String markerFileName(String instantTime, String fileId, IOType io } public static String markerFileName(String instantTime, String fileId, IOType ioType, String fileExtension) { -return String.format("%s_%s_%s%s%s.%s", fileId, WRITE_TOKEN, instantTime, fileExtension, HoodieTableMetaClient.MARKER_EXTN, ioType); +return markerFileName(instantTime, fileId, ioType, fileExtension, WRITE_TOKEN); + } + + public static String markerFileName(String instantTime, String fileId, IOType ioType, String fileExtension, String writeToken) { +return String.format("%s_%s_%s%s%s.%s", fileId, writeToken, instantTime, fileExtension, HoodieTableMetaClient.MARKER_EXTN, ioType); Review Comment: Usually we do not add logic only for testing purpose. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] danny0405 commented on a diff in pull request #7886: [HUDI-5726] Fix timestamp field is 8 hours longer than the time
danny0405 commented on code in PR #7886: URL: https://github.com/apache/hudi/pull/7886#discussion_r1117860202 ## hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/AvroToRowDataConverters.java: ## @@ -233,7 +234,9 @@ private static AvroToRowDataConverter createTimestampConverter(int precision) { "Unexpected object type for TIMESTAMP logical type. Received: " + avroObject); } } - return TimestampData.fromInstant(instant); + Timestamp timestamp = new Timestamp(instant.toEpochMilli()); + timestamp.setNanos(instant.getNano()); + return TimestampData.fromTimestamp(timestamp); Review Comment: We should not add local time zone to this, you can just fix the tests. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] danny0405 commented on issue #2813: [SUPPORT] HoodieRealtimeRecordReader can only work on RealtimeSplit and not with hdfs://111.parquet:0+4
danny0405 commented on issue #2813: URL: https://github.com/apache/hudi/issues/2813#issuecomment-1444977868 rt table with pure log is not supported well for Hive queries, you may need to switch to ro table instead. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] danny0405 commented on issue #8025: Found commits after time :20230220161017756, please rollback greater commits first
danny0405 commented on issue #8025: URL: https://github.com/apache/hudi/issues/8025#issuecomment-1444978115 What version of Hudi did you use? This seems to be an unknown bug. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #8026: [HUDI-5835] After performing the update operation, the hoodie table cannot be read normally by spark
hudi-bot commented on PR #8026: URL: https://github.com/apache/hudi/pull/8026#issuecomment-1444971439 ## CI report: * ba042182e490e4601f0a8bda31eeb8e42251ca18 Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=15392) * 0b0ca829bb52ee630882391a5525ac85a69ecb7c UNKNOWN * edc3bd93fb3e6441e1bb3c6ec7cfad22495d13de Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=15393) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] xiarixiaoyao commented on issue #8020: [SUPPORT] org.apache.avro.AvroTypeException: Cannot encode decimal with precision 4 as max precision 2
xiarixiaoyao commented on issue #8020: URL: https://github.com/apache/hudi/issues/8020#issuecomment-1444963514 @simonjobs yes i relaxed the limit of SchemaChangeUtils.isTypeUpdateAllow in master branch , and test decimal(3, 0) -> decimal(4, 0) all is well. If you have any other questions, welcome to raise -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #8026: [HUDI-5835] After performing the update operation, the hoodie table cannot be read normally by spark
hudi-bot commented on PR #8026: URL: https://github.com/apache/hudi/pull/8026#issuecomment-1444942429 ## CI report: * ba042182e490e4601f0a8bda31eeb8e42251ca18 Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=15392) * 0b0ca829bb52ee630882391a5525ac85a69ecb7c UNKNOWN * edc3bd93fb3e6441e1bb3c6ec7cfad22495d13de UNKNOWN Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #8026: [HUDI-5835] After performing the update operation, the hoodie table cannot be read normally by spark
hudi-bot commented on PR #8026: URL: https://github.com/apache/hudi/pull/8026#issuecomment-1444934165 ## CI report: * ba042182e490e4601f0a8bda31eeb8e42251ca18 Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=15392) * 0b0ca829bb52ee630882391a5525ac85a69ecb7c UNKNOWN Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #8026: [HUDI-5835] After performing the update operation, the hoodie table cannot be read normally by spark
hudi-bot commented on PR #8026: URL: https://github.com/apache/hudi/pull/8026#issuecomment-1444924991 ## CI report: * ba042182e490e4601f0a8bda31eeb8e42251ca18 Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=15392) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] xiarixiaoyao commented on a diff in pull request #8026: [HUDI-5835] After performing the update operation, the hoodie table cannot be read normally by spark
xiarixiaoyao commented on code in PR #8026: URL: https://github.com/apache/hudi/pull/8026#discussion_r1117842078 ## hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBaseRelation.scala: ## @@ -613,8 +617,11 @@ object HoodieBaseRelation extends SparkAdapterSupport { def apply(file: PartitionedFile): Iterator[InternalRow] = read.apply(file) } - def convertToAvroSchema(structSchema: StructType): Schema = -sparkAdapter.getAvroSchemaConverters.toAvroType(structSchema, nullable = false, "Record") + def convertToAvroSchema(structSchema: StructType, tableName: String ): Schema = { +val (recordName, namespace) = AvroConversionUtils.getAvroRecordNameAndNamespace(tableName) +val avroSchema = sparkAdapter.getAvroSchemaConverters.toAvroType(structSchema, nullable = false, recordName, namespace) +getAvroSchemaWithDefaults(avroSchema, structSchema) + } Review Comment: yes, schemaConverters.toAvroType will lose the default value. see https://github.com/apache/hudi/pull/2765 In the schema evolution scenario, the default value is very important, avroSchema cares about this. eg: If we add a new column newCol: string to the table, the default value of newCol will be null; after schemaConverters.toAvroType, the default value of newCol will be lost. Now if we use this schema to read an old avro log (which does not contain column newCol), avro will complain that there is no default value, and throw an exception. https://github.com/apache/hudi/pull/7915 The root cause of this pr is that we lost the default value in the conversion process -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #8026: [HUDI-5835] After performing the update operation, the hoodie table cannot be read normally by spark
hudi-bot commented on PR #8026: URL: https://github.com/apache/hudi/pull/8026#issuecomment-1444873044 ## CI report: * 8ba4e5bf88c21d3b3734f5a07a1cfdd103c2e3c5 Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=15360) * ba042182e490e4601f0a8bda31eeb8e42251ca18 UNKNOWN Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] xiarixiaoyao commented on a diff in pull request #8026: [HUDI-5835] After performing the update operation, the hoodie table cannot be read normally by spark
xiarixiaoyao commented on code in PR #8026: URL: https://github.com/apache/hudi/pull/8026#discussion_r1117839441 ## hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBaseRelation.scala: ## @@ -155,12 +158,13 @@ abstract class HoodieBaseRelation(val sqlContext: SQLContext, } } +val avroNameAndSpace = AvroConversionUtils.getAvroRecordNameAndNamespace(tableName) Review Comment: fixed ## hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBaseRelation.scala: ## @@ -99,6 +100,8 @@ abstract class HoodieBaseRelation(val sqlContext: SQLContext, protected val sparkSession: SparkSession = sqlContext.sparkSession + protected lazy val tableName = metaClient.getTableConfig.getTableName Review Comment: fixed -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] alexeykudinkin closed pull request #4214: [WIP][HUDI-2928] Switching default Parquet's column encoding to zstd
alexeykudinkin closed pull request #4214: [WIP][HUDI-2928] Switching default Parquet's column encoding to zstd URL: https://github.com/apache/hudi/pull/4214 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] alexeykudinkin closed pull request #5319: [WIP] Adjusting `DeltaStreamer` shutdown sequence to avoid awaiting for 24h
alexeykudinkin closed pull request #5319: [WIP] Adjusting `DeltaStreamer` shutdown sequence to avoid awaiting for 24h URL: https://github.com/apache/hudi/pull/5319 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] alexeykudinkin commented on a diff in pull request #7987: [HUDI-5514] Record Keys Auto-gen Prototype
alexeykudinkin commented on code in PR #7987: URL: https://github.com/apache/hudi/pull/7987#discussion_r1117812991 ## hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/RecordKeyAutoGen.scala: ## @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi + +import org.apache.hudi.common.config.HoodieConfig +import org.apache.hudi.common.model.{HoodiePayloadProps, HoodieRecord, WriteOperationType} +import org.apache.hudi.common.table.HoodieTableConfig +import org.apache.hudi.config.HoodieWriteConfig +import org.apache.hudi.exception.HoodieException +import org.apache.hudi.keygen.constant.KeyGeneratorOptions +import org.apache.spark.sql.{Column, DataFrame} +import org.apache.spark.sql.catalyst.expressions.AutoRecordKeyGenExpression + +import scala.collection.mutable +import scala.jdk.CollectionConverters.mapAsScalaMapConverter + +object RecordKeyAutoGen { + + /** + * Set of operations supporting record-key auto-gen (currently only [[WriteOperationType.INSERT]], + * [[WriteOperationType.BULK_INSERT]]) + */ + private val supportedOperations: Set[String] = +Set(WriteOperationType.INSERT, WriteOperationType.BULK_INSERT).map(_.value) + + /** + * Set of operations compatible w/ record-key auto-gen (additionally to [[supportedOperations]] + * [[WriteOperationType.DELETE]] is a compatible operation) + */ + private val compatibleOperations: Set[String] = supportedOperations ++ +Set(WriteOperationType.DELETE).map(_.value) + + def tryRecordKeyAutoGen(df: DataFrame, commitInstant: String, config: HoodieConfig): DataFrame = { +val shouldAutoGenRecordKeys = config.getBooleanOrDefault(HoodieTableConfig.AUTO_GEN_RECORD_KEYS) +val operation = config.getStringOrDefault(DataSourceWriteOptions.OPERATION) + +if (shouldAutoGenRecordKeys && supportedOperations.contains(operation)) { + // TODO reorder to keep all meta-fields as first? + df.withColumn(HoodieRecord.AUTOGEN_ROW_KEY, new Column(AutoRecordKeyGenExpression(commitInstant))) Review Comment: It's a meta-field rather than a data-field, but you're bringing up a good point, we'd have 3 fields which'd look mostly identical (`_hoodie_record_key`, `_hoodie_seq_no` and the new one). We should definitely try to optimize it. 
-- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] alexeykudinkin commented on a diff in pull request #8026: [HUDI-5835] After performing the update operation, the hoodie table cannot be read normally by spark
alexeykudinkin commented on code in PR #8026: URL: https://github.com/apache/hudi/pull/8026#discussion_r1117800390 ## hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBaseRelation.scala: ## @@ -155,12 +158,13 @@ abstract class HoodieBaseRelation(val sqlContext: SQLContext, } } +val avroNameAndSpace = AvroConversionUtils.getAvroRecordNameAndNamespace(tableName) Review Comment: nit: we can move this inside the map and also make it `val (name, namespace) = getAvroRecordNameAndNamespace(...)` ## hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBaseRelation.scala: ## @@ -99,6 +100,8 @@ abstract class HoodieBaseRelation(val sqlContext: SQLContext, protected val sparkSession: SparkSession = sqlContext.sparkSession + protected lazy val tableName = metaClient.getTableConfig.getTableName Review Comment: nit: better make this a method so that we don't carry this field additional when serializing ## hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBaseRelation.scala: ## @@ -613,8 +617,11 @@ object HoodieBaseRelation extends SparkAdapterSupport { def apply(file: PartitionedFile): Iterator[InternalRow] = read.apply(file) } - def convertToAvroSchema(structSchema: StructType): Schema = -sparkAdapter.getAvroSchemaConverters.toAvroType(structSchema, nullable = false, "Record") + def convertToAvroSchema(structSchema: StructType, tableName: String ): Schema = { +val (recordName, namespace) = AvroConversionUtils.getAvroRecordNameAndNamespace(tableName) +val avroSchema = sparkAdapter.getAvroSchemaConverters.toAvroType(structSchema, nullable = false, recordName, namespace) +getAvroSchemaWithDefaults(avroSchema, structSchema) + } Review Comment: @xiarixiaoyao i still don't understand why we need to set defaults in the schema. Can you please elaborate on that one? -- This is an automated message from the Apache Git Service. 
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #8043: [HUDI-5843] multiwriter deltastreamer checkpoints
hudi-bot commented on PR #8043: URL: https://github.com/apache/hudi/pull/8043#issuecomment-1444585397 ## CI report: * dbbd03b207d3108988f6c2997f6a3504f39f265d Azure: [SUCCESS](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=15391) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] nsivabalan commented on a diff in pull request #8043: [HUDI-5843] multiwriter deltastreamer checkpoints
nsivabalan commented on code in PR #8043: URL: https://github.com/apache/hudi/pull/8043#discussion_r1117688720 ## hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java: ## @@ -698,8 +717,13 @@ private Pair, JavaRDD> writeToSink(JavaRDD 0; if (!hasErrors || cfg.commitOnErrors) { HashMap checkpointCommitMetadata = new HashMap<>(); + Option> extraPreCommitFunc = Option.empty(); if (checkpointStr != null) { -checkpointCommitMetadata.put(CHECKPOINT_KEY, checkpointStr); +if (identifier.isPresent()) { + extraPreCommitFunc = Option.of(new HoodieDeltaStreamerMultiwriterCheckpoint(this, checkpointStr, latestCheckpointWritten)); Review Comment: fetching of latestCheckpoint should happen within the extraPreCommitFunc i.e. within the lock. ## hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieDeltaStreamerMultiwriterCheckpoint.java: ## @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.utilities.deltastreamer; + +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieIOException; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.function.BiConsumer; + +import static org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer.CHECKPOINT_KEY; + +/** + * This is used as an extraPreCommitFunc in BaseHoodieWriteClient + * It adds the checkpoint to deltacommit metadata. It must be implemented this way + * because it needs the lock to ensure that it does not overwrite another deltastreamers + * latest checkpoint with an older one. + */ +public class HoodieDeltaStreamerMultiwriterCheckpoint implements BiConsumer { Review Comment: nit: can rename to "DeltastreamerMultiWriterCkptUpdateFunc" ## hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieDeltaStreamerMultiwriterCheckpoint.java: ## @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.utilities.deltastreamer; + +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieIOException; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.function.BiConsumer; + +import static org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer.CHECKPOINT_KEY; + +/** + * This is used as an extraPreCommitFunc in BaseHoodieWriteClient + * It adds the checkpoint to deltacommit metadata. It must be implemented this way + * because it needs the lock to ensure that it does not overwrite another deltastreamers + * latest checkpoint with an older one. + */ +public class
[hudi] branch asf-site updated: [HUDI-5833] Add 0.13.0 release notes (#8022)
This is an automated email from the ASF dual-hosted git repository. yihua pushed a commit to branch asf-site in repository https://gitbox.apache.org/repos/asf/hudi.git The following commit(s) were added to refs/heads/asf-site by this push: new c0a5b21a35d [HUDI-5833] Add 0.13.0 release notes (#8022) c0a5b21a35d is described below commit c0a5b21a35d4d1fb05b54c699565c0455e622dde Author: Y Ethan Guo AuthorDate: Fri Feb 24 13:29:39 2023 -0800 [HUDI-5833] Add 0.13.0 release notes (#8022) Adds the 0.13.0 release notes and download links for 0.13.0. --- website/docusaurus.config.js | 6 +- website/releases/download.md | 4 + website/releases/older-releases.md | 2 +- website/releases/release-0.10.0.md | 2 +- website/releases/release-0.10.1.md | 2 +- website/releases/release-0.11.0.md | 2 +- website/releases/release-0.11.1.md | 2 +- website/releases/release-0.12.0.md | 2 +- website/releases/release-0.12.1.md | 2 +- website/releases/release-0.12.2.md | 2 +- website/releases/release-0.13.0.md | 506 + website/releases/release-0.7.0.md | 2 +- website/releases/release-0.8.0.md | 2 +- website/releases/release-0.9.0.md | 2 +- website/src/components/HomepageHeader/index.js | 2 +- 15 files changed, 525 insertions(+), 15 deletions(-) diff --git a/website/docusaurus.config.js b/website/docusaurus.config.js index 94cd3d02bad..074e887a023 100644 --- a/website/docusaurus.config.js +++ b/website/docusaurus.config.js @@ -112,11 +112,11 @@ module.exports = { }, { from: ['/docs/releases', '/docs/next/releases'], -to: '/releases/release-0.12.2', +to: '/releases/release-0.13.0', }, { from: ['/releases'], -to: '/releases/release-0.12.2', +to: '/releases/release-0.13.0', }, ], }, @@ -281,7 +281,7 @@ module.exports = { }, { label: 'Releases', - to: '/releases/release-0.12.2', + to: '/releases/release-0.13.0', }, { label: 'Download', diff --git a/website/releases/download.md b/website/releases/download.md index e7ceb1d5c56..12b9f614d94 100644 --- a/website/releases/download.md +++ 
b/website/releases/download.md @@ -6,6 +6,10 @@ toc: true last_modified_at: 2022-12-27T15:59:57-04:00 --- +### Release 0.13.0 +* Source Release : [Apache Hudi 0.13.0 Source Release](https://www.apache.org/dyn/closer.lua/hudi/0.13.0/hudi-0.13.0.src.tgz) ([asc](https://downloads.apache.org/hudi/0.13.0/hudi-0.13.0.src.tgz.asc), [sha512](https://downloads.apache.org/hudi/0.13.0/hudi-0.13.0.src.tgz.sha512)) +* Release Note : ([Release Note for Apache Hudi 0.13.0](/releases/release-0.13.0)) + ### Release 0.12.2 * [Long Term Support](/releases/release-0.12.2#long-term-support): this is the latest stable release * Source Release : [Apache Hudi 0.12.2 Source Release](https://www.apache.org/dyn/closer.lua/hudi/0.12.2/hudi-0.12.2.src.tgz) ([asc](https://downloads.apache.org/hudi/0.12.2/hudi-0.12.2.src.tgz.asc), [sha512](https://downloads.apache.org/hudi/0.12.2/hudi-0.12.2.src.tgz.sha512)) diff --git a/website/releases/older-releases.md b/website/releases/older-releases.md index 3147cb7c645..9822d6e6510 100644 --- a/website/releases/older-releases.md +++ b/website/releases/older-releases.md @@ -1,6 +1,6 @@ --- title: "Older Releases" -sidebar_position: 12 +sidebar_position: 13 layout: releases toc: true last_modified_at: 2020-05-28T08:40:00-07:00 diff --git a/website/releases/release-0.10.0.md b/website/releases/release-0.10.0.md index 35223c5e526..45fbf4593f0 100644 --- a/website/releases/release-0.10.0.md +++ b/website/releases/release-0.10.0.md @@ -1,6 +1,6 @@ --- title: "Release 0.10.0" -sidebar_position: 8 +sidebar_position: 9 layout: releases toc: true last_modified_at: 2021-12-10T22:07:00+08:00 diff --git a/website/releases/release-0.10.1.md b/website/releases/release-0.10.1.md index be4bc237091..04e9f88f53f 100644 --- a/website/releases/release-0.10.1.md +++ b/website/releases/release-0.10.1.md @@ -1,6 +1,6 @@ --- title: "Release 0.10.1" -sidebar_position: 7 +sidebar_position: 8 layout: releases toc: true last_modified_at: 2022-01-27T22:07:00+08:00 diff --git 
a/website/releases/release-0.11.0.md b/website/releases/release-0.11.0.md index 312722f5846..7e7a4ef748b 100644 --- a/website/releases/release-0.11.0.md +++ b/website/releases/release-0.11.0.md @@ -1,6 +1,6 @@ --- title: "Release 0.11.0" -sidebar_position: 6 +sidebar_position: 7 layout: releases toc: true last_modified_at: 2022-01-27T22:07:00+08:00 diff --git a/website/releases/release-0.11.1.md b/website/releases/release-0.11.1.md index
[GitHub] [hudi] yihua merged pull request #8022: [HUDI-5833] Add 0.13.0 release notes
yihua merged PR #8022: URL: https://github.com/apache/hudi/pull/8022 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] yihua commented on a diff in pull request #8022: [HUDI-5833] Add 0.13.0 release notes
yihua commented on code in PR #8022: URL: https://github.com/apache/hudi/pull/8022#discussion_r1117629977 ## website/releases/release-0.13.0.md: ## @@ -0,0 +1,506 @@ +--- +title: "Release 0.13.0" +sidebar_position: 2 +layout: releases +toc: true +last_modified_at: 2022-02-22T13:00:00-08:00 +--- +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# [Release 0.13.0](https://github.com/apache/hudi/releases/tag/release-0.13.0) ([docs](/docs/quick-start-guide)) + +Apache Hudi 0.13.0 release introduces a number of new features including [Metaserver](#metaserver), +[Change Data Capture](#change-data-capture), [new Record Merge API](#optimizing-record-payload-handling), +[new sources for Deltastreamer](#new-source-support-in-deltastreamer) and more. While there is no table version upgrade +required for this release, users are expected to take actions by following the [Migration Guide](#migration-guide-overview) +down below on relevant [breaking changes](#migration-guide-breaking-changes) and +[behavior changes](#migration-guide-behavior-changes) before using 0.13.0 release. + +## Migration Guide: Overview + +This release keeps the same table version (`5`) as [0.12.0 release](/releases/release-0.12.0), and there is no need for +a table version upgrade if you are upgrading from 0.12.0. There are a few +[breaking changes](#migration-guide-breaking-changes) and [behavior changes](#migration-guide-behavior-changes) as +described below, and users are expected to take action accordingly before using 0.13.0 release. + +:::caution +If migrating from an older release (pre 0.12.0), please also check the upgrade instructions from each older release in +sequence. +::: + +## Migration Guide: Breaking Changes + +### Bundle Updates + + Spark bundle Support + +From now on, [`hudi-spark3.2-bundle`](https://mvnrepository.com/artifact/org.apache.hudi/hudi-spark3.2-bundle) works +with Apache Spark 3.2.1 and newer versions for Spark 3.2.x. 
The support for Spark 3.2.0 with +[`hudi-spark3.2-bundle`](https://mvnrepository.com/artifact/org.apache.hudi/hudi-spark3.2-bundle) is +dropped because of the Spark implementation change of `getHive` method of `HiveClientImpl` which is incompatible between +Spark version 3.2.0 and 3.2.1. + + Utilities Bundle Change + +The AWS and GCP bundle jars are separated from +[`hudi-utilities-bundle`](https://mvnrepository.com/artifact/org.apache.hudi/hudi-utilities-bundle). The user would need +to use [**`hudi-aws-bundle`**](https://mvnrepository.com/artifact/org.apache.hudi/hudi-aws-bundle) or +[**`hudi-gcp-bundle`**](https://mvnrepository.com/artifact/org.apache.hudi/hudi-gcp-bundle) along with +[`hudi-utilities-bundle`](https://mvnrepository.com/artifact/org.apache.hudi/hudi-utilities-bundle) while using the +cloud services. + + New Flink Bundle + +Hudi is now supported on Flink 1.16.x with the new +[`hudi-flink1.16-bundle`](https://mvnrepository.com/artifact/org.apache.hudi/hudi-flink1.16-bundle). + +### Lazy File Index in Spark + +Hudi's File Index in Spark is switched to be listed lazily ***by default***: this entails that it would **only** be listing +partitions that are requested by the query (i.e., after partition-pruning) as opposed to always listing the whole table +before this release. This is expected to bring considerable performance improvement for large tables. + +A new configuration property is added if the user wants to change the listing behavior: +`hoodie.datasource.read.file.index.listing.mode` (now default to **`lazy`**). There are two possible values that you can +set: + +- **`eager`**: This lists all partition paths and corresponding file slices within them eagerly, during initialization. +This is the default behavior prior 0.13.0. + - If a Hudi table has 1000 partitions, the eager mode lists the files under all of them when constructing the file index. 
+ +- **`lazy`**: The partitions and file-slices within them will be listed lazily, allowing partition pruning predicates to +be pushed down appropriately, therefore only listing partitions after these have already been pruned. + - The files are not listed under the partitions when the File Index is initialized. The files are listed only under +targeted partition(s) after partition pruning using predicates (e.g., `datestr=2023-02-19`) in queries. + +:::tip +To preserve the behavior pre 0.13.0, the user needs to set `hoodie.datasource.read.file.index.listing.mode=eager`. +::: + +:::danger Breaking Change +The **breaking change** occurs only in cases when the table has **BOTH**: multiple partition columns AND partition +values contain slashes that are not URL-encoded. +::: + +For example let's assume we want to parse two partition columns - `month` (`2022/01`) and `day` (`03`), from the +partition path `2022/01/03`. Since there is a mismatch between the number of partition columns (2 here - `month` and +`day`) and the number of components
[GitHub] [hudi] soumilshah1995 commented on issue #8033: [SUPPORT] Hudi to support Change-Data-Capture RFC 51 |
soumilshah1995 commented on issue #8033: URL: https://github.com/apache/hudi/issues/8033#issuecomment-120010 @xushiyan @yihu -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #8043: [HUDI-5843] multiwriter deltastreamer checkpoints
hudi-bot commented on PR #8043: URL: https://github.com/apache/hudi/pull/8043#issuecomment-1444389392 ## CI report: * dbbd03b207d3108988f6c2997f6a3504f39f265d Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=15391) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #8043: [HUDI-5843] multiwriter deltastreamer checkpoints
hudi-bot commented on PR #8043: URL: https://github.com/apache/hudi/pull/8043#issuecomment-1444316323 ## CI report: * dbbd03b207d3108988f6c2997f6a3504f39f265d UNKNOWN Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] bhasudha commented on a diff in pull request #8022: [HUDI-5833] Add 0.13.0 release notes
bhasudha commented on code in PR #8022: URL: https://github.com/apache/hudi/pull/8022#discussion_r1117522555 ## website/releases/release-0.13.0.md: ## @@ -0,0 +1,506 @@ +--- +title: "Release 0.13.0" +sidebar_position: 2 +layout: releases +toc: true +last_modified_at: 2022-02-22T13:00:00-08:00 +--- +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# [Release 0.13.0](https://github.com/apache/hudi/releases/tag/release-0.13.0) ([docs](/docs/quick-start-guide)) + +Apache Hudi 0.13.0 release introduces a number of new features including [Metaserver](#metaserver), +[Change Data Capture](#change-data-capture), [new Record Merge API](#optimizing-record-payload-handling), +[new sources for Deltastreamer](#new-source-support-in-deltastreamer) and more. While there is no table version upgrade +required for this release, users are expected to take actions by following the [Migration Guide](#migration-guide-overview) +down below on relevant [breaking changes](#migration-guide-breaking-changes) and +[behavior changes](#migration-guide-behavior-changes) before using 0.13.0 release. + +## Migration Guide: Overview + +This release keeps the same table version (`5`) as [0.12.0 release](/releases/release-0.12.0), and there is no need for +a table version upgrade if you are upgrading from 0.12.0. There are a few +[breaking changes](#migration-guide-breaking-changes) and [behavior changes](#migration-guide-behavior-changes) as +described below, and users are expected to take action accordingly before using 0.13.0 release. + +:::caution +If migrating from an older release (pre 0.12.0), please also check the upgrade instructions from each older release in +sequence. +::: + +## Migration Guide: Breaking Changes + +### Bundle Updates + + Spark bundle Support + +From now on, [`hudi-spark3.2-bundle`](https://mvnrepository.com/artifact/org.apache.hudi/hudi-spark3.2-bundle) works +with Apache Spark 3.2.1 and newer versions for Spark 3.2.x. 
The support for Spark 3.2.0 with +[`hudi-spark3.2-bundle`](https://mvnrepository.com/artifact/org.apache.hudi/hudi-spark3.2-bundle) is +dropped because of the Spark implementation change of `getHive` method of `HiveClientImpl` which is incompatible between +Spark version 3.2.0 and 3.2.1. + + Utilities Bundle Change + +The AWS and GCP bundle jars are separated from +[`hudi-utilities-bundle`](https://mvnrepository.com/artifact/org.apache.hudi/hudi-utilities-bundle). The user would need +to use [**`hudi-aws-bundle`**](https://mvnrepository.com/artifact/org.apache.hudi/hudi-aws-bundle) or +[**`hudi-gcp-bundle`**](https://mvnrepository.com/artifact/org.apache.hudi/hudi-gcp-bundle) along with +[`hudi-utilities-bundle`](https://mvnrepository.com/artifact/org.apache.hudi/hudi-utilities-bundle) while using the +cloud services. + + New Flink Bundle + +Hudi is now supported on Flink 1.16.x with the new +[`hudi-flink1.16-bundle`](https://mvnrepository.com/artifact/org.apache.hudi/hudi-flink1.16-bundle). + +### Lazy File Index in Spark + +Hudi's File Index in Spark is switched to be listed lazily ***by default***: this entails that it would **only** be listing +partitions that are requested by the query (i.e., after partition-pruning) as opposed to always listing the whole table +before this release. This is expected to bring considerable performance improvement for large tables. + +A new configuration property is added if the user wants to change the listing behavior: +`hoodie.datasource.read.file.index.listing.mode` (now default to **`lazy`**). There are two possible values that you can +set: + +- **`eager`**: This lists all partition paths and corresponding file slices within them eagerly, during initialization. +This is the default behavior prior 0.13.0. + - If a Hudi table has 1000 partitions, the eager mode lists the files under all of them when constructing the file index. 
+ +- **`lazy`**: The partitions and file-slices within them will be listed lazily, allowing partition pruning predicates to +be pushed down appropriately, therefore only listing partitions after these have already been pruned. + - The files are not listed under the partitions when the File Index is initialized. The files are listed only under +targeted partition(s) after partition pruning using predicates (e.g., `datestr=2023-02-19`) in queries. + +:::tip +To preserve the behavior pre 0.13.0, the user needs to set `hoodie.datasource.read.file.index.listing.mode=eager`. +::: + +:::danger Breaking Change +The **breaking change** occurs only in cases when the table has **BOTH**: multiple partition columns AND partition +values contain slashes that are not URL-encoded. +::: + +For example let's assume we want to parse two partition columns - `month` (`2022/01`) and `day` (`03`), from the +partition path `2022/01/03`. Since there is a mismatch between the number of partition columns (2 here - `month` and +`day`) and the number of
[jira] [Updated] (HUDI-5843) Multiwriter Checkpoints for deltastreamer
[ https://issues.apache.org/jira/browse/HUDI-5843?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] ASF GitHub Bot updated HUDI-5843: - Labels: pull-request-available (was: ) > Multiwriter Checkpoints for deltastreamer > - > > Key: HUDI-5843 > URL: https://issues.apache.org/jira/browse/HUDI-5843 > Project: Apache Hudi > Issue Type: Improvement > Components: deltastreamer >Reporter: Jonathan Vexler >Assignee: Jonathan Vexler >Priority: Major > Labels: pull-request-available > > Give each deltastreamer an identifier. Map identifiers to checkpoints so that > we can run multiple delta streamers at the same time. -- This message was sent by Atlassian Jira (v8.20.10#820010)
[GitHub] [hudi] jonvex opened a new pull request, #8043: [HUDI-5843] multiwriter deltastreamer checkpoints
jonvex opened a new pull request, #8043: URL: https://github.com/apache/hudi/pull/8043 ### Change Logs Add new config "hoodie.deltastreamer.multiwriter.source.checkpoint.id". When this is set, multiwriter checkpoints are enabled for deltastreamer. Each deltastreamer instance should use a unique id. ### Impact Allows writing from multiple sources to one table with deltastreamer. ### Risk level (write none, low, medium or high below) low ### Documentation Update Needs to be added to the change logs; may also need a section in the deltastreamer page. ### Contributor's checklist - [ ] Read through [contributor's guide](https://hudi.apache.org/contribute/how-to-contribute) - [ ] Change Logs and Impact were stated clearly - [ ] Adequate tests were added if applicable - [ ] CI passed -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #7998: [HUDI-5824] Fix: do not combine if write operation is Upsert and COMBINE_BEFORE_UPSERT is false
hudi-bot commented on PR #7998: URL: https://github.com/apache/hudi/pull/7998#issuecomment-1444290033 ## CI report: * 27d61f01fb6709e3aaa08de9ace7738dbedffb24 UNKNOWN * cde8d4ffa1cae261731d94c2a0117ece6473a882 Azure: [SUCCESS](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=15389) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] nsivabalan commented on a diff in pull request #7987: [HUDI-5514] Record Keys Auto-gen Prototype
nsivabalan commented on code in PR #7987: URL: https://github.com/apache/hudi/pull/7987#discussion_r1117448184 ## hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/RecordKeyAutoGen.scala: ## @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi + +import org.apache.hudi.common.config.HoodieConfig +import org.apache.hudi.common.model.{HoodiePayloadProps, HoodieRecord, WriteOperationType} +import org.apache.hudi.common.table.HoodieTableConfig +import org.apache.hudi.config.HoodieWriteConfig +import org.apache.hudi.exception.HoodieException +import org.apache.hudi.keygen.constant.KeyGeneratorOptions +import org.apache.spark.sql.{Column, DataFrame} +import org.apache.spark.sql.catalyst.expressions.AutoRecordKeyGenExpression + +import scala.collection.mutable +import scala.jdk.CollectionConverters.mapAsScalaMapConverter + +object RecordKeyAutoGen { + + /** + * Set of operations supporting record-key auto-gen (currently only [[WriteOperationType.INSERT]], + * [[WriteOperationType.BULK_INSERT]]) + */ + private val supportedOperations: Set[String] = +Set(WriteOperationType.INSERT, WriteOperationType.BULK_INSERT).map(_.value) + + /** + * Set of operations compatible w/ record-key auto-gen (additionally to [[supportedOperations]] + * [[WriteOperationType.DELETE]] is a compatible operation) + */ + private val compatibleOperations: Set[String] = supportedOperations ++ +Set(WriteOperationType.DELETE).map(_.value) + + def tryRecordKeyAutoGen(df: DataFrame, commitInstant: String, config: HoodieConfig): DataFrame = { +val shouldAutoGenRecordKeys = config.getBooleanOrDefault(HoodieTableConfig.AUTO_GEN_RECORD_KEYS) +val operation = config.getStringOrDefault(DataSourceWriteOptions.OPERATION) + +if (shouldAutoGenRecordKeys && supportedOperations.contains(operation)) { + // TODO reorder to keep all meta-fields as first? + df.withColumn(HoodieRecord.AUTOGEN_ROW_KEY, new Column(AutoRecordKeyGenExpression(commitInstant))) Review Comment: why do we need to add a new column? can't we keep it in memory(HoodieKey) and add it to our meta field (_hoodie_record_key) only? trying not to change the schema of table irrespective of whether auto gen is enabled or not. 
## hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala: ## @@ -537,6 +544,21 @@ object HoodieSparkSqlWriter { fullPartitions.distinct } + def handleRecordKeyAutoGen(df: DataFrame, commitInstant: String, config: HoodieConfig): DataFrame = { +if (config.getBooleanOrDefault(HoodieTableConfig.AUTO_GEN_RECORD_KEYS)) { + val monotonicIdFormat = "#" * 19 + val rowKeyExpr = Concat(Seq( +Literal(s"${commitInstant}_"), +FormatNumber(MonotonicallyIncreasingID(), Literal(monotonicIdFormat)) + )) + + // TODO reorder? + df.withColumn(HoodieRecord.AUTOGEN_ROW_KEY, new Column(rowKeyExpr)) Review Comment: I don't think we can add a new data field. We already have a meta field for holding the record key (_hoodie_record_key). We should try to hold the auto generated record key in memory (HoodieKey.recordkey) and let the writer write it to meta fields within write Handle. Or we should drop the newly added data field (HoodieRecord.AUTOGEN_ROW_KEY) later. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #7997: [HUDI-5822] Fix FileId not found exception when FileId is passed to HoodieMergeHa...
hudi-bot commented on PR #7997: URL: https://github.com/apache/hudi/pull/7997#issuecomment-1444088547 ## CI report: * 4ea65336bf55d988e388f7301a0cad9f42bd7b9b Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=15311) Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=15388) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] alexeykudinkin commented on a diff in pull request #7978: [HUDI-5812] Optimize the data size check in HoodieBaseParquetWriter
alexeykudinkin commented on code in PR #7978: URL: https://github.com/apache/hudi/pull/7978#discussion_r1117349049 ## hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieBaseParquetWriter.java: ## @@ -36,11 +36,10 @@ */ public abstract class HoodieBaseParquetWriter extends ParquetWriter { - private static final int WRITTEN_RECORDS_THRESHOLD_FOR_FILE_SIZE_CHECK = 1000; - + private final HoodieParquetConfig> parquetConfig; private final AtomicLong writtenRecordCount = new AtomicLong(0); private final long maxFileSize; - private long lastCachedDataSize = -1; + private long recordNumForNextCheck; Review Comment: nit: `recordCountForNextSizeCheck` ## hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieParquetConfig.java: ## @@ -34,6 +37,8 @@ private final Configuration hadoopConf; private final double compressionRatio; private final boolean dictionaryEnabled; + private final long minRowCountForSizeCheck = DEFAULT_MINIMUM_RECORD_COUNT_FOR_CHECK; Review Comment: @boneanxs since there's already Parquet config for it, let's just re-use that (i was advising against introducing new Hudi config dedicated for it, but we should actually reuse existing Parquet's one) ## hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieBaseParquetWriter.java: ## @@ -56,23 +55,35 @@ public HoodieBaseParquetWriter(Path file, DEFAULT_WRITER_VERSION, FSUtils.registerFileSystem(file, parquetConfig.getHadoopConf())); +this.parquetConfig = parquetConfig; // We cannot accurately measure the snappy compressed output file size. 
We are choosing a // conservative 10% // TODO - compute this compression ratio dynamically by looking at the bytes written to the // stream and the actual file size reported by HDFS this.maxFileSize = parquetConfig.getMaxFileSize() + Math.round(parquetConfig.getMaxFileSize() * parquetConfig.getCompressionRatio()); +this.recordNumForNextCheck = parquetConfig.getMinRowCountForSizeCheck(); } public boolean canWrite() { -// TODO we can actually do evaluation more accurately: -// if we cache last data size check, since we account for how many records -// were written we can accurately project avg record size, and therefore -// estimate how many more records we can write before cut off -if (lastCachedDataSize == -1 || getWrittenRecordCount() % WRITTEN_RECORDS_THRESHOLD_FOR_FILE_SIZE_CHECK == 0) { - lastCachedDataSize = getDataSize(); +if (getWrittenRecordCount() >= recordNumForNextCheck) { + long dataSize = getDataSize(); + // In some very extreme cases, like all records are same value, then it's possible + // the dataSize is much lower than the writtenRecordCount(high compression ratio), + // causing avgRecordSize to 0, we'll force the avgRecordSize to 1 for such cases. + long avgRecordSize = Math.max(dataSize / getWrittenRecordCount(), 1); + // Follow the parquet block size check logic here, return false + // if it is within ~2 records of the limit + if (dataSize > (maxFileSize - avgRecordSize * 2)) { +return false; + } + recordNumForNextCheck = Math.min( Review Comment: Let's simplify this formula to make it more easily digestible: ``` writtenCount + Math.min(Math.max((maxFileSize / avgRecordSize - writtenCount) / 2, minCountForCheck), maxCountForCheck) ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #7680: [HUDI-5548] spark sql show | update hudi's table properties
hudi-bot commented on PR #7680: URL: https://github.com/apache/hudi/pull/7680#issuecomment-1444052034 ## CI report: * bde72fbb0e1f68dbd4d73954a204ae52e84f5b4d Azure: [SUCCESS](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=15387) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[jira] [Closed] (HUDI-5656) Metadata Bootstrap flow resulting in NPE
[ https://issues.apache.org/jira/browse/HUDI-5656?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Alexey Kudinkin closed HUDI-5656. - Resolution: Fixed > Metadata Bootstrap flow resulting in NPE > > > Key: HUDI-5656 > URL: https://issues.apache.org/jira/browse/HUDI-5656 > Project: Apache Hudi > Issue Type: Bug > Components: bootstrap >Affects Versions: 0.13.0 >Reporter: Alexey Kudinkin >Assignee: Alexey Kudinkin >Priority: Blocker > Labels: pull-request-available > Fix For: 0.13.1 > > > After adding a simple statement forcing the test to read whole bootstrapped > table: > {code:java} > sqlContext.sql("select * from bootstrapped").show(); {code} > > Following NPE have been observed on master > (testBulkInsertsAndUpsertsWithBootstrap): > {code:java} > org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in > stage 183.0 failed 1 times, most recent failure: Lost task 0.0 in stage 183.0 > (TID 971, localhost, executor driver): java.lang.NullPointerException > at > org.apache.spark.sql.catalyst.expressions.codegen.UnsafeWriter.write(UnsafeWriter.java:109) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.writeFields_0_1$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown > Source) > at scala.collection.Iterator$$anon$10.next(Iterator.scala:448) > at > org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:256) > at > org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:836) > at > org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:836) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:288) > at > 
org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:288) > at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90) > at org.apache.spark.scheduler.Task.run(Task.scala:123) > at > org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:411) > at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360) > at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) > at java.lang.Thread.run(Thread.java:748)Driver stacktrace: at > org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:1889) > at > org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:1877) > at > org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:1876) > at > scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:59) > at > scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:52) > at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48) > at > org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876) > at > org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:926) > at > org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:926) > at scala.Option.foreach(Option.scala:257) > at > org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926) > at > org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110) > at > org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059) > at > 
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048) > at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49) > at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737) > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061) > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082) > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101) > at > org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:365) > at > org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38) > at
[jira] [Updated] (HUDI-5656) Metadata Bootstrap flow resulting in NPE
[ https://issues.apache.org/jira/browse/HUDI-5656?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Alexey Kudinkin updated HUDI-5656: -- Fix Version/s: 0.13.1 (was: 0.14.0) > Metadata Bootstrap flow resulting in NPE > > > Key: HUDI-5656 > URL: https://issues.apache.org/jira/browse/HUDI-5656 > Project: Apache Hudi > Issue Type: Bug > Components: bootstrap >Affects Versions: 0.13.0 >Reporter: Alexey Kudinkin >Assignee: Alexey Kudinkin >Priority: Blocker > Labels: pull-request-available > Fix For: 0.13.1 > > > After adding a simple statement forcing the test to read whole bootstrapped > table: > {code:java} > sqlContext.sql("select * from bootstrapped").show(); {code} > > Following NPE have been observed on master > (testBulkInsertsAndUpsertsWithBootstrap): > {code:java} > org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in > stage 183.0 failed 1 times, most recent failure: Lost task 0.0 in stage 183.0 > (TID 971, localhost, executor driver): java.lang.NullPointerException > at > org.apache.spark.sql.catalyst.expressions.codegen.UnsafeWriter.write(UnsafeWriter.java:109) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.writeFields_0_1$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown > Source) > at scala.collection.Iterator$$anon$10.next(Iterator.scala:448) > at > org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:256) > at > org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:836) > at > org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:836) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324) > at 
org.apache.spark.rdd.RDD.iterator(RDD.scala:288) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:288) > at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90) > at org.apache.spark.scheduler.Task.run(Task.scala:123) > at > org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:411) > at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360) > at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) > at java.lang.Thread.run(Thread.java:748)Driver stacktrace: at > org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:1889) > at > org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:1877) > at > org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:1876) > at > scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:59) > at > scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:52) > at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48) > at > org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876) > at > org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:926) > at > org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:926) > at scala.Option.foreach(Option.scala:257) > at > org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926) > at > org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110) > at > 
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059) > at > org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048) > at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49) > at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737) > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061) > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082) > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101) > at > org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:365) > at >
[jira] [Updated] (HUDI-915) Partition Columns missing in files upserted after Metadata Bootstrap
[ https://issues.apache.org/jira/browse/HUDI-915?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Alexey Kudinkin updated HUDI-915: - Status: Patch Available (was: In Progress) > Partition Columns missing in files upserted after Metadata Bootstrap > > > Key: HUDI-915 > URL: https://issues.apache.org/jira/browse/HUDI-915 > Project: Apache Hudi > Issue Type: Bug > Components: Common Core >Affects Versions: 0.13.0 >Reporter: Udit Mehrotra >Assignee: Alexey Kudinkin >Priority: Blocker > Labels: pull-request-available > Fix For: 0.13.1 > > > This issue happens in when the source data is partitioned using _*hive-style > partitioning*_ which is also the default behavior of spark when it writes the > data. With this partitioning, the partition column/schema is never stored in > the files but instead retrieved on the fly from the file paths which have > partition folder in the form *_partition_key=partition_value_*. > Now, during metadata bootstrap we store only the metadata columns in the hudi > table folder. Also the *bootstrap schema* we are computing directly reads > schema from the source data file which does not have the *partition column > schema* in it. Thus it is not complete. > All this manifests into issues when we ultimately do *upserts* on these > bootstrapped files and they are fully bootstrapped. During upsert time the > schema evolves because the upsert dataframe needs to have partition column in > it for performing upserts. Thus ultimately the *upserted rows* have the > correct partition column value stored, while the other records which are > simply copied over from the metadata bootstrap file have missing partition > column in them. Thus, we observe a different behavior here with > *bootstrapped* vs *non-bootstrapped* tables. 
> While this is not at the moment creating issues with *Hive* because it is > able to determine the partition columns because of all the metadata it > stores, however it creates a problem with other engines like *Spark* where > the partition columns will show up as *null* when the upserted files are read. > Thus, the proposal is to fix the following issues: > * When performing bootstrap, figure out the partition schema and store it in > the *bootstrap schema* in the commit metadata file. This would provide the > following benefits: > ** From a completeness perspective this is good so that there are no > behavioral changes between bootstrapped vs non-bootstrapped tables. > ** In spark bootstrap relation and incremental query relation where we need > to figure out the latest schema, one can simply get the accurate schema from > the commit metadata file instead of having to determine whether or not > partition column is present in the schema obtained from the metadata file and > if not figure out the partition schema every time and merge (which can be > expensive). > * When doing upsert on files that are metadata bootstrapped, the partition > column values should be correctly determined and copied to the upserted file > to avoid missing and null values. > ** Again this is consistent behavior with non-bootstrapped tables and even > though Hive seems to somehow handle this, we should consider other engines > like *Spark* where it cannot be automatically handled. > ** Without this it will be significantly more complicated to be able to > provide the partition value on read side in spark, to be able to determine > every time whether partition value is null and somehow filling it in.
> ** Once the table is fully bootstrapped at some point in the future, and the > bootstrap commit is say cleaned up and spark querying happens through > *parquet* datasource instead of *new bootstrapped datasource*, the *parquet > datasource* will return null values wherever it finds the missing partition > values. In that case, we have no control over the *parquet* datasource as it > is simply reading from the file. -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Closed] (HUDI-915) Partition Columns missing in files upserted after Metadata Bootstrap
[ https://issues.apache.org/jira/browse/HUDI-915?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Alexey Kudinkin closed HUDI-915. Resolution: Fixed > Partition Columns missing in files upserted after Metadata Bootstrap > > > Key: HUDI-915 > URL: https://issues.apache.org/jira/browse/HUDI-915 > Project: Apache Hudi > Issue Type: Bug > Components: Common Core >Affects Versions: 0.13.0 >Reporter: Udit Mehrotra >Assignee: Alexey Kudinkin >Priority: Blocker > Labels: pull-request-available > Fix For: 0.13.1 > > > This issue happens in when the source data is partitioned using _*hive-style > partitioning*_ which is also the default behavior of spark when it writes the > data. With this partitioning, the partition column/schema is never stored in > the files but instead retrieved on the fly from the file paths which have > partition folder in the form *_partition_key=partition_value_*. > Now, during metadata bootstrap we store only the metadata columns in the hudi > table folder. Also the *bootstrap schema* we are computing directly reads > schema from the source data file which does not have the *partition column > schema* in it. Thus it is not complete. > All this manifests into issues when we ultimately do *upserts* on these > bootstrapped files and they are fully bootstrapped. During upsert time the > schema evolves because the upsert dataframe needs to have partition column in > it for performing upserts. Thus ultimately the *upserted rows* have the > correct partition column value stored, while the other records which are > simply copied over from the metadata bootstrap file have missing partition > column in them. Thus, we observe a different behavior here with > *bootstrapped* vs *non-bootstrapped* tables. 
> While this is not at the moment creating issues with *Hive* because it is > able to determine the partition columns because of all the metadata it > stores, however it creates a problem with other engines like *Spark* where > the partition columns will show up as *null* when the upserted files are read. > Thus, the proposal is to fix the following issues: > * When performing bootstrap, figure out the partition schema and store it in > the *bootstrap schema* in the commit metadata file. This would provide the > following benefits: > ** From a completeness perspective this is good so that there are no > behavioral changes between bootstrapped vs non-bootstrapped tables. > ** In spark bootstrap relation and incremental query relation where we need > to figure out the latest schema, one can simply get the accurate schema from > the commit metadata file instead of having to determine whether or not > partition column is present in the schema obtained from the metadata file and > if not figure out the partition schema every time and merge (which can be > expensive). > * When doing upsert on files that are metadata bootstrapped, the partition > column values should be correctly determined and copied to the upserted file > to avoid missing and null values. > ** Again this is consistent behavior with non-bootstrapped tables and even > though Hive seems to somehow handle this, we should consider other engines > like *Spark* where it cannot be automatically handled. > ** Without this it will be significantly more complicated to be able to > provide the partition value on read side in spark, to be able to determine > every time whether partition value is null and somehow filling it in.
> ** Once the table is fully bootstrapped at some point in the future, and the > bootstrap commit is say cleaned up and spark querying happens through > *parquet* datasource instead of *new bootstrapped datasource*, the *parquet > datasource* will return null values wherever it finds the missing partition > values. In that case, we have no control over the *parquet* datasource as it > is simply reading from the file. -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Assigned] (HUDI-915) Partition Columns missing in files upserted after Metadata Bootstrap
[ https://issues.apache.org/jira/browse/HUDI-915?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Alexey Kudinkin reassigned HUDI-915: Assignee: Alexey Kudinkin (was: Ethan Guo) > Partition Columns missing in files upserted after Metadata Bootstrap > > > Key: HUDI-915 > URL: https://issues.apache.org/jira/browse/HUDI-915 > Project: Apache Hudi > Issue Type: Task > Components: Common Core >Affects Versions: 0.9.0 >Reporter: Udit Mehrotra >Assignee: Alexey Kudinkin >Priority: Blocker > Labels: pull-request-available > Fix For: 0.13.1 > > > This issue happens in when the source data is partitioned using _*hive-style > partitioning*_ which is also the default behavior of spark when it writes the > data. With this partitioning, the partition column/schema is never stored in > the files but instead retrieved on the fly from the file paths which have > partition folder in the form *_partition_key=partition_value_*. > Now, during metadata bootstrap we store only the metadata columns in the hudi > table folder. Also the *bootstrap schema* we are computing directly reads > schema from the source data file which does not have the *partition column > schema* in it. Thus it is not complete. > All this manifests into issues when we ultimately do *upserts* on these > bootstrapped files and they are fully bootstrapped. During upsert time the > schema evolves because the upsert dataframe needs to have partition column in > it for performing upserts. Thus ultimately the *upserted rows* have the > correct partition column value stored, while the other records which are > simply copied over from the metadata bootstrap file have missing partition > column in them. Thus, we observe a different behavior here with > *bootstrapped* vs *non-bootstrapped* tables. 
> While this is not at the moment creating issues with *Hive* because it is > able to determine the partition columns because of all the metadata it > stores, however it creates a problem with other engines like *Spark* where > the partition columns will show up as *null* when the upserted files are read. > Thus, the proposal is to fix the following issues: > * When performing bootstrap, figure out the partition schema and store it in > the *bootstrap schema* in the commit metadata file. This would provide the > following benefits: > ** From a completeness perspective this is good so that there are no > behavioral changes between bootstrapped vs non-bootstrapped tables. > ** In spark bootstrap relation and incremental query relation where we need > to figure out the latest schema, one can simply get the accurate schema from > the commit metadata file instead of having to determine whether or not > partition column is present in the schema obtained from the metadata file and > if not figure out the partition schema every time and merge (which can be > expensive). > * When doing upsert on files that are metadata bootstrapped, the partition > column values should be correctly determined and copied to the upserted file > to avoid missing and null values. > ** Again this is consistent behavior with non-bootstrapped tables and even > though Hive seems to somehow handle this, we should consider other engines > like *Spark* where it cannot be automatically handled. > ** Without this it will be significantly more complicated to be able to > provide the partition value on read side in spark, to be able to determine > every time whether partition value is null and somehow filling it in.
> ** Once the table is fully bootstrapped at some point in the future, and the > bootstrap commit is say cleaned up and spark querying happens through > *parquet* datasource instead of *new bootstrapped datasource*, the *parquet > datasource* will return null values wherever it finds the missing partition > values. In that case, we have no control over the *parquet* datasource as it > is simply reading from the file. -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Updated] (HUDI-915) Partition Columns missing in files upserted after Metadata Bootstrap
[ https://issues.apache.org/jira/browse/HUDI-915?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Alexey Kudinkin updated HUDI-915: - Affects Version/s: 0.13.0 (was: 0.9.0) > Partition Columns missing in files upserted after Metadata Bootstrap > > > Key: HUDI-915 > URL: https://issues.apache.org/jira/browse/HUDI-915 > Project: Apache Hudi > Issue Type: Bug > Components: Common Core >Affects Versions: 0.13.0 >Reporter: Udit Mehrotra >Assignee: Alexey Kudinkin >Priority: Blocker > Labels: pull-request-available > Fix For: 0.13.1 > > > This issue happens in when the source data is partitioned using _*hive-style > partitioning*_ which is also the default behavior of spark when it writes the > data. With this partitioning, the partition column/schema is never stored in > the files but instead retrieved on the fly from the file paths which have > partition folder in the form *_partition_key=partition_value_*. > Now, during metadata bootstrap we store only the metadata columns in the hudi > table folder. Also the *bootstrap schema* we are computing directly reads > schema from the source data file which does not have the *partition column > schema* in it. Thus it is not complete. > All this manifests into issues when we ultimately do *upserts* on these > bootstrapped files and they are fully bootstrapped. During upsert time the > schema evolves because the upsert dataframe needs to have partition column in > it for performing upserts. Thus ultimately the *upserted rows* have the > correct partition column value stored, while the other records which are > simply copied over from the metadata bootstrap file have missing partition > column in them. Thus, we observe a different behavior here with > *bootstrapped* vs *non-bootstrapped* tables. 
> While this is not at the moment creating issues with *Hive* because it is > able to determine the partition columns because of all the metadata it > stores, however it creates a problem with other engines like *Spark* where > the partition columns will show up as *null* when the upserted files are read. > Thus, the proposal is to fix the following issues: > * When performing bootstrap, figure out the partition schema and store it in > the *bootstrap schema* in the commit metadata file. This would provide the > following benefits: > ** From a completeness perspective this is good so that there are no > behavioral changes between bootstrapped vs non-bootstrapped tables. > ** In spark bootstrap relation and incremental query relation where we need > to figure out the latest schema, one can simply get the accurate schema from > the commit metadata file instead of having to determine whether or not > partition column is present in the schema obtained from the metadata file and > if not figure out the partition schema every time and merge (which can be > expensive). > * When doing upsert on files that are metadata bootstrapped, the partition > column values should be correctly determined and copied to the upserted file > to avoid missing and null values. > ** Again this is consistent behavior with non-bootstrapped tables and even > though Hive seems to somehow handle this, we should consider other engines > like *Spark* where it cannot be automatically handled. > ** Without this it will be significantly more complicated to be able to > provide the partition value on read side in spark, to be able to determine > every time whether partition value is null and somehow filling it in.
> ** Once the table is fully bootstrapped at some point in the future, and the > bootstrap commit is say cleaned up and spark querying happens through > *parquet* datasource instead of *new bootstrapped datasource*, the *parquet > datasource* will return null values wherever it finds the missing partition > values. In that case, we have no control over the *parquet* datasource as it > is simply reading from the file. -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Updated] (HUDI-915) Partition Columns missing in files upserted after Metadata Bootstrap
[ https://issues.apache.org/jira/browse/HUDI-915?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Alexey Kudinkin updated HUDI-915: - Issue Type: Bug (was: Task) > Partition Columns missing in files upserted after Metadata Bootstrap > > > Key: HUDI-915 > URL: https://issues.apache.org/jira/browse/HUDI-915 > Project: Apache Hudi > Issue Type: Bug > Components: Common Core >Affects Versions: 0.9.0 >Reporter: Udit Mehrotra >Assignee: Alexey Kudinkin >Priority: Blocker > Labels: pull-request-available > Fix For: 0.13.1 > > > This issue happens in when the source data is partitioned using _*hive-style > partitioning*_ which is also the default behavior of spark when it writes the > data. With this partitioning, the partition column/schema is never stored in > the files but instead retrieved on the fly from the file paths which have > partition folder in the form *_partition_key=partition_value_*. > Now, during metadata bootstrap we store only the metadata columns in the hudi > table folder. Also the *bootstrap schema* we are computing directly reads > schema from the source data file which does not have the *partition column > schema* in it. Thus it is not complete. > All this manifests into issues when we ultimately do *upserts* on these > bootstrapped files and they are fully bootstrapped. During upsert time the > schema evolves because the upsert dataframe needs to have partition column in > it for performing upserts. Thus ultimately the *upserted rows* have the > correct partition column value stored, while the other records which are > simply copied over from the metadata bootstrap file have missing partition > column in them. Thus, we observe a different behavior here with > *bootstrapped* vs *non-bootstrapped* tables. 
> While this is not at the moment creating issues with *Hive* because it is > able to determine the partition columns because of all the metadata it > stores, however it creates a problem with other engines like *Spark* where > the partition columns will show up as *null* when the upserted files are read. > Thus, the proposal is to fix the following issues: > * When performing bootstrap, figure out the partition schema and store it in > the *bootstrap schema* in the commit metadata file. This would provide the > following benefits: > ** From a completeness perspective this is good so that there are no > behavioral changes between bootstrapped vs non-bootstrapped tables. > ** In spark bootstrap relation and incremental query relation where we need > to figure out the latest schema, one can simply get the accurate schema from > the commit metadata file instead of having to determine whether or not > partition column is present in the schema obtained from the metadata file and > if not figure out the partition schema every time and merge (which can be > expensive). > * When doing upsert on files that are metadata bootstrapped, the partition > column values should be correctly determined and copied to the upserted file > to avoid missing and null values. > ** Again this is consistent behavior with non-bootstrapped tables and even > though Hive seems to somehow handle this, we should consider other engines > like *Spark* where it cannot be automatically handled. > ** Without this it will be significantly more complicated to be able to > provide the partition value on read side in spark, to be able to determine > every time whether partition value is null and somehow filling it in.
> ** Once the table is fully bootstrapped at some point in the future, and the > bootstrap commit is say cleaned up and spark querying happens through > *parquet* datasource instead of *new bootstrapped datasource*, the *parquet > datasource* will return null values wherever it finds the missing partition > values. In that case, we have no control over the *parquet* datasource as it > is simply reading from the file. -- This message was sent by Atlassian Jira (v8.20.10#820010)
[hudi] branch master updated: [HUDI-915][HUDI-5656] Rebased `HoodieBootstrapRelation` onto `HoodieBaseRelation` (#7804)
This is an automated email from the ASF dual-hosted git repository. akudinkin pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git The following commit(s) were added to refs/heads/master by this push: new 2770ff50714 [HUDI-915][HUDI-5656] Rebased `HoodieBootstrapRelation` onto `HoodieBaseRelation` (#7804) 2770ff50714 is described below commit 2770ff507141f013f7500354595137b52a543e8b Author: Alexey Kudinkin AuthorDate: Fri Feb 24 08:43:49 2023 -0800 [HUDI-915][HUDI-5656] Rebased `HoodieBootstrapRelation` onto `HoodieBaseRelation` (#7804) Currently `HoodieBootstrapRelation` treats partitioned tables improperly resulting in NPE while trying to read bootstrapped table. To address that `HoodieBootstrapRelation` has been rebased onto `HoodieBaseRelation` sharing the core of the reading semantics with Hudi's other file-based Relation implementations for COW, MOR (such as schema handling, file-listing, etc) --- .../scala/org/apache/hudi/HoodieBaseRelation.scala | 47 ++-- .../scala/org/apache/hudi/HoodieBootstrapRDD.scala | 103 .../org/apache/hudi/HoodieBootstrapRelation.scala | 259 +++-- .../apache/hudi/MergeOnReadSnapshotRelation.scala | 2 +- .../spark/sql/hudi/HoodieSqlCommonUtils.scala | 30 ++- .../functional/TestDataSourceForBootstrap.scala| 166 +++-- .../deltastreamer/TestHoodieDeltaStreamer.java | 4 + 7 files changed, 344 insertions(+), 267 deletions(-) diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBaseRelation.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBaseRelation.scala index 99b5b5c87ba..cb02c59a690 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBaseRelation.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBaseRelation.scala @@ -48,6 +48,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD import
org.apache.spark.sql.HoodieCatalystExpressionUtils.{convertToCatalystExpression, generateUnsafeProjection} import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.analysis.Resolver import org.apache.spark.sql.catalyst.expressions.{Expression, SubqueryExpression} import org.apache.spark.sql.execution.FileRelation import org.apache.spark.sql.execution.datasources._ @@ -66,7 +67,12 @@ import scala.util.{Failure, Success, Try} trait HoodieFileSplit {} -case class HoodieTableSchema(structTypeSchema: StructType, avroSchemaStr: String, internalSchema: Option[InternalSchema] = None) +case class HoodieTableSchema(structTypeSchema: StructType, avroSchemaStr: String, internalSchema: Option[InternalSchema] = None) { + + def this(structTypeSchema: StructType) = +this(structTypeSchema, convertToAvroSchema(structTypeSchema).toString) + +} case class HoodieTableState(tablePath: String, latestCommitTimestamp: Option[String], @@ -98,6 +104,8 @@ abstract class HoodieBaseRelation(val sqlContext: SQLContext, protected val sparkSession: SparkSession = sqlContext.sparkSession + protected lazy val resolver: Resolver = sparkSession.sessionState.analyzer.resolver + protected lazy val conf: Configuration = new Configuration(sqlContext.sparkContext.hadoopConfiguration) protected lazy val jobConf = new JobConf(conf) @@ -174,8 +182,6 @@ abstract class HoodieBaseRelation(val sqlContext: SQLContext, protected lazy val tableStructSchema: StructType = { val converted = AvroConversionUtils.convertAvroSchemaToStructType(tableAvroSchema) - -val resolver = sparkSession.sessionState.analyzer.resolver val metaFieldMetadata = sparkAdapter.createCatalystMetadataForMetaField // NOTE: Here we annotate meta-fields with corresponding metadata such that Spark (>= 3.2) @@ -466,10 +472,15 @@ abstract class HoodieBaseRelation(val sqlContext: SQLContext, * For enable hoodie.datasource.write.drop.partition.columns, need to create an InternalRow on partition values * and pass this 
reader on parquet file. So that, we can query the partition columns. */ - protected def getPartitionColumnsAsInternalRow(file: FileStatus): InternalRow = { + + protected def getPartitionColumnsAsInternalRow(file: FileStatus): InternalRow = +getPartitionColumnsAsInternalRowInternal(file, shouldExtractPartitionValuesFromPartitionPath) + + protected def getPartitionColumnsAsInternalRowInternal(file: FileStatus, + extractPartitionValuesFromPartitionPath: Boolean): InternalRow = { try { val tableConfig = metaClient.getTableConfig - if (shouldExtractPartitionValuesFromPartitionPath) { + if (extractPartitionValuesFromPartitionPath) { val relativePath = new URI(metaClient.getBasePath).relativize(new
[GitHub] [hudi] alexeykudinkin merged pull request #7804: [HUDI-915][HUDI-5656] Rebased `HoodieBootstrapRelation` onto `HoodieBaseRelation`
alexeykudinkin merged PR #7804: URL: https://github.com/apache/hudi/pull/7804 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #8041: [HUDI-5847] Add support for multiple metric reporters and metric labels
hudi-bot commented on PR #8041: URL: https://github.com/apache/hudi/pull/8041#issuecomment-1443928774 ## CI report: * 2068af21c85e51254b849562f38cf6a2918f9bef Azure: [SUCCESS](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=15386) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] asethia commented on issue #5537: hudi supports custom catalog name, spark_catalog is not mandatory
asethia commented on issue #5537: URL: https://github.com/apache/hudi/issues/5537#issuecomment-1443872559 Is there any further update on this? If the hack is the solution, what does it take to add it as part of the main code? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #8027: [HUDI-5838] Mask sensitive info while printing hudi properties in DeltaStreamer
hudi-bot commented on PR #8027: URL: https://github.com/apache/hudi/pull/8027#issuecomment-1443833415 ## CI report: * d7c44d73145467406dcd5ecb33ee976059ea6d8f Azure: [SUCCESS](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=15385) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #7998: [HUDI-5824] Fix: do not combine if write operation is Upsert and COMBINE_BEFORE_UPSERT is false
hudi-bot commented on PR #7998: URL: https://github.com/apache/hudi/pull/7998#issuecomment-1443833230 ## CI report: * 27d61f01fb6709e3aaa08de9ace7738dbedffb24 UNKNOWN * e8e3240aff997075065eb01d9277b227ab2bdf73 Azure: [SUCCESS](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=15365) * cde8d4ffa1cae261731d94c2a0117ece6473a882 Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=15389) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] soumilshah1995 commented on issue #8040: [SUPPORT] Getting error when writing into HUDI table if schema changed (datatype changed / column dropped)
soumilshah1995 commented on issue #8040: URL: https://github.com/apache/hudi/issues/8040#issuecomment-1443831871 sample code ``` import sys from awsglue.transforms import * from awsglue.utils import getResolvedOptions from pyspark.sql.session import SparkSession from pyspark.context import SparkContext from awsglue.context import GlueContext from awsglue.job import Job from pyspark.sql import DataFrame, Row from pyspark.sql.functions import * from pyspark.sql.functions import col, to_timestamp, monotonically_increasing_id, to_date, when import datetime from awsglue import DynamicFrame import boto3 ## @params: [JOB_NAME] args = getResolvedOptions(sys.argv, ["JOB_NAME", "database_name", "kinesis_table_name", "starting_position_of_kinesis_iterator", "hudi_table_name", "window_size", "s3_path_hudi", "s3_path_spark"]) spark = SparkSession.builder.config('spark.serializer', 'org.apache.spark.serializer.KryoSerializer').config( 'spark.sql.hive.convertMetastoreParquet', 'false').getOrCreate() sc = spark.sparkContext glueContext = GlueContext(sc) job = Job(glueContext) job.init(args['JOB_NAME'], args) database_name = args["database_name"] kinesis_table_name = args["kinesis_table_name"] hudi_table_name = args["hudi_table_name"] s3_path_hudi = args["s3_path_hudi"] s3_path_spark = args["s3_path_spark"] print("***") print(f""" database_name {database_name} kinesis_table_name = {kinesis_table_name} hudi_table_name ={hudi_table_name} s3_path_hudi = {s3_path_hudi} s3_path_spark = {s3_path_spark} """) # can be set to "latest", "trim_horizon" or "earliest" starting_position_of_kinesis_iterator = args["starting_position_of_kinesis_iterator"] # The amount of time to spend processing each batch window_size = args["window_size"] data_frame_DataSource0 = glueContext.create_data_frame.from_catalog( database=database_name, table_name=kinesis_table_name, transformation_ctx="DataSource0", additional_options={"inferSchema": "true", "startingPosition": starting_position_of_kinesis_iterator} ) 
# config commonConfig = { 'path': s3_path_hudi } hudiWriteConfig = { 'className': 'org.apache.hudi', 'hoodie.table.name': hudi_table_name, 'hoodie.datasource.write.operation': 'upsert', 'hoodie.datasource.write.table.type': 'COPY_ON_WRITE', 'hoodie.datasource.write.precombine.field': 'date', 'hoodie.datasource.write.recordkey.field': '_id', 'hoodie.datasource.write.partitionpath.field': 'year:SIMPLE,month:SIMPLE,day:SIMPLE', 'hoodie.datasource.write.keygenerator.class': 'org.apache.hudi.keygen.CustomKeyGenerator', 'hoodie.deltastreamer.keygen.timebased.timestamp.type': 'MIXED', 'hoodie.deltastreamer.keygen.timebased.input.dateformat': '-mm-dd', 'hoodie.deltastreamer.keygen.timebased.output.dateformat': '/MM/dd' } hudiGlueConfig = { 'hoodie.datasource.hive_sync.enable': 'true', 'hoodie.datasource.hive_sync.sync_as_datasource': 'false', 'hoodie.datasource.hive_sync.database': database_name, 'hoodie.datasource.hive_sync.table': hudi_table_name, 'hoodie.datasource.hive_sync.use_jdbc': 'false', 'hoodie.datasource.write.hive_style_partitioning': 'true', 'hoodie.datasource.hive_sync.partition_extractor_class': 'org.apache.hudi.hive.MultiPartKeysValueExtractor', 'hoodie.datasource.hive_sync.partition_fields': 'year,month,day' } combinedConf = { **commonConfig, **hudiWriteConfig, **hudiGlueConfig } # ensure the incomong record has the correct current schema, new fresh columns are fine, if a column exists in current schema but not in incoming record then manually add before inserting def evolveSchema(kinesis_df, table, forcecast=False): try: # get existing table's schema glue_catalog_df = spark.sql("SELECT * FROM " + table + " LIMIT 0") # sanitize for hudi specific system columns columns_to_drop = ['_hoodie_commit_time', '_hoodie_commit_seqno', '_hoodie_record_key', '_hoodie_partition_path', '_hoodie_file_name'] glue_catalog_df_sanitized = glue_catalog_df.drop(*columns_to_drop) if (kinesis_df.schema != glue_catalog_df_sanitized.schema): merged_df = 
kinesis_df.unionByName(glue_catalog_df_sanitized, allowMissingColumns=True) return (merged_df) except Exception as e: print(e) return (kinesis_df) def processBatch(data_frame, batchId): if (data_frame.count() > 0): kinesis_dynamic_frame = DynamicFrame.fromDF(data_frame, glueContext, "from_kinesis_data_frame")
[GitHub] [hudi] soumilshah1995 commented on issue #8040: [SUPPORT] Getting error when writing into HUDI table if schema changed (datatype changed / column dropped)
soumilshah1995 commented on issue #8040: URL: https://github.com/apache/hudi/issues/8040#issuecomment-1443831103 please share your job parameters -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #7680: [HUDI-5548] spark sql show | update hudi's table properties
hudi-bot commented on PR #7680: URL: https://github.com/apache/hudi/pull/7680#issuecomment-1443831794 ## CI report: * a0f3f705fefe86934c74bf2c1ccb97b03480fc99 Azure: [SUCCESS](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=15294) * bde72fbb0e1f68dbd4d73954a204ae52e84f5b4d Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=15387) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #7997: [HUDI-5822] Fix FileId not found exception when FileId is passed to HoodieMergeHa...
hudi-bot commented on PR #7997: URL: https://github.com/apache/hudi/pull/7997#issuecomment-1443823058 ## CI report: * 4ea65336bf55d988e388f7301a0cad9f42bd7b9b Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=15311) Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=15388) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #7680: [HUDI-5548] spark sql show | update hudi's table properties
hudi-bot commented on PR #7680: URL: https://github.com/apache/hudi/pull/7680#issuecomment-1443821665 ## CI report: * a0f3f705fefe86934c74bf2c1ccb97b03480fc99 Azure: [SUCCESS](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=15294) * bde72fbb0e1f68dbd4d73954a204ae52e84f5b4d UNKNOWN Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] voonhous commented on pull request #7997: [HUDI-5822] Fix FileId not found exception when FileId is passed to HoodieMergeHa...
voonhous commented on PR #7997: URL: https://github.com/apache/hudi/pull/7997#issuecomment-1443787040 @hudi-bot run azure -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #7998: [HUDI-5824] Fix: do not combine if write operation is Upsert and COMBINE_BEFORE_UPSERT is false
hudi-bot commented on PR #7998: URL: https://github.com/apache/hudi/pull/7998#issuecomment-1443746633 ## CI report: * 27d61f01fb6709e3aaa08de9ace7738dbedffb24 UNKNOWN * e8e3240aff997075065eb01d9277b227ab2bdf73 Azure: [SUCCESS](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=15365) * cde8d4ffa1cae261731d94c2a0117ece6473a882 UNKNOWN Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #8024: [MINOR] Improve RollbackToInstantTimeProcedure
hudi-bot commented on PR #8024: URL: https://github.com/apache/hudi/pull/8024#issuecomment-1443737757 ## CI report: * 404047bbd1277cb70a2c5c06fc95caad8a8cecd8 Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=15384) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] kazdy commented on a diff in pull request #7998: [HUDI-5824] Fix: do not combine if write operation is Upsert and COMBINE_BEFORE_UPSERT is false
kazdy commented on code in PR #7998: URL: https://github.com/apache/hudi/pull/7998#discussion_r1117025527 ## hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieSparkSqlWriter.scala: ## @@ -960,6 +960,86 @@ class TestHoodieSparkSqlWriter { assert(spark.read.format("hudi").load(tempBasePath).where("age >= 2000").count() == 10) } + /** + * Test upsert for CoW table without precombine field and combine before upsert disabled. + */ + @Test + def testUpsertWithoutPrecombineFieldAndCombineBeforeUpsertDisabled(): Unit = { +val options = Map(DataSourceWriteOptions.TABLE_TYPE.key -> HoodieTableType.COPY_ON_WRITE.name(), Review Comment: only for CoW since I assume MoR requires a precombine field -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #8039: [HUDI-5846] use the right constructor of SparkBulkInsertDeltaCommitActionExecutor
hudi-bot commented on PR #8039: URL: https://github.com/apache/hudi/pull/8039#issuecomment-1443678413 ## CI report: * 79c19b07299dc7fd5c580d38b760b7803881811a Azure: [SUCCESS](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=15383) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #7886: [HUDI-5726] Fix timestamp field is 8 hours longer than the time
hudi-bot commented on PR #7886: URL: https://github.com/apache/hudi/pull/7886#issuecomment-1443616706 ## CI report: * 4d58589ac15b41c133b97f3f8ec924adcf6222da Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=15381) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #8041: [HUDI-5847] Add support for multiple metric reporters and metric labels
hudi-bot commented on PR #8041: URL: https://github.com/apache/hudi/pull/8041#issuecomment-1443609262 ## CI report: * 2068af21c85e51254b849562f38cf6a2918f9bef Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=15386) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #8041: [HUDI-5847] Add support for multiple metric reporters and metric labels
hudi-bot commented on PR #8041: URL: https://github.com/apache/hudi/pull/8041#issuecomment-1443600686 ## CI report: * 2068af21c85e51254b849562f38cf6a2918f9bef UNKNOWN Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[jira] [Commented] (HUDI-2097) Fix unable to read commit metadata error
[ https://issues.apache.org/jira/browse/HUDI-2097?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17693177#comment-17693177 ] Xinglong Wang commented on HUDI-2097: - Fix exception like below? ``` org.apache.hudi.exception.HoodieException: java.io.IOException: unable to read commit metadata at org.apache.hudi.sink.partitioner.profile.WriteProfiles.getCommitMetadata(WriteProfiles.java:275) at org.apache.hudi.source.StreamReadMonitoringFunction.lambda$monitorDirAndForwardSplits$103(StreamReadMonitoringFunction.java:297) at java.util.stream.ReferencePipeline$3$1.accept(ReferencePipeline.java:193) at java.util.ArrayList$ArrayListSpliterator.forEachRemaining(ArrayList.java:1384) at java.util.stream.AbstractPipeline.copyInto(AbstractPipeline.java:482) at java.util.stream.AbstractPipeline.wrapAndCopyInto(AbstractPipeline.java:472) at java.util.stream.ReduceOps$ReduceOp.evaluateSequential(ReduceOps.java:708) at java.util.stream.AbstractPipeline.evaluate(AbstractPipeline.java:234) at java.util.stream.ReferencePipeline.collect(ReferencePipeline.java:499) at org.apache.hudi.source.StreamReadMonitoringFunction.monitorDirAndForwardSplits(StreamReadMonitoringFunction.java:297) at org.apache.hudi.source.StreamReadMonitoringFunction.run(StreamReadMonitoringFunction.java:216) at org.apache.flink.streaming.api.operators.StreamSource.run(StreamSource.java:110) at org.apache.flink.streaming.api.operators.StreamSource.run(StreamSource.java:66) at org.apache.flink.streaming.runtime.tasks.SourceStreamTask$LegacySourceFunctionThread.run(SourceStreamTask.java:263) Caused by: java.io.IOException: unable to read commit metadata at org.apache.hudi.common.model.HoodieCommitMetadata.fromBytes(HoodieCommitMetadata.java:405) at org.apache.hudi.sink.partitioner.profile.WriteProfiles.getCommitMetadata(WriteProfiles.java:271) ... 
13 more Caused by: com.fasterxml.jackson.core.JsonParseException: Unrecognized token 'Objavro': was expecting ('true', 'false' or 'null') at [Source: UNKNOWN; line: 1, column: 11] at com.fasterxml.jackson.core.JsonParser._constructError(JsonParser.java:1798) at com.fasterxml.jackson.core.base.ParserMinimalBase._reportError(ParserMinimalBase.java:673) at com.fasterxml.jackson.core.json.ReaderBasedJsonParser._reportInvalidToken(ReaderBasedJsonParser.java:2835) at com.fasterxml.jackson.core.json.ReaderBasedJsonParser._handleOddValue(ReaderBasedJsonParser.java:1889) at com.fasterxml.jackson.core.json.ReaderBasedJsonParser.nextToken(ReaderBasedJsonParser.java:747) at com.fasterxml.jackson.databind.ObjectMapper._initForReading(ObjectMapper.java:4129) at com.fasterxml.jackson.databind.ObjectMapper._readMapAndClose(ObjectMapper.java:3988) at com.fasterxml.jackson.databind.ObjectMapper.readValue(ObjectMapper.java:2992) at org.apache.hudi.common.model.HoodieCommitMetadata.fromJsonString(HoodieCommitMetadata.java:187) at org.apache.hudi.common.model.HoodieCommitMetadata.fromBytes(HoodieCommitMetadata.java:403) ... 14 more ``` > Fix unable to read commit metadata error > > > Key: HUDI-2097 > URL: https://issues.apache.org/jira/browse/HUDI-2097 > Project: Apache Hudi > Issue Type: Bug > Components: flink >Reporter: Zheng yunhong >Assignee: Zheng yunhong >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > > Fix unable to read commit metadata error while reading json style data. -- This message was sent by Atlassian Jira (v8.20.10#820010)
[GitHub] [hudi] danielfordfc opened a new issue, #8042: [SUPPORT] Errors ingesting enum's when using SQL Transformer
danielfordfc opened a new issue, #8042: URL: https://github.com/apache/hudi/issues/8042 Created off the back of https://github.com/apache/hudi/issues/7867 to focus more on the SQL Transformer **Describe the problem you faced** We are using the DeltaStreamer on EMR 6.8.0, sourcing data from Confluent Kafka Avro topics and using our Confluent Schema Registry to deserialize the messages, which we write to the Glue Data Catalog and query with Athena. For the majority of topics this works well, however, we noticed deserialisation errors when topics have Avro enum types in the schema. Errors come in two forms, based on whether we use the default KafkaAvroDeserializer, or the KafkaAvroSchemaDeserializer **with an SQL Transformer** With the KafkaAvroDeserializer ``` Scala.MatchError Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 1.0 failed 4 times, most recent failure: Lost task 0.3 in stage 1.0 (TID 4) (ip-10-154-13-123.eu-west-1.compute.internal executor 1): scala.MatchError: processing (of class org.apache.avro.generic.GenericData$EnumSymbol) at org.apache.hudi.org.apache.spark.sql.avro.AvroDeserializer.$anonfun$newWriter$13(AvroDeserializer.scala:178) at org.apache.hudi.org.apache.spark.sql.avro.AvroDeserializer.$anonfun$newWriter$13$adapted(AvroDeserializer.scala:177) at org.apache.hudi.org.apache.spark.sql.avro.AvroDeserializer.$anonfun$getRecordWriter$2(AvroDeserializer.scala:379) at org.apache.hudi.org.apache.spark.sql.avro.AvroDeserializer.$anonfun$getRecordWriter$2$adapted(AvroDeserializer.scala:375) at org.apache.hudi.org.apache.spark.sql.avro.AvroDeserializer.$anonfun$getRecordWriter$3(AvroDeserializer.scala:389) at org.apache.hudi.org.apache.spark.sql.avro.AvroDeserializer.$anonfun$getRecordWriter$3$adapted(AvroDeserializer.scala:385) at org.apache.hudi.org.apache.spark.sql.avro.AvroDeserializer.$anonfun$converter$4(AvroDeserializer.scala:87) at 
org.apache.hudi.org.apache.spark.sql.avro.AvroDeserializer.deserialize(AvroDeserializer.scala:105) at org.apache.hudi.org.apache.spark.sql.avro.HoodieSpark3_3AvroDeserializer.deserialize(HoodieSpark3_3AvroDeserializer.scala:30) ``` When using the KafkaAvroSchemaDeserializer, we get the following: ``` org.apache.avro.AvroTypeException Caused by: org.apache.kafka.common.errors.SerializationException: Error deserializing key/value for partition {topic}-0 at offset 7202. If needed, please seek past the record to continue consumption. Caused by: org.apache.kafka.common.errors.SerializationException: Error deserializing Avro message for id 4144 Caused by: org.apache.avro.AvroTypeException: Found {avro_record_namespace}.{enum_name}, expecting string ``` for an enum field resembling the following (note that sometimes there are default: added to the field, but never a default: at the symbol level... ``` { "name": "status", "type": { "type": "enum", "name": "status_options", "symbols": [ "processing", "completed", "error" ] } }, ``` With the KafkaAvroSchemaDeserializer --> org.apache.avro.AvroTypeException: Found {avro_record_namespace}.status_options, expecting string. 
Without the KafkaAvroSchemaDeserializer --> scala.MatchError: {one_of_the_enum_symbols} (of class org.apache.avro.generic.GenericData$EnumSymbol **To Reproduce** With a duplicate environment to the one mentioned at the beginning, our Spark command is: ``` "spark-submit", "--class", "org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer", "--conf", "spark.scheduler.mode=FAIR", "--conf", "spark.serializer=org.apache.spark.serializer.KryoSerializer", "--conf", "spark.sql.catalogImplementation=hive", "--conf", "spark.sql.hive.convertMetastoreParquet=false", "--conf", "spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension", "--conf", "spark.sql.catalog.spark_catalog=org.apache.spark.sql.hudi.catalog.HoodieCatalog", "--conf", "spark.hadoop.hive.metastore.client.factory.class=com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory", "--conf", "spark.streaming.kafka.allowNonConsecutiveOffsets=true", # IMPORTANT: hudi-utilities-bundle must be declared immediately before any Hudi spark commands "/usr/lib/hudi/hudi-utilities-bundle.jar", "--source-class", "org.apache.hudi.utilities.sources.{{ source_type }}", "--source-ordering-field", "{{ timestamp_field }}", "--table-type", "COPY_ON_WRITE", "--op", "UPSERT", "--enable-sync", "--continuous", # Hudi write config "--target-base-path", f"s3://{bucket}/raw/{{ table }}", "--target-table", "{{ table }}", "--hoodie-conf", "hoodie.database.name={{ database }}_raw",
[GitHub] [hudi] kazdy commented on issue #8018: [SUPPORT] why is the schema evolution done while not setting hoodie.schema.on.read.enable
kazdy commented on issue #8018: URL: https://github.com/apache/hudi/issues/8018#issuecomment-1443575017 It feels like target table schema enforcement is needed (this topic also returns from time to time in Hudi slack). Afaik Hudi does not support such a thing overall, but it does this for MERGE INTO statement (new columns are dropped to fit the target table schema). -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[jira] [Comment Edited] (HUDI-5839) Insert in non-strict mode deduplices dataset in "append" mode - spark
[ https://issues.apache.org/jira/browse/HUDI-5839?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17692748#comment-17692748 ] kazdy edited comment on HUDI-5839 at 2/24/23 11:35 AM: --- Hi [~codope] could you take a look at this issue? I'm not sure if this is working "as expected" or if it's a bug. Looking at behavior in 0.12.1 it is a bug, in 0.12.1 it creates duplicates when I do insert in non-strict mode using spark append mode. was (Author: JIRAUSER284048): Hi [~codope] could you take a look at this issue? I'm not sure if this is working "as expected" or if it's a bug. > Insert in non-strict mode deduplices dataset in "append" mode - spark > - > > Key: HUDI-5839 > URL: https://issues.apache.org/jira/browse/HUDI-5839 > Project: Apache Hudi > Issue Type: Bug > Components: spark, writer-core >Affects Versions: 0.13.0 >Reporter: kazdy >Priority: Major > > There seems to be a bug with non-strict insert mode when precombine is not > defined (but I have not checked for when it is). > When using spark datasource it can insert duplicates only in overwrite mode > or append mode when data is inserted to the table for the first time, but if > I want to insert in append mode for the second time it deduplicates the > dataset as if it was working in upsert mode. Found in master (0.13.0). > It happens to be a regression, because I'm using this functionality in Hudi > 0.12.1. 
> {code:java} > from pyspark.sql.functions import expr > path = "/tmp/huditbl" > opt_insert = { > 'hoodie.table.name': 'huditbl', > 'hoodie.datasource.write.recordkey.field': 'keyid', > 'hoodie.datasource.write.table.name': 'huditbl', > 'hoodie.datasource.write.operation': 'insert', > 'hoodie.sql.insert.mode': 'non-strict', > 'hoodie.upsert.shuffle.parallelism': 2, > 'hoodie.insert.shuffle.parallelism': 2, > 'hoodie.combine.before.upsert': 'false', > 'hoodie.combine.before.insert': 'false', > 'hoodie.datasource.write.insert.drop.duplicates': 'false' > } > df = spark.range(0, 10).toDF("keyid") \ > .withColumn("age", expr("keyid + 1000")) > df.write.format("hudi"). \ > options(**opt_insert). \ > mode("overwrite"). \ > save(path) > spark.read.format("hudi").load(path).count() # returns 10 > df = df.union(df) # creates duplicates > df.write.format("hudi"). \ > options(**opt_insert). \ > mode("append"). \ > save(path) > spark.read.format("hudi").load(path).count() # returns 10 but should return > 20 > # note > # this works: > df = df.union(df) # creates duplicates > df.write.format("hudi"). \ > options(**opt_insert). \ > mode("overwrite"). \ > save(path) > spark.read.format("hudi").load(path).count() # returns 20 as it should{code} > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Closed] (HUDI-5825) Disable Spark UI in tests if SPARK_EVLOG_DIR not set
[ https://issues.apache.org/jira/browse/HUDI-5825?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] kazdy closed HUDI-5825. --- Resolution: Implemented > Disable Spark UI in tests if SPARK_EVLOG_DIR not set > > > Key: HUDI-5825 > URL: https://issues.apache.org/jira/browse/HUDI-5825 > Project: Apache Hudi > Issue Type: Improvement > Components: tests-ci >Reporter: kazdy >Assignee: kazdy >Priority: Minor > Labels: pull-request-available > Fix For: 0.13.1 > > > Disable spark UI in tests if no SPARK_EVLOG_DIR is specified. > There's no reason for Spark UI to be enabled for tests in most cases. > If that's a requirement one can set SPARK_EVLOG_DIR to enable it. > In the Spark project UI is disabled for tests, with some minor exceptions > (testing metrics available via UI). I think we can follow. > Hopefully this will make tests a bit faster. -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Updated] (HUDI-5847) Add support for multiple metric reporters and metric labels
[ https://issues.apache.org/jira/browse/HUDI-5847?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] ASF GitHub Bot updated HUDI-5847: - Labels: pull-request-available (was: ) > Add support for multiple metric reporters and metric labels > --- > > Key: HUDI-5847 > URL: https://issues.apache.org/jira/browse/HUDI-5847 > Project: Apache Hudi > Issue Type: Bug >Reporter: Lokesh Jain >Assignee: Lokesh Jain >Priority: Major > Labels: pull-request-available > > The jira aims to add support for multiple metric reporters within a Metric > Registry. Further it also adds labels to metrics. -- This message was sent by Atlassian Jira (v8.20.10#820010)
[GitHub] [hudi] lokeshj1703 opened a new pull request, #8041: [HUDI-5847] Add support for multiple metric reporters and metric labels
lokeshj1703 opened a new pull request, #8041: URL: https://github.com/apache/hudi/pull/8041 ### Change Logs The PR aims to add support for multiple metric reporters within a Metric Registry. Further it also adds labels to metrics. ### Impact NA ### Risk level (write none, low medium or high below) low ### Documentation Update _Describe any necessary documentation update if there is any new feature, config, or user-facing change_ - _The config description must be updated if new configs are added or the default value of the configs are changed_ - _Any new feature or user-facing change requires updating the Hudi website. Please create a Jira ticket, attach the ticket number here and follow the [instruction](https://hudi.apache.org/contribute/developer-setup#website) to make changes to the website._ ### Contributor's checklist - [ ] Read through [contributor's guide](https://hudi.apache.org/contribute/how-to-contribute) - [ ] Change Logs and Impact were stated clearly - [ ] Adequate tests were added if applicable - [ ] CI passed -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[jira] [Created] (HUDI-5847) Add support for multiple metric reporters and metric labels
Lokesh Jain created HUDI-5847: - Summary: Add support for multiple metric reporters and metric labels Key: HUDI-5847 URL: https://issues.apache.org/jira/browse/HUDI-5847 Project: Apache Hudi Issue Type: Bug Reporter: Lokesh Jain Assignee: Lokesh Jain The jira aims to add support for multiple metric reporters within a Metric Registry. Further it also adds labels to metrics. -- This message was sent by Atlassian Jira (v8.20.10#820010)
[GitHub] [hudi] hudi-bot commented on pull request #8027: [HUDI-5838] Mask sensitive info while printing hudi properties in DeltaStreamer
hudi-bot commented on PR #8027: URL: https://github.com/apache/hudi/pull/8027#issuecomment-1443558467 ## CI report: * 2d0c3faee7dbb5cc6b70ed641b42c74dea8efb3a Azure: [CANCELED](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=15382) * d7c44d73145467406dcd5ecb33ee976059ea6d8f Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=15385) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #8024: [MINOR] Improve RollbackToInstantTimeProcedure
hudi-bot commented on PR #8024: URL: https://github.com/apache/hudi/pull/8024#issuecomment-1443558386 ## CI report: * 7c0543843874887de825c3af7bc1941e2abe0049 Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=15356) * 404047bbd1277cb70a2c5c06fc95caad8a8cecd8 Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=15384) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #7987: [HUDI-5514] Record Keys Auto-gen Prototype
hudi-bot commented on PR #7987: URL: https://github.com/apache/hudi/pull/7987#issuecomment-1443558232 ## CI report: * 5cfa69e4c1c487e5cedb4f8d7d3a4c7334cfe266 Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=15380) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] phantomcoder62 opened a new issue, #8040: [SUPPORT] Getting error when writing into HUDI table if schema changed (datatype changed / column dropped)
phantomcoder62 opened a new issue, #8040: URL: https://github.com/apache/hudi/issues/8040 **Problem** We were trying schema evolution in our MOR Hudi table and we are able to add new column but when we delete a column/change its type, it gives this error - "py4j.protocol.Py4JJavaError: An error occurred while calling o130.save. org.apache.parquet.io.InvalidRecordException: Parquet/Avro schema mismatch: Avro field 'deleted_column_name' not found at org.apache.parquet.avro.AvroRecordConverter.getAvroField(AvroRecordConverter.java:221)" We did more research on this issue as it was coming occasionally. But once it comes, the error continues for all the subsequent write operation on the hudi MOR table and we can't write anything on that hudi table. So our finding is that schema evolution may work fine until compaction takes place. When compaction happens in MOR, it get some file with the deleted column and some without it (or different datatypes) and that is causing this issue. You can test this by keeping the compaction, min/max commit, cleaner commit retained configurations to low values in MOR. **To Reproduce** This is the code snippet. We are using Glue 4.0 and Hudi version 0.12.1. 
(Tried using EC2 as well) import sys from awsglue.transforms import * from awsglue.utils import getResolvedOptions from awsglue.context import GlueContext from awsglue.job import Job from pyspark.sql.session import SparkSession args = getResolvedOptions(sys.argv, ['JOB_NAME','schema_evolution_testing']) spark = SparkSession.builder.config('spark.serializer','org.apache.spark.serializer.KryoSerializer').config('spark.sql.hive.convertMetastoreParquet','false').getOrCreate() sc = spark.sparkContext glueContext = GlueContext(sc) job = Job(glueContext) job.init(args['JOB_NAME'], args) def main(): # HUDI configuration commonOptions = { 'hoodie.upsert.shuffle.parallelism': 200, 'hoodie.datasource.write.table.type': 'MERGE_ON_READ', 'hoodie.datasource.write.operation': 'upsert', 'hoodie.compact.inline': 'true', 'hoodie.compact.inline.max.delta.commits': 5, 'hoodie.keep.max.commits': 5, 'hoodie.keep.min.commits': 4, 'hoodie.datasource.write.precombine.field': 'cdc_timestamp', 'hoodie.cleaner.policy': 'KEEP_LATEST_COMMITS', 'hoodie.cleaner.commits.retained': 3, 'hoodie.datasource.write.keygenerator.class': 'org.apache.hudi.keygen.NonpartitionedKeyGenerator' } tableConfig = { "hoodie.table.name": 'emp', "hoodie.datasource.write.recordkey.field": 'id', 'hoodie.datasource.write.table.name': 'emp' } CombinedConfig = {**tableConfig, **commonOptions} # Reading Parquet files from raw S3 DMS output df = spark.read.parquet('s3://bucket/folder/emp/') print("df schema is:",df.schema.simpleString()) df.show() # Writing into hudi ( df.write.format("org.apache.hudi") .options(**CombinedConfig) .mode("append") .save(f"s3://hudi_bucket/hudi_folder/emp") ) print("Upsertion into HUDI completed") main() job.commit() These are the steps we followed for testing this issue - **Column Type Changed :** run 1 - creation and insertion of 4000 records in hudi table emp with schema - struct. 
This job was successful and it created a parquet file in hudi table run 2 - Altered emp table and added column mgr_id integer. Updated 4000 records. New schema is - struct. This job was successful and it created a .log file in Hudi table. run 3 - Altered emp table changed type of mgr_id column from integer to character varying and updated 3000 records & inserted 5 records of mgr_id. New schema is - struct. This job was successful and it created another .log file and another parquet file in Hudi table. run 4 - Updated one record in DB. This job was successful and it created another .log file. run 5 - Updated one record in DB. This job failed with this ERROR- An error occurred while calling o120.save. Found int, expecting union Error log - File "/tmp/schema_evolution_testing.py", line 51, in main .save(hudi_table_path) File "/opt/amazon/spark/python/lib/pyspark.zip/pyspark/sql/readwriter.py", line 968, in save self._jwrite.save(path) File "/opt/amazon/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1321, in __call__ return_value = get_return_value( File "/opt/amazon/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line 190, in deco return f(*a, **kw) File
[GitHub] [hudi] hudi-bot commented on pull request #8027: [HUDI-5838] Mask sensitive info while printing hudi properties in DeltaStreamer
hudi-bot commented on PR #8027: URL: https://github.com/apache/hudi/pull/8027#issuecomment-1443550748 ## CI report: * 10e71ac0feb93693f00ea82dabe07d0807cd1e8a Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=15378) * 2d0c3faee7dbb5cc6b70ed641b42c74dea8efb3a Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=15382) * d7c44d73145467406dcd5ecb33ee976059ea6d8f UNKNOWN Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #8024: [MINOR] Improve RollbackToInstantTimeProcedure
hudi-bot commented on PR #8024: URL: https://github.com/apache/hudi/pull/8024#issuecomment-1443550646 ## CI report: * 7c0543843874887de825c3af7bc1941e2abe0049 Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=15356) * 404047bbd1277cb70a2c5c06fc95caad8a8cecd8 UNKNOWN Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] littleeleventhwolf closed issue #8028: [SUPPORT] Globally sort by a certain field and ensure data consistency in streaming and batch query / ETL
littleeleventhwolf closed issue #8028: [SUPPORT] Globally sort by a certain field and ensure data consistency in streaming and batch query / ETL URL: https://github.com/apache/hudi/issues/8028 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #7847: [HUDI-5697] Revisiting refreshing of Hudi relations after write operations on the tables
hudi-bot commented on PR #7847: URL: https://github.com/apache/hudi/pull/7847#issuecomment-1443522421 ## CI report: * 24f50e8f624dd2b928cf1c6c4ca7db8b84c760fd Azure: [SUCCESS](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=15379) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] XuQianJin-Stars commented on a diff in pull request #8024: [MINOR] Improve RollbackToInstantTimeProcedure
XuQianJin-Stars commented on code in PR #8024: URL: https://github.com/apache/hudi/pull/8024#discussion_r1116834151 ## hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RollbackToInstantTimeProcedure.scala: ## @@ -73,10 +73,14 @@ class RollbackToInstantTimeProcedure extends BaseProcedure with ProcedureBuilder throw new HoodieException(s"Commit $instantTime not found in Commits $completedTimeline") } - val result = if (client.rollback(instantTime)) true else false - val outputRow = Row(result) + val outputRow = new util.ArrayList[Row] + val allInstants: List[HoodieInstant] = completedTimeline +.findInstantsAfterOrEquals(instantTime, Integer.MAX_VALUE).getReverseOrderedInstants.toArray() +.map(r => r.asInstanceOf[HoodieInstant]).toList - Seq(outputRow) + allInstants.foreach(p => outputRow.add(Row(client.rollback(p.getTimestamp), p.getTimestamp))) Review Comment: > why we need outputRow, how about return allInstants.map(p => Row(client.rollback(p.getTimestamp), p.getTimestamp)) directly A bit hesitant to add `limit` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] XuQianJin-Stars commented on a diff in pull request #8024: [MINOR] Improve RollbackToInstantTimeProcedure
XuQianJin-Stars commented on code in PR #8024: URL: https://github.com/apache/hudi/pull/8024#discussion_r1116830778 ## hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RollbackToInstantTimeProcedure.scala: ## @@ -38,7 +37,8 @@ class RollbackToInstantTimeProcedure extends BaseProcedure with ProcedureBuilder ProcedureParameter.required(1, "instant_time", DataTypes.StringType, None)) private val OUTPUT_TYPE = new StructType(Array[StructField]( -StructField("rollback_result", DataTypes.BooleanType, nullable = true, Metadata.empty)) +StructField("rollback_result", DataTypes.BooleanType, nullable = true, Metadata.empty), +StructField("instant_time", DataTypes.StringType, nullable = true, Metadata.empty)) Review Comment: > Like `rollback_to_savepoint`, the request argument of `rollback_to_instant` has the `instant_time`, therefore the return result doesn't need the `instant_time`. This will iteratively output multiple lines of rollback instants. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] simonjobs commented on issue #8020: [SUPPORT] org.apache.avro.AvroTypeException: Cannot encode decimal with precision 4 as max precision 2
simonjobs commented on issue #8020: URL: https://github.com/apache/hudi/issues/8020#issuecomment-1443487821 @xiarixiaoyao First of all, yes you seem to have found the culprit. Thanks a lot for the help. Re 1: Me running 0.12.1 explains why I couldn't reproduce your first example. For our specific use case we are limited to the connectors available through AWS. So far they have been reasonably fast to upgrade these. Re 2: Your suggested change to SchemaChangeUtils.isTypeUpdateAllow looks great. I am assuming that changing isWiderThan is not a good option due to existing uses. With this possible patch together with what is already merged from #7326 my understanding is that any kind of "increase" to a decimal will be supported. Thanks again and let me know if I can assist further. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] voonhous commented on pull request #7997: [HUDI-5822] Fix FileId not found exception when FileId is passed to HoodieMergeHa...
voonhous commented on PR #7997: URL: https://github.com/apache/hudi/pull/7997#issuecomment-1443459830 @SteNicholas @danny0405 Can you please help to review this commit? Thank you -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #8039: [HUDI-5846] use the right constructor of SparkBulkInsertDeltaCommitActionExecutor
hudi-bot commented on PR #8039: URL: https://github.com/apache/hudi/pull/8039#issuecomment-1443429580 ## CI report: * 79c19b07299dc7fd5c580d38b760b7803881811a Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=15383) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #8027: [HUDI-5838] Mask sensitive info while printing hudi properties in DeltaStreamer
hudi-bot commented on PR #8027: URL: https://github.com/apache/hudi/pull/8027#issuecomment-1443429433 ## CI report: * 10e71ac0feb93693f00ea82dabe07d0807cd1e8a Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=15378) * 2d0c3faee7dbb5cc6b70ed641b42c74dea8efb3a Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=15382) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #8039: [HUDI-5846] use the right constructor of SparkBulkInsertDeltaCommitActionExecutor
hudi-bot commented on PR #8039: URL: https://github.com/apache/hudi/pull/8039#issuecomment-1443414405 ## CI report: * 79c19b07299dc7fd5c580d38b760b7803881811a UNKNOWN Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #8027: [HUDI-5838] Mask sensitive info while printing hudi properties in DeltaStreamer
hudi-bot commented on PR #8027: URL: https://github.com/apache/hudi/pull/8027#issuecomment-1443414321 ## CI report: * 10e71ac0feb93693f00ea82dabe07d0807cd1e8a Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=15378) * 2d0c3faee7dbb5cc6b70ed641b42c74dea8efb3a UNKNOWN Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #7886: [HUDI-5726] Fix timestamp field is 8 hours longer than the time
hudi-bot commented on PR #7886: URL: https://github.com/apache/hudi/pull/7886#issuecomment-1443413679 ## CI report: * fe0ffd40e67f732677d1439092a1dedfba7ea7aa Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=15309) Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=15329) * 4d58589ac15b41c133b97f3f8ec924adcf6222da Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=15381) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #7886: [HUDI-5726] Fix timestamp field is 8 hours longer than the time
hudi-bot commented on PR #7886: URL: https://github.com/apache/hudi/pull/7886#issuecomment-1443401117 ## CI report: * fe0ffd40e67f732677d1439092a1dedfba7ea7aa Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=15309) Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=15329) * 4d58589ac15b41c133b97f3f8ec924adcf6222da UNKNOWN Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] stayrascal commented on issue #8038: [SUPPORT] How to query different queries via Hive 3 & Trino
stayrascal commented on issue #8038: URL: https://github.com/apache/hudi/issues/8038#issuecomment-1443400947 And for Trino case, it cannot count all records base on RT table. ``` trino> select count(*) from hive.hudi_hms_db.flink_hudi_mor_streaming_tbl_rt; _col0 --- 8163 (1 row) Query 20230224_100730_8_sxxri, FINISHED, 2 nodes Splits: 21 total, 21 done (100.00%) 0.65 [8.16K rows, 1.72MB] [12.6K rows/s, 2.66MB/s] trino> select count(*) from hive.hudi_hms_db.flink_hudi_mor_streaming_tbl_ro; _col0 --- 8163 (1 row) Query 20230224_100735_9_sxxri, FINISHED, 2 nodes Splits: 21 total, 21 done (100.00%) 0.61 [8.16K rows, 1.72MB] [13.3K rows/s, 2.8MB/s] trino> select count(*) from hive.hudi_hms_db.flink_hudi_mor_streaming_tbl; _col0 --- 0 (1 row) Query 20230224_100738_00010_sxxri, FINISHED, 2 nodes Splits: 18 total, 18 done (100.00%) 0.56 [0 rows, 0B] [0 rows/s, 0B/s] ``` And if the MOR table haven't done any compaction, query on RT table will throw a exception that the base file not exist, is an expected behavior? ``` trino> select * from hive.hudi_hms_db.flink_hudi_mor_tbl_rt; Query 20230224_100913_00011_sxxri, FAILED, 2 nodes Splits: 4 total, 0 done (0.00%) 0.51 [0 rows, 0B] [0 rows/s, 0B/s] Query 20230224_100913_00011_sxxri failed: Not valid Parquet file: hdfs://xxx/hive/hudi_hms_db/flink_hudi_mor_tbl/par3/.83b4db58-a84b-40b5-b38d-d79acfa8db3c_20230216160153391.log.1_0-1-0 expected magic number: PAR1 got: # ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #7804: [HUDI-915][HUDI-5656] Rebased `HoodieBootstrapRelation` onto `HoodieBaseRelation`
hudi-bot commented on PR #7804: URL: https://github.com/apache/hudi/pull/7804#issuecomment-1443400674 ## CI report: * 96daf49ab19a803bfe8ce25f1fc9945f685db473 Azure: [SUCCESS](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=15376) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[jira] [Updated] (HUDI-5846) throw ClassCastException while run_bootstrap
[ https://issues.apache.org/jira/browse/HUDI-5846?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] ASF GitHub Bot updated HUDI-5846: - Labels: pull-request-available (was: ) > throw ClassCastException while run_bootstrap > -- > > Key: HUDI-5846 > URL: https://issues.apache.org/jira/browse/HUDI-5846 > Project: Apache Hudi > Issue Type: Bug > Components: bootstrap, spark >Reporter: lvyanquan >Priority: Trivial > Labels: pull-request-available > > throw ClassCastException while run_bootstrap for MERGE_ON_READ table type > h4. *version* > Spark 3.3 & hudi 0.13 > *error message* > {code:java} > Caused by: java.lang.ClassCastException: java.util.HashMap cannot be cast to > org.apache.hudi.table.BulkInsertPartitioner > at > org.apache.hudi.table.action.commit.SparkBulkInsertHelper.bulkInsert(SparkBulkInsertHelper.java:77) > at > org.apache.hudi.table.action.deltacommit.SparkBulkInsertDeltaCommitActionExecutor.execute(SparkBulkInsertDeltaCommitActionExecutor.java:60) > {code} > > *how to reproduce* > Spark SQL > {code:java} > create table hive_table ( > id int, > ts int > ) stored as parquet; > insert into hive_table values (1, 1); > create table hudi_mor ( > id int, > ts int > ) using hudi > tblproperties ( > type = 'mor', > primaryKey = 'id', > preCombineField = 'ts' > ); > call run_bootstrap(table => 'hudi_mor', table_type => 'MERGE_ON_READ', > bootstrap_path => 'hdfs://ns1/dtInsight/hive/warehouse/kunni.db/hive_table', > base_path => 'hdfs://ns1/dtInsight/hive/warehouse/kunni.db/hudi_mor', > rowKey_field => 'id', key_generator_class => > 'org.apache.hudi.keygen.NonpartitionedKeyGenerator', bootstrap_overwrite => > true, selector_class=> > 'org.apache.hudi.client.bootstrap.selector.FullRecordBootstrapModeSelector'); > {code} > > *cause* > org.apache.hudi.table.action.bootstrap.SparkBootstrapDeltaCommitActionExecutor#getBulkInsertActionExecutor > method > used wrong constructor of SparkBulkInsertDeltaCommitActionExecutor class. 
-- This message was sent by Atlassian Jira (v8.20.10#820010)
[GitHub] [hudi] lvyanquan opened a new pull request, #8039: [HUDI-5846] use the right constructor of SparkBulkInsertDeltaCommitActionExecutor
lvyanquan opened a new pull request, #8039: URL: https://github.com/apache/hudi/pull/8039 ### Change Logs JIRA: [HUDI-5846](https://issues.apache.org/jira/browse/HUDI-5846). Avoid ClassCastException while using constructor of [SparkBulkInsertDeltaCommitActionExecutor](https://github.com/apache/hudi/blob/master/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkBulkInsertDeltaCommitActionExecutor.java) for 'Map' to 'BulkInsertPartitioner' ### Impact none ### Risk level (write none, low medium or high below) none ### Documentation Update none ### Contributor's checklist - [ ] Read through [contributor's guide](https://hudi.apache.org/contribute/how-to-contribute) - [ ] Change Logs and Impact were stated clearly - [ ] Adequate tests were added if applicable - [ ] CI passed -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] stayrascal commented on issue #8038: [SUPPORT] How to query different queries via Hive 3
stayrascal commented on issue #8038: URL: https://github.com/apache/hudi/issues/8038#issuecomment-1443380149 So regarding using Hive to query incremental queries of COW & MOR table, we have to add `_hoodie_commit_time ` filter condition on MOR, but COW doesn't need, right? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] sandyfog commented on pull request #7886: [HUDI-5726] Fix timestamp field is 8 hours longer than the time
sandyfog commented on PR #7886: URL: https://github.com/apache/hudi/pull/7886#issuecomment-1443376677 > @sandyfog Seems some test failure for Flink ITs, can you run pass the `ITTestHoodieDataSource` locally ? AvroToRowDataConverters also has timezone issues. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] lokeshj1703 commented on a diff in pull request #8027: [HUDI-5838] Mask sensitive info while printing hudi properties in DeltaStreamer
lokeshj1703 commented on code in PR #8027: URL: https://github.com/apache/hudi/pull/8027#discussion_r1116737680 ## hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieDeltaStreamer.java: ## @@ -528,7 +533,10 @@ public String toString() { } } - private static String toSortedTruncatedString(TypedProperties props) { + static String toSortedTruncatedString(TypedProperties props) { +List sensitiveConfigList = props.getStringList(HoodieWriteConfig.SENSITIVE_CONFIG_KEYS_FILTER.key(), Review Comment: We are iterating through all the elements, so I kept a list. I can change it to a LinkedHashSet if we want to avoid duplicates. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] lokeshj1703 commented on a diff in pull request #8027: [HUDI-5838] Mask sensitive info while printing hudi properties in DeltaStreamer
lokeshj1703 commented on code in PR #8027: URL: https://github.com/apache/hudi/pull/8027#discussion_r1116737164 ## hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java: ## @@ -620,6 +620,12 @@ public class HoodieWriteConfig extends HoodieConfig { .withDocumentation("Whether to enable commit conflict checking or not during early " + "conflict detection."); + public static final ConfigProperty SENSITIVE_CONFIG_KEYS_FILTER = ConfigProperty + .key("hoodie.sensitive.config.keys") + .defaultValue("ssl,tls,sasl,auth,credentials") + .withDocumentation("Comma separated list of filters for sensitive config keys. Delta Streamer " + + "avoids printing any configurations which contains the configured filter."); Review Comment: Addressed in latest commit. ## hudi-utilities/src/test/java/org/apache/hudi/utilities/deltastreamer/TestHoodieDeltaStreamer.java: ## @@ -2319,6 +2319,22 @@ public void testDeletePartitions() throws Exception { TestHelpers.assertNoPartitionMatch(tableBasePath, sqlContext, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH); } + @Test + public void testToSortedTruncatedStringSecretsMasked() { +TypedProperties props = +new DFSPropertiesConfiguration(fs.getConf(), new Path(basePath + "/" + PROPS_FILENAME_TEST_SOURCE)).getProps(); +props.put("ssl.trustore.location", "SSL SECRET KEY"); +props.put("sasl.jaas.config", "SASL SECRET KEY"); +props.put("auth.credentials", "AUTH CREDENTIALS"); +props.put("auth.user.info", "AUTH USER INFO"); + +String truncatedKeys = HoodieDeltaStreamer.toSortedTruncatedString(props); +assertFalse(truncatedKeys.contains("SSL SECRET KEY")); Review Comment: I think we can have both. Added that check as well. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[jira] [Updated] (HUDI-5846) throw ClassCastException while run_bootstrap
[ https://issues.apache.org/jira/browse/HUDI-5846?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] lvyanquan updated HUDI-5846: Description: throw ClassCastException while run_bootstrap for MERGE_ON_READ table type h4. *version* Spark 3.3 & hudi 0.13 *error message* {code:java} Caused by: java.lang.ClassCastException: java.util.HashMap cannot be cast to org.apache.hudi.table.BulkInsertPartitioner at org.apache.hudi.table.action.commit.SparkBulkInsertHelper.bulkInsert(SparkBulkInsertHelper.java:77) at org.apache.hudi.table.action.deltacommit.SparkBulkInsertDeltaCommitActionExecutor.execute(SparkBulkInsertDeltaCommitActionExecutor.java:60) {code} *how to reproduce* Spark SQL {code:java} create table hive_table ( id int, ts int ) stored as parquet; insert into hive_table values (1, 1); create table hudi_mor ( id int, ts int ) using hudi tblproperties ( type = 'mor', primaryKey = 'id', preCombineField = 'ts' ); call run_bootstrap(table => 'hudi_mor', table_type => 'MERGE_ON_READ', bootstrap_path => 'hdfs://ns1/dtInsight/hive/warehouse/kunni.db/hive_table', base_path => 'hdfs://ns1/dtInsight/hive/warehouse/kunni.db/hudi_mor', rowKey_field => 'id', key_generator_class => 'org.apache.hudi.keygen.NonpartitionedKeyGenerator', bootstrap_overwrite => true, selector_class=> 'org.apache.hudi.client.bootstrap.selector.FullRecordBootstrapModeSelector'); {code} {*}cause{*}{*}{*} org.apache.hudi.table.action.bootstrap.SparkBootstrapDeltaCommitActionExecutor#getBulkInsertActionExecutor method used wrong constructor of SparkBulkInsertDeltaCommitActionExecutor class. was: throw ClassCastException while run_bootstrap for MERGE_ON_READ table type h4. 
*version* Spark 3.3 & hudi 0.13 *error message* {code:java} Caused by: java.lang.ClassCastException: java.util.HashMap cannot be cast to org.apache.hudi.table.BulkInsertPartitioner at org.apache.hudi.table.action.commit.SparkBulkInsertHelper.bulkInsert(SparkBulkInsertHelper.java:77) at org.apache.hudi.table.action.deltacommit.SparkBulkInsertDeltaCommitActionExecutor.execute(SparkBulkInsertDeltaCommitActionExecutor.java:60) {code} *how to reproduce* Spark SQL {code:java} create table hive_table ( id int, ts int ) stored as parquet; insert into hive_table values (1, 1); create table hudi_mor ( id int, ts int ) using hudi tblproperties ( type = 'mor', primaryKey = 'id', preCombineField = 'ts' ); call run_bootstrap(table => 'hudi_mor', table_type => 'MERGE_ON_READ', bootstrap_path => 'hdfs://ns1/dtInsight/hive/warehouse/kunni.db/hive_table', base_path => 'hdfs://ns1/dtInsight/hive/warehouse/kunni.db/hudi_mor', rowKey_field => 'id', key_generator_class => 'org.apache.hudi.keygen.NonpartitionedKeyGenerator', bootstrap_overwrite => true, selector_class=> 'org.apache.hudi.client.bootstrap.selector.FullRecordBootstrapModeSelector'); {code} {*}cause{*}{*}{*} org.apache.hudi.table.action.bootstrap.SparkBootstrapDeltaCommitActionExecutor#getBulkInsertActionExecutor method used wrong constructor of SparkBulkInsertDeltaCommitActionExecutor class. > throw ClassCastException while run_bootstrap > -- > > Key: HUDI-5846 > URL: https://issues.apache.org/jira/browse/HUDI-5846 > Project: Apache Hudi > Issue Type: Bug > Components: bootstrap, spark >Reporter: lvyanquan >Priority: Trivial > > throw ClassCastException while run_bootstrap for MERGE_ON_READ table type > h4. 
*version* > Spark 3.3 & hudi 0.13 > *error message* > {code:java} > Caused by: java.lang.ClassCastException: java.util.HashMap cannot be cast to > org.apache.hudi.table.BulkInsertPartitioner > at > org.apache.hudi.table.action.commit.SparkBulkInsertHelper.bulkInsert(SparkBulkInsertHelper.java:77) > at > org.apache.hudi.table.action.deltacommit.SparkBulkInsertDeltaCommitActionExecutor.execute(SparkBulkInsertDeltaCommitActionExecutor.java:60) > {code} > > *how to reproduce* > Spark SQL > {code:java} > create table hive_table ( > id int, > ts int > ) stored as parquet; > insert into hive_table values (1, 1); > create table hudi_mor ( > id int, > ts int > ) using hudi > tblproperties ( > type = 'mor', > primaryKey = 'id', > preCombineField = 'ts' > ); > call run_bootstrap(table => 'hudi_mor', table_type => 'MERGE_ON_READ', > bootstrap_path => 'hdfs://ns1/dtInsight/hive/warehouse/kunni.db/hive_table', > base_path => 'hdfs://ns1/dtInsight/hive/warehouse/kunni.db/hudi_mor', > rowKey_field => 'id', key_generator_class => > 'org.apache.hudi.keygen.NonpartitionedKeyGenerator', bootstrap_overwrite => > true, selector_class=> > 'org.apache.hudi.client.bootstrap.selector.FullRecordBootstrapModeSelector'); > {code} > > {*}cause{*}{*}{*} >