[GitHub] [hudi] hudi-bot commented on pull request #9365: [HUDI-6646] Add default lock provider for spark offline compaction an…
hudi-bot commented on PR #9365: URL: https://github.com/apache/hudi/pull/9365#issuecomment-1666415439

## CI report:

* 5599728990aa2b2d7e6dae8ef5c54196e94c773e Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19096)
[GitHub] [hudi] hudi-bot commented on pull request #9371: [HUDI-6647] Expand Hudi Java Client Functionality
hudi-bot commented on PR #9371: URL: https://github.com/apache/hudi/pull/9371#issuecomment-1666415453

## CI report:

* 0e39684d85c18aef48131ba838ffc63e48b5fcf2 UNKNOWN
[GitHub] [hudi] hudi-bot commented on pull request #9261: [HUDI-6579] Adding support for upsert and deletes with spark datasource for pk less table
hudi-bot commented on PR #9261: URL: https://github.com/apache/hudi/pull/9261#issuecomment-1666415390

## CI report:

* 3968bf360001624911a6c04b9a3c44af36a4dbf7 Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19076)
* c1977e21cdf02d75158bd3dec3b335e27755915c Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19102)
[GitHub] [hudi] codope commented on a diff in pull request #9345: [HUDI-6649] Fix column stat based data filtering for MOR
codope commented on code in PR #9345: URL: https://github.com/apache/hudi/pull/9345#discussion_r1284982434

## hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieFileIndex.scala:

@@ -122,81 +125,120 @@ case class HoodieFileIndex(spark: SparkSession,
    * @return list of PartitionDirectory containing partition to base files mapping
    */
   override def listFiles(partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): Seq[PartitionDirectory] = {
+    // Prune the partition path by the partition filters
+    // NOTE: Non-partitioned tables are assumed to consist from a single partition
+    //       encompassing the whole table
+    val partitionsAndFileSlices = getFileSlicesForPrunedPartitions(partitionFilters)
+    val listedPartitions = filterFileSlices(dataFilters, partitionsAndFileSlices).map {
+      case (partition, fileSlices) =>
+        val allCandidateFiles: Seq[FileStatus] = fileSlices.flatMap(fs => {
+          val baseFileStatusOpt = getBaseFileStatus(Option.apply(fs.getBaseFile.orElse(null)))
+          val logFilesStatus = if (includeLogFiles) {
+            fs.getLogFiles.map[FileStatus](JFunction.toJavaFunction[HoodieLogFile, FileStatus](lf => lf.getFileStatus))
+          } else {
+            java.util.stream.Stream.empty()
+          }
+          val files = logFilesStatus.collect(Collectors.toList[FileStatus]).asScala
+          baseFileStatusOpt.foreach(f => files.append(f))
+          files
+        })
+
+        PartitionDirectory(InternalRow.fromSeq(partition.get.values), allCandidateFiles)
+    }
+
+    hasPushedDownPartitionPredicates = true
+
+    if (shouldReadAsPartitionedTable()) {
+      listedPartitions
+    } else {
+      Seq(PartitionDirectory(InternalRow.empty, listedPartitions.flatMap(_.files)))
+    }
+  }
+
+  def filterFileSlices(dataFilters: Seq[Expression], partitionAndFileSlices: Seq[(Option[BaseHoodieTableFileIndex.PartitionPath], Seq[FileSlice])])
+    : Seq[(Option[BaseHoodieTableFileIndex.PartitionPath], Seq[FileSlice])] = {
     // Look up candidate files names in the col-stats index, if all of the following conditions are true
     //    - Data-skipping is enabled
     //    - Col-Stats Index is present
     //    - List of predicates (filters) is present
     val candidateFilesNamesOpt: Option[Set[String]] =
-      lookupCandidateFilesInMetadataTable(dataFilters) match {
-        case Success(opt) => opt
-        case Failure(e) =>
-          logError("Failed to lookup candidate files in File Index", e)
-
-          spark.sqlContext.getConf(DataSkippingFailureMode.configName, DataSkippingFailureMode.Fallback.value) match {
-            case DataSkippingFailureMode.Fallback.value => Option.empty
-            case DataSkippingFailureMode.Strict.value => throw new HoodieException(e);
-          }
-      }
+      lookupCandidateFilesInMetadataTable(dataFilters) match {
+        case Success(opt) => opt
+        case Failure(e) =>
+          logError("Failed to lookup candidate files in File Index", e)
+
+          spark.sqlContext.getConf(DataSkippingFailureMode.configName, DataSkippingFailureMode.Fallback.value) match {
+            case DataSkippingFailureMode.Fallback.value => Option.empty
+            case DataSkippingFailureMode.Strict.value => throw new HoodieException(e);
+          }
+      }
 
     logDebug(s"Overlapping candidate files from Column Stats Index: ${candidateFilesNamesOpt.getOrElse(Set.empty)}")
 
-    var totalFileSize = 0
-    var candidateFileSize = 0
-
-    // Prune the partition path by the partition filters
-    // NOTE: Non-partitioned tables are assumed to consist from a single partition
-    //       encompassing the whole table
-    val prunedPartitions = listMatchingPartitionPaths(partitionFilters)
-    val listedPartitions = getInputFileSlices(prunedPartitions: _*).asScala.toSeq.map {
-      case (partition, fileSlices) =>
-        val baseFileStatuses: Seq[FileStatus] =
-          getBaseFileStatus(fileSlices
-            .asScala
-            .map(fs => fs.getBaseFile.orElse(null))
-            .filter(_ != null))
+    var totalFileSliceSize = 0
+    var candidateFileSliceSize = 0
+    val listedPartitions = partitionAndFileSlices.map {
+      case (partitionOpt, fileSlices) =>
         // Filter in candidate files based on the col-stats index lookup
-        val candidateFiles = baseFileStatuses.filter(fs =>
-          // NOTE: This predicate is true when {@code Option} is empty
-          candidateFilesNamesOpt.forall(_.contains(fs.getPath.getName)))
+        val candidateFileSlices: Seq[FileSlice] = {
+          fileSlices.filter(fs => {
+            val baseFileStatusOpt = getBaseFileStatus(Option.apply(fs.getBaseFile.orElse(null)))
+            val logFiles = fs.getLogFiles.collect(Collectors.toSet[HoodieLogFile]).asScala.toSet[HoodieLogFile]
+            // NOTE: This predicate is true when {@code Option} is empty
+            if (candidateFilesNamesOpt.forall(files =>
[jira] [Updated] (HUDI-6647) Expand Hudi Java Client Functionality
[ https://issues.apache.org/jira/browse/HUDI-6647?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

ASF GitHub Bot updated HUDI-6647:
---------------------------------
    Labels: pull-request-available  (was: )

> Expand Hudi Java Client Functionality
> -------------------------------------
>
>                 Key: HUDI-6647
>                 URL: https://issues.apache.org/jira/browse/HUDI-6647
>             Project: Apache Hudi
>          Issue Type: Improvement
>            Reporter: Timothy Brown
>            Assignee: Timothy Brown
>            Priority: Major
>              Labels: pull-request-available
>
> With recent improvements to the abstractions in the Hudi codebase we can
> expand the functionality in the java client with a lower amount of effort by
> moving common code into the base client and table services.
[GitHub] [hudi] the-other-tim-brown opened a new pull request, #9371: [HUDI-6647] Expand Hudi Java Client Functionality
the-other-tim-brown opened a new pull request, #9371:
URL: https://github.com/apache/hudi/pull/9371

### Change Logs

Moves common logic into BaseHoodieTableServiceClient and BaseHoodieWriteClient. Adds support for the simple index in the Java client by relying on existing classes and helpers.

### Impact

This enables other features for Java client users and moves us towards more shared code, which means fixes and improvements don't need to be applied across three clients in the future.

### Risk level (write none, low medium or high below)

low
[GitHub] [hudi] hudi-bot commented on pull request #9345: [HUDI-6649] Fix column stat based data filtering for MOR
hudi-bot commented on PR #9345: URL: https://github.com/apache/hudi/pull/9345#issuecomment-1666413441

## CI report:

* 08eeb5b214d2ac2fe81011fbcf990c513e06e1b7 Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19094)
[GitHub] [hudi] hudi-bot commented on pull request #9261: [HUDI-6579] Adding support for upsert and deletes with spark datasource for pk less table
hudi-bot commented on PR #9261: URL: https://github.com/apache/hudi/pull/9261#issuecomment-1666413395

## CI report:

* 3968bf360001624911a6c04b9a3c44af36a4dbf7 Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19076)
* c1977e21cdf02d75158bd3dec3b335e27755915c UNKNOWN
[GitHub] [hudi] lokeshj1703 commented on pull request #9345: [HUDI-6649] Fix column stat based data filtering for MOR
lokeshj1703 commented on PR #9345: URL: https://github.com/apache/hudi/pull/9345#issuecomment-1666408258

@yihua I have summarized the approach.
[GitHub] [hudi] hudi-bot commented on pull request #9367: [HUDI-6648] Support building table views from existing files
hudi-bot commented on PR #9367: URL: https://github.com/apache/hudi/pull/9367#issuecomment-1666403172

## CI report:

* 0e4813a189457ef09caf85dca562698f6ba46a4e Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19092)
* 3da7a8536e1c4bb4a7450d59b0ad32e9ed048c20 Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19101)
[GitHub] [hudi] hudi-bot commented on pull request #9367: [HUDI-6648] Support building table views from existing files
hudi-bot commented on PR #9367: URL: https://github.com/apache/hudi/pull/9367#issuecomment-1666400574

## CI report:

* 0e4813a189457ef09caf85dca562698f6ba46a4e Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19092)
* 3da7a8536e1c4bb4a7450d59b0ad32e9ed048c20 UNKNOWN
[GitHub] [hudi] codope commented on a diff in pull request #9345: [HUDI-6649] Fix column stat based data filtering for MOR
codope commented on code in PR #9345: URL: https://github.com/apache/hudi/pull/9345#discussion_r1284972391

## hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieFileIndex.scala:

@@ -122,86 +141,132 @@ case class HoodieFileIndex(spark: SparkSession,
    * @return list of PartitionDirectory containing partition to base files mapping
    */
   override def listFiles(partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): Seq[PartitionDirectory] = {
-    // Look up candidate files names in the col-stats index, if all of the following conditions are true
-    //    - Data-skipping is enabled
-    //    - Col-Stats Index is present
-    //    - List of predicates (filters) is present
-    val candidateFilesNamesOpt: Option[Set[String]] =
+    // Prune the partition path by the partition filters
+    // NOTE: Non-partitioned tables are assumed to consist from a single partition
+    //       encompassing the whole table
+    val partitionsAndFileSlices = getFileSlicesForPrunedPartitions(partitionFilters)
+    val partitionsAndFilteredFileSlices = filterFileSlices(dataFilters, partitionsAndFileSlices).map {
+      case (partitionOpt, fileSlices) =>
+        val allCandidateFiles: Seq[FileStatus] = fileSlices.flatMap(fs => {
+          val baseFileStatusOpt = getBaseFileStatus(Option.apply(fs.getBaseFile.orElse(null)))
+          val logFilesStatus = if (includeLogFiles) {
+            fs.getLogFiles.map[FileStatus](JFunction.toJavaFunction[HoodieLogFile, FileStatus](lf => lf.getFileStatus))
+          } else {
+            java.util.stream.Stream.empty()
+          }
+          val files = logFilesStatus.collect(Collectors.toList[FileStatus]).asScala
+          baseFileStatusOpt.foreach(f => files.append(f))
+          files
+        })
+
+        PartitionDirectory(InternalRow.fromSeq(partitionOpt.get.values), allCandidateFiles)
+    }
+
+    hasPushedDownPartitionPredicates = true
+
+    if (shouldReadAsPartitionedTable()) {
+      partitionsAndFilteredFileSlices
+    } else {
+      Seq(PartitionDirectory(InternalRow.empty, partitionsAndFilteredFileSlices.flatMap(_.files)))
+    }
+  }
+
+  def filterFileSlices(dataFilters: Seq[Expression], partitionAndFileSlices: Seq[(Option[BaseHoodieTableFileIndex.PartitionPath], Seq[FileSlice])])
+    : Seq[(Option[BaseHoodieTableFileIndex.PartitionPath], Seq[FileSlice])] = {
+    // If there are no data filters, return all the file slices.
+    // If there are no file slices, return empty list.
+    if (partitionAndFileSlices.isEmpty || dataFilters.isEmpty) {
+      partitionAndFileSlices
+    } else {
+      // Look up candidate files names in the col-stats index, if all of the following conditions are true
+      //    - Data-skipping is enabled
+      //    - Col-Stats Index is present
+      //    - List of predicates (filters) is present
+      val candidateFilesNamesOpt: Option[Set[String]] =

Review Comment:
Can you point me to where the file slices are being filtered based on column stats? The `prunedCandidateFileNames` in `lookupCandidateFilesInMetadataTable` will return a set of base files and log files. My understanding is that we then need to check whether any of these pruned files are part of the file slices returned by `getAllFiles()`. If so, take the complete file slice as a candidate; if none match, skip that file slice.
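To make the slice-level rule codope describes concrete: a file slice stays a candidate if any of its base or log files survives the col-stats pruning. The Java sketch below is hypothetical — the names and types are illustrative stand-ins, not Hudi's actual API:

```java
import java.util.List;
import java.util.Set;

// Illustrative stand-in for a Hudi FileSlice: one optional base file plus log files.
final class FileSliceSketch {
  final String baseFileName;       // null for log-only slices
  final List<String> logFileNames;

  FileSliceSketch(String baseFileName, List<String> logFileNames) {
    this.baseFileName = baseFileName;
    this.logFileNames = logFileNames;
  }

  // Keep the complete slice if any member file appears in the pruned candidate set.
  boolean isCandidate(Set<String> candidateFileNames) {
    if (baseFileName != null && candidateFileNames.contains(baseFileName)) {
      return true;
    }
    return logFileNames.stream().anyMatch(candidateFileNames::contains);
  }
}
```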
[GitHub] [hudi] codope commented on a diff in pull request #9370: [HUDI-6650] Do not set default preCombine field for Flink table config
codope commented on code in PR #9370: URL: https://github.com/apache/hudi/pull/9370#discussion_r1284971137

## hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/configuration/OptionsResolver.java:

@@ -126,6 +126,17 @@ public static String getPreCombineField(Configuration conf) {
     return preCombineField.equals(FlinkOptions.NO_PRE_COMBINE) ? null : preCombineField;
   }
 
+  /**
+   * Returns the preCombine field
+   * or null if the value is set as {@link FlinkOptions#NO_PRE_COMBINE} or the user does not config it explicitly.
+   */
+  public static String getPreCombineFieldNoDefaultValue(Configuration conf) {
+    if (!conf.contains(FlinkOptions.PRECOMBINE_FIELD)) {
+      return null;

Review Comment:
Better to return `Option`?
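For illustration, a minimal sketch of the `Option`-returning variant codope suggests, built on Hudi's `org.apache.hudi.common.util.Option` and the `FlinkOptions` constants from the diff above — a sketch, not the merged implementation:

```java
import org.apache.flink.configuration.Configuration;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.configuration.FlinkOptions;

public class OptionsResolverSketch {
  /** Returns the preCombine field, or Option.empty() if unset or explicitly disabled. */
  public static Option<String> getPreCombineFieldOpt(Configuration conf) {
    if (!conf.contains(FlinkOptions.PRECOMBINE_FIELD)) {
      // The user never configured it, so no default is applied.
      return Option.empty();
    }
    String preCombineField = conf.getString(FlinkOptions.PRECOMBINE_FIELD);
    return FlinkOptions.NO_PRE_COMBINE.equals(preCombineField)
        ? Option.empty()
        : Option.of(preCombineField);
  }
}
```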
[GitHub] [hudi] the-other-tim-brown commented on a diff in pull request #9367: [HUDI-6648] Support building table views from existing files
the-other-tim-brown commented on code in PR #9367: URL: https://github.com/apache/hudi/pull/9367#discussion_r1284971042

## hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java:

@@ -324,33 +325,45 @@ public static HoodieRecord createPartitionListRecord(List
    * @param partition    The name of the partition
    * @param filesAdded   Mapping of files to their sizes for files which have been added to this partition
    * @param filesDeleted List of files which have been deleted from this partition
+   * @param instantTime  Commit time of the commit responsible for adding and/or deleting these files, will be empty during bootstrapping of the metadata table
    */
   public static HoodieRecord createPartitionFilesRecord(String partition,
-                                                        Option<Map<String, Long>> filesAdded,
-                                                        Option<List<String>> filesDeleted) {
-    Map<String, HoodieMetadataFileInfo> fileInfo = new HashMap<>();
-    filesAdded.ifPresent(filesMap ->
-        fileInfo.putAll(
-            filesMap.entrySet().stream().collect(
-                Collectors.toMap(Map.Entry::getKey, (entry) -> {
-                  long fileSize = entry.getValue();
-                  // Assert that the file-size of the file being added is positive, since Hudi
-                  // should not be creating empty files
-                  checkState(fileSize > 0);
-                  return new HoodieMetadataFileInfo(fileSize, false);
-                })))
-    );
-    filesDeleted.ifPresent(filesList ->
-        fileInfo.putAll(
-            filesList.stream().collect(
-                Collectors.toMap(Function.identity(), (ignored) -> new HoodieMetadataFileInfo(0L, true))))
-    );
+                                                        Map<String, Long> filesAdded,
+                                                        List<String> filesDeleted,
+                                                        Option<String> instantTime) {
+    int size = filesAdded.size() + filesDeleted.size();
+    Map<String, HoodieMetadataFileInfo> fileInfo = new HashMap<>(size, 1);
+    filesAdded.forEach((fileName, fileSize) -> {
+      // Assert that the file-size of the file being added is positive, since Hudi
+      // should not be creating empty files
+      checkState(fileSize > 0);
+      fileInfo.put(handleFileName(fileName, instantTime), new HoodieMetadataFileInfo(fileSize, false));
+    });
+
+    filesDeleted.forEach(fileName -> fileInfo.put(handleFileName(fileName, instantTime), DELETE_FILE_METADATA));
 
     HoodieKey key = new HoodieKey(partition, MetadataPartitionType.FILES.getPartitionPath());
     HoodieMetadataPayload payload = new HoodieMetadataPayload(key.getRecordKey(), METADATA_TYPE_FILE_LIST, fileInfo);
     return new HoodieAvroRecord<>(key, payload);
   }
 
+  /**
+   * In the case where a file was created by something other than a Hudi writer, the file name will not contain the commit time. We will prefix the file name with hudiext_[commitTime] before storing
+   * in the metadata table. The constructor for {@link org.apache.hudi.common.model.HoodieBaseFile} will properly handle this prefix.
+   * @param fileName incoming file name
+   * @param commitTime time of the commit (will be empty during bootstrap operations)
+   * @return file name with commit time prefix if the input file name does not contain the commit time, otherwise returns the original input
+   */
+  private static String handleFileName(String fileName, Option<String> commitTime) {
+    return commitTime.map(commit -> {
+      if (fileName.contains(commit) || FSUtils.isLogFile(fileName)) {

Review Comment:
I've decided to pursue the approach in the other comment.
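For context, the truncated `handleFileName` above could plausibly complete as below; the `orElse` branch and the delegation to `ExternalFilePathUtil` are inferred from the javadoc, not taken from the merged code:

```java
private static String handleFileName(String fileName, Option<String> commitTime) {
  return commitTime.map(commit -> {
    // Hudi-written files already embed the commit time, and log files are
    // recognized by FSUtils, so both pass through unchanged.
    if (fileName.contains(commit) || FSUtils.isLogFile(fileName)) {
      return fileName;
    }
    return ExternalFilePathUtil.prefixExternalFileWithCommitTime(fileName, commit);
  }).orElse(fileName); // bootstrap: no commit time known, store the name as-is
}
```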
[GitHub] [hudi] codope closed pull request #8933: [HUDI-5329] Spark reads table error when Flink creates table without record key and primary key
codope closed pull request #8933: [HUDI-5329] Spark reads table error when Flink creates table without record key and primary key
URL: https://github.com/apache/hudi/pull/8933
[GitHub] [hudi] the-other-tim-brown commented on a diff in pull request #9367: [HUDI-6648] Support building table views from existing files
the-other-tim-brown commented on code in PR #9367: URL: https://github.com/apache/hudi/pull/9367#discussion_r1284970820

## hudi-common/src/main/java/org/apache/hudi/common/util/ExternalFilePathUtil.java:

@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.common.util;
+
+public class ExternalFilePathUtil {

Review Comment:
Added
[hudi] branch master updated: [HUDI-6639] Rename hoodie.sql.write.operation to hoodie.spark.sql.insert.into.operation (#9359)
This is an automated email from the ASF dual-hosted git repository.

codope pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git

The following commit(s) were added to refs/heads/master by this push:
     new 2fd72c9c2f5  [HUDI-6639] Rename hoodie.sql.write.operation to hoodie.spark.sql.insert.into.operation (#9359)

2fd72c9c2f5 is described below

commit 2fd72c9c2f5d65bbe8aec20f425815e580773f66
Author: Amrish Lal
AuthorDate: Fri Aug 4 22:19:01 2023 -0700

    [HUDI-6639] Rename hoodie.sql.write.operation to hoodie.spark.sql.insert.into.operation (#9359)
---
 .../scala/org/apache/hudi/DataSourceOptions.scala  | 14 ++--
 .../spark/sql/hudi/ProvidesHoodieConfig.scala      | 10 -
 .../apache/spark/sql/hudi/TestCDCForSparkSQL.scala |  5 +++--
 .../apache/spark/sql/hudi/TestCreateTable.scala    |  4 ++--
 .../sql/hudi/TestHoodieTableValuedFunction.scala   |  5 +++--
 .../apache/spark/sql/hudi/TestInsertTable.scala    | 26 +++---
 .../org/apache/spark/sql/hudi/TestSpark3DDL.scala  | 18 +++
 7 files changed, 42 insertions(+), 40 deletions(-)

diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala
index e8c247a7c03..59c2a60a3ad 100644
--- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala
+++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala
@@ -441,7 +441,7 @@ object DataSourceWriteOptions {
     .markAdvanced()
     .deprecatedAfter("0.14.0")
     .withDocumentation("When set to true, the sql insert statement will use bulk insert. " +
-      "This config is deprecated as of 0.14.0. Please use hoodie.sql.write.operation instead.")
+      "This config is deprecated as of 0.14.0. Please use hoodie.spark.sql.insert.into.operation instead.")
 
   @Deprecated
   val SQL_INSERT_MODE: ConfigProperty[String] = ConfigProperty
@@ -452,7 +452,7 @@
       "For upsert mode, insert statement do the upsert operation for the pk-table which will update the duplicate record." +
       "For strict mode, insert statement will keep the primary key uniqueness constraint which do not allow duplicate record." +
       "While for non-strict mode, hudi just do the insert operation for the pk-table. This config is deprecated as of 0.14.0. Please use " +
-      "hoodie.sql.write.operation and hoodie.datasource.insert.dup.policy as you see fit.")
+      "hoodie.spark.sql.insert.into.operation and hoodie.datasource.insert.dup.policy as you see fit.")
 
   val COMMIT_METADATA_KEYPREFIX: ConfigProperty[String] = ConfigProperty
     .key("hoodie.datasource.write.commitmeta.key.prefix")
@@ -528,13 +528,13 @@ object DataSourceWriteOptions {
   val MAKE_NEW_COLUMNS_NULLABLE: ConfigProperty[java.lang.Boolean] = HoodieCommonConfig.MAKE_NEW_COLUMNS_NULLABLE
 
-  val SQL_WRITE_OPERATION: ConfigProperty[String] = ConfigProperty
-    .key("hoodie.sql.write.operation")
-    .defaultValue("insert")
-    .withValidValues("bulk_insert","insert","upsert")
+  val SPARK_SQL_INSERT_INTO_OPERATION: ConfigProperty[String] = ConfigProperty
+    .key("hoodie.spark.sql.insert.into.operation")
+    .defaultValue(WriteOperationType.INSERT.value())
+    .withValidValues(WriteOperationType.BULK_INSERT.value(), WriteOperationType.INSERT.value(), WriteOperationType.UPSERT.value())
     .withDocumentation("Sql write operation to use with INSERT_INTO spark sql command. This comes with 3 possible values, bulk_insert, " +
       "insert and upsert. bulk_insert is generally meant for initial loads and is known to be performant compared to insert. But bulk_insert may not " +
-      "do small file managmeent. If you prefer hudi to automatically managee small files, then you can go with \"insert\". There is no precombine " +
+      "do small file management. If you prefer hudi to automatically manage small files, then you can go with \"insert\". There is no precombine " +
       "(if there are duplicates within the same batch being ingested, same dups will be ingested) with bulk_insert and insert and there is no index " +
       "look up as well. If you may use INSERT_INTO for mutable dataset, then you may have to set this config value to \"upsert\". With upsert, you will " +
       "get both precombine and updates to existing records on storage is also honored. If not, you may see duplicates. ")

diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/ProvidesHoodieConfig.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/ProvidesHoodieConfig.scala
index c66dcc19549..f85032790dd 100644
--- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/ProvidesHoodieConfig.scala
+++ b/hudi-spark-datasource
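For readers following the rename, a minimal, hedged usage sketch of the new key from the Spark Java API (the table name and values are illustrative):

```java
import org.apache.spark.sql.SparkSession;

public class InsertIntoOperationExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("hudi-insert-into-operation")
        .master("local[*]")
        .getOrCreate();
    // Choose which write operation INSERT INTO maps to: bulk_insert, insert, or upsert.
    spark.sql("SET hoodie.spark.sql.insert.into.operation=upsert");
    spark.sql("INSERT INTO hudi_table VALUES (1, 'a', 1000)");
    spark.stop();
  }
}
```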
[GitHub] [hudi] codope merged pull request #9359: [HUDI-6639] Rename hoodie.sql.write.operation to hoodie.spark.sql.insert.into.operation
codope merged PR #9359: URL: https://github.com/apache/hudi/pull/9359
[GitHub] [hudi] the-other-tim-brown commented on a diff in pull request #9367: [HUDI-6648] Support building table views from existing files
the-other-tim-brown commented on code in PR #9367: URL: https://github.com/apache/hudi/pull/9367#discussion_r1284969268

## hudi-common/src/main/java/org/apache/hudi/common/util/ExternalFilePathUtil.java:

@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.common.util;
+
+public class ExternalFilePathUtil {
+  private static final String EXTERNAL_FILE_PREFIX = "hudiext_";
+
+  public static String prefixExternalFileWithCommitTime(String fileName, String commitTime) {

Review Comment:
I think that may be misleading since we're not converting this to a traditional Hudi file path and this is only a fileName (not a full path).
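A plausible completion of the method body truncated above: the PR's javadoc says external files are prefixed with hudiext_[commitTime], but the exact separator between the commit time and the file name is an assumption here, not confirmed by the thread:

```java
public static String prefixExternalFileWithCommitTime(String fileName, String commitTime) {
  // Assumed layout: e.g. ("data.parquet", "20230805093000000")
  //   -> "hudiext_20230805093000000_data.parquet"
  return EXTERNAL_FILE_PREFIX + commitTime + "_" + fileName;
}
```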
[GitHub] [hudi] hudi-bot commented on pull request #9370: [HUDI-6650] Do not set default preCombine field for Flink table config
hudi-bot commented on PR #9370: URL: https://github.com/apache/hudi/pull/9370#issuecomment-1666389367

## CI report:

* 241f148dd3740636568416a5f7d37a9cd329e951 Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19099)
[GitHub] [hudi] hudi-bot commented on pull request #9370: [HUDI-6650] Do not set default preCombine field for Flink table config
hudi-bot commented on PR #9370: URL: https://github.com/apache/hudi/pull/9370#issuecomment-1666387643

## CI report:

* 241f148dd3740636568416a5f7d37a9cd329e951 UNKNOWN
[GitHub] [hudi] hudi-bot commented on pull request #9366: [HUDI-6476][FOLLOW-UP] Path filter by FileStatus to avoid additional fs request
hudi-bot commented on PR #9366: URL: https://github.com/apache/hudi/pull/9366#issuecomment-1666387633

## CI report:

* f26271b49440962d10ad12ba619d3caf12a6091d Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19083)
* 849fd2cea9047912250cddea3b5bad288b4f1661 Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19097)
[GitHub] [hudi] hudi-bot commented on pull request #9365: [HUDI-6646] Add default lock provider for spark offline compaction an…
hudi-bot commented on PR #9365: URL: https://github.com/apache/hudi/pull/9365#issuecomment-1666385863

## CI report:

* 24a796b7d68ed9b0fced46a0bf297ac1ebf21c13 Azure: [CANCELED](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19095)
* 5599728990aa2b2d7e6dae8ef5c54196e94c773e Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19096)
[GitHub] [hudi] hudi-bot commented on pull request #9366: [HUDI-6476][FOLLOW-UP] Path filter by FileStatus to avoid additional fs request
hudi-bot commented on PR #9366: URL: https://github.com/apache/hudi/pull/9366#issuecomment-1666385872

## CI report:

* f26271b49440962d10ad12ba619d3caf12a6091d Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19083)
* 849fd2cea9047912250cddea3b5bad288b4f1661 UNKNOWN
[GitHub] [hudi] hudi-bot commented on pull request #9367: [HUDI-6648] Support building table views from existing files
hudi-bot commented on PR #9367: URL: https://github.com/apache/hudi/pull/9367#issuecomment-1666385880

## CI report:

* 0e4813a189457ef09caf85dca562698f6ba46a4e Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19092)
[hudi] branch asf-site updated: [DOCS][HUDI-6536] Add docs on table version change in 0.11.0 (#9206)
This is an automated email from the ASF dual-hosted git repository.

sivabalan pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/hudi.git

The following commit(s) were added to refs/heads/asf-site by this push:
     new c44da42339d  [DOCS][HUDI-6536] Add docs on table version change in 0.11.0 (#9206)

c44da42339d is described below

commit c44da42339dbe513e1c32fc1aaa3de8bc8f61309
Author: Y Ethan Guo
AuthorDate: Fri Aug 4 21:36:43 2023 -0700

    [DOCS][HUDI-6536] Add docs on table version change in 0.11.0 (#9206)
---
 website/releases/release-0.11.0.md | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/website/releases/release-0.11.0.md b/website/releases/release-0.11.0.md
index 96d39035453..9e11f1f5861 100644
--- a/website/releases/release-0.11.0.md
+++ b/website/releases/release-0.11.0.md
@@ -12,6 +12,12 @@ import TabItem from '@theme/TabItem';
 
 ## Migration Guide
 
+With 0.11.0, we have added a checksum mechanism for validating the `hoodie.proerties`, which introduces a new table version, `4`.
+  Whenever a Hudi job is launched with this release on a table with older table version, an upgrade step is executed automatically to upgrade the table to table version `4`.
+  This automatic upgrade step happens just once per Hudi table as the hoodie.table.version will be updated in property file after upgrade is completed.
+  Similarly, a command line tool for Downgrading (command - downgrade) is added if in case some users want to downgrade Hudi
+  from table version `4` to `3` or move from Hudi 0.11.0 to pre 0.11.0. This needs to be executed from a 0.11.0 hudi-cli binary/script.
+
 ### Bundle usage updates
 
 - Spark bundle for 3.0.x is no longer officially supported. Users are encouraged to upgrade to Spark 3.2 or 3.1.
[GitHub] [hudi] nsivabalan merged pull request #9206: [DOCS][HUDI-6536] Add docs on table version change in 0.11.0
nsivabalan merged PR #9206: URL: https://github.com/apache/hudi/pull/9206
[hudi] branch master updated (35be9bbbc7e -> 84030beb863)
This is an automated email from the ASF dual-hosted git repository.

sivabalan pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git

    from 35be9bbbc7e  [HUDI-6324] Fixing deleting of MDT index (#9248)
     add 84030beb863  [MINOR] Fix typo in config docs (#9295)

No new revisions were added by this update.

Summary of changes:
 .../src/main/java/org/apache/hudi/common/config/ConfigGroups.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
[GitHub] [hudi] nsivabalan merged pull request #9295: [MINOR] Fix typo in config docs
nsivabalan merged PR #9295: URL: https://github.com/apache/hudi/pull/9295
[hudi] branch asf-site updated: [DOCS] Improve Developer Setup page (#9208)
This is an automated email from the ASF dual-hosted git repository.

sivabalan pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/hudi.git

The following commit(s) were added to refs/heads/asf-site by this push:
     new ca8c6879f75  [DOCS] Improve Developer Setup page (#9208)

ca8c6879f75 is described below

commit ca8c6879f75c597257e6078f6fd6fc94311fa540
Author: Y Ethan Guo
AuthorDate: Fri Aug 4 21:35:13 2023 -0700

    [DOCS] Improve Developer Setup page (#9208)
---
 website/contribute/developer-setup.md | 18 ++
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/website/contribute/developer-setup.md b/website/contribute/developer-setup.md
index 5ca759250d6..4b80cee3e15 100644
--- a/website/contribute/developer-setup.md
+++ b/website/contribute/developer-setup.md
@@ -20,32 +20,34 @@ To contribute code, you need
 
 - (Recommended) Join our dev mailing list & slack channel, listed on [community](/community/get-involved) page.
 
-## IDE Setup
+## IntelliJ Setup
 
-To contribute, you would need to do the following
+IntelliJ is the recommended IDE for developing Hudi. To contribute, you would need to do the following
 
-- Fork the Hudi code on Github & then clone your own fork locally. Once cloned, we recommend building as per instructions on [spark quickstart](/docs/quick-start-guide) or [flink quickstart](/docs/flink-quick-start-guide)
+- Fork the Hudi code on Github & then clone your own fork locally. Once cloned, we recommend building as per instructions on [spark quickstart](/docs/quick-start-guide) or [flink quickstart](/docs/flink-quick-start-guide).
+
+- In IntelliJ, select `File` > `New` > `Project from Existing Sources...` and select the `pom.xml` file under your local Hudi source folder.
 
-- In `Project Structure`, select Java 1.8 as the Project SDK
+- In `Project Structure`, select Java 1.8 as the Project SDK.
 
 ![IDE setup java](/assets/images/contributing/IDE_setup_java.png)
 
 - Make the following configuration in `Preferences` or `Settings` in newer IntelliJ so the Hudi code can compile in the IDE:
-  * Enable annotation processing in compiler
+  * Enable annotation processing in compiler.
 
 ![IDE setup annotation processing](/assets/images/contributing/IDE_setup_annotation.png)
 
-  * Configure Maven *NOT* to delegate IDE build/run actions to Maven so you can run tests in IntelliJ directly
+  * Configure Maven *NOT* to delegate IDE build/run actions to Maven so you can run tests in IntelliJ directly.
 
 ![IDE setup maven 1](/assets/images/contributing/IDE_setup_maven_1.png)
 ![IDE setup maven 2](/assets/images/contributing/IDE_setup_maven_2.png)
 
 - If you switch maven build profile, e.g., from Spark 3.2 to Spark 3.3, you need to first build Hudi in the command line first and `Reload All Maven Projects` in IntelliJ like below,
-so that IntelliJ re-indexes the code
+so that IntelliJ re-indexes the code.
 
 ![IDE setup reload](/assets/images/contributing/IDE_setup_reload.png)
 
 - [Recommended] We have embraced the code style largely based on [google format](https://google.github.io/styleguide/javaguide.html). Please set up your IDE with style files from [/style/](https://github.com/apache/hudi/tree/master/style). These instructions have been tested on IntelliJ.
 
-* Open Project Preferences in IDE
+* Open Project Preferences in IntelliJ
 * Install and activate CheckStyle plugin
 
 ![IDE_setup_checkstyle_1](/assets/images/contributing/IDE_setup_checkstyle_1.png)
[GitHub] [hudi] nsivabalan merged pull request #9208: [DOCS] Improve Developer Setup page
nsivabalan merged PR #9208: URL: https://github.com/apache/hudi/pull/9208
[jira] [Updated] (HUDI-6650) Do not set default preCombine field for Flink table config
[ https://issues.apache.org/jira/browse/HUDI-6650?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

ASF GitHub Bot updated HUDI-6650:
---------------------------------
    Labels: pull-request-available  (was: )

> Do not set default preCombine field for Flink table config
> -----------------------------------------------------------
>
>                 Key: HUDI-6650
>                 URL: https://issues.apache.org/jira/browse/HUDI-6650
>             Project: Apache Hudi
>          Issue Type: Bug
>          Components: flink
>            Reporter: Danny Chen
>            Priority: Major
>              Labels: pull-request-available
>             Fix For: 0.14.0
>
[GitHub] [hudi] danny0405 opened a new pull request, #9370: [HUDI-6650] Do not set default preCombine field for Flink table config
danny0405 opened a new pull request, #9370:
URL: https://github.com/apache/hudi/pull/9370

### Change Logs

Do not set the preCombine field when the user does not set it up explicitly. This behavior is kept in line with Spark.

### Impact

none.

### Risk level (write none, low medium or high below)

none
[GitHub] [hudi] hudi-bot commented on pull request #9369: [HUDI-6386] Fix flakey multiwriter tests
hudi-bot commented on PR #9369: URL: https://github.com/apache/hudi/pull/9369#issuecomment-1666376792

## CI report:

* 68ba8ae97df89a261298dbb06c80f43c57159969 UNKNOWN
* 2d7ced1d3723f9c6357378c3849ebca7e2db4904 Azure: [SUCCESS](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19090)
[GitHub] [hudi] hudi-bot commented on pull request #9345: [HUDI-6649] Fix column stat based data filtering for MOR
hudi-bot commented on PR #9345: URL: https://github.com/apache/hudi/pull/9345#issuecomment-1666376768

## CI report:

* 600d19c601ec8b224643b7e4c0a7c0f7a2e1e290 Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19089)
* 08eeb5b214d2ac2fe81011fbcf990c513e06e1b7 Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19094)
[GitHub] [hudi] hudi-bot commented on pull request #9365: [HUDI-6646] Add default lock provider for spark offline compaction an…
hudi-bot commented on PR #9365: URL: https://github.com/apache/hudi/pull/9365#issuecomment-1666376785

## CI report:

* 0aed2250937742799e04e42c511e7fa08a5b92a2 Azure: [CANCELED](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19093)
* 24a796b7d68ed9b0fced46a0bf297ac1ebf21c13 Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19095)
* 5599728990aa2b2d7e6dae8ef5c54196e94c773e UNKNOWN
[jira] [Created] (HUDI-6650) Do not set default preCombine field for Flink table config
Danny Chen created HUDI-6650:
--------------------------------

             Summary: Do not set default preCombine field for Flink table config
                 Key: HUDI-6650
                 URL: https://issues.apache.org/jira/browse/HUDI-6650
             Project: Apache Hudi
          Issue Type: Bug
          Components: flink
            Reporter: Danny Chen
             Fix For: 0.14.0
[GitHub] [hudi] hudi-bot commented on pull request #9365: [HUDI-6646] Add default lock provider for spark offline compaction an…
hudi-bot commented on PR #9365: URL: https://github.com/apache/hudi/pull/9365#issuecomment-1666375333

## CI report:

* d6ffde7bdc33f53357e131703071c5040793b715 Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19072)
* 0aed2250937742799e04e42c511e7fa08a5b92a2 Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19093)
* 24a796b7d68ed9b0fced46a0bf297ac1ebf21c13 UNKNOWN
[GitHub] [hudi] hudi-bot commented on pull request #9345: [HUDI-6649] Fix column stat based data filtering for MOR
hudi-bot commented on PR #9345: URL: https://github.com/apache/hudi/pull/9345#issuecomment-1666375320

## CI report:

* 600d19c601ec8b224643b7e4c0a7c0f7a2e1e290 Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19089)
* 08eeb5b214d2ac2fe81011fbcf990c513e06e1b7 UNKNOWN
[GitHub] [hudi] hudi-bot commented on pull request #9276: [HUDI-6635] Hudi Spark Integration Redesign MOR and Bootstrap reading
hudi-bot commented on PR #9276: URL: https://github.com/apache/hudi/pull/9276#issuecomment-1666373712

## CI report:

* 662f3b320ab6ea06462bad9a4448add1ec2f380a UNKNOWN
* def394b73203814bbb635841c5f07c216c0575cc Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19091)
[GitHub] [hudi] hudi-bot commented on pull request #9336: [HUDI-6629] - Changes for s3/gcs IncrSource job to taken into sourceLimit during ingestion
hudi-bot commented on PR #9336: URL: https://github.com/apache/hudi/pull/9336#issuecomment-1666358274

## CI report:

* 77d7b455ee5cd668a005f6f7e6f04135608f2b7a UNKNOWN
* ae20a8550d647dbaec9428f11bfac0751a2c871c Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19086)
[GitHub] [hudi] hudi-bot commented on pull request #9365: [HUDI-6646] Add default lock provider for spark offline compaction an…
hudi-bot commented on PR #9365: URL: https://github.com/apache/hudi/pull/9365#issuecomment-1666344460

## CI report:

* d6ffde7bdc33f53357e131703071c5040793b715 Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19072)
* 0aed2250937742799e04e42c511e7fa08a5b92a2 Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19093)
[GitHub] [hudi] hudi-bot commented on pull request #9365: [HUDI-6646] Add default lock provider for spark offline compaction an…
hudi-bot commented on PR #9365: URL: https://github.com/apache/hudi/pull/9365#issuecomment-1666340015

## CI report:

* d6ffde7bdc33f53357e131703071c5040793b715 Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19072)
* 0aed2250937742799e04e42c511e7fa08a5b92a2 UNKNOWN
[GitHub] [hudi] hudi-bot commented on pull request #8718: [HUDI-6214] Enabling compaction by default for batch writes with MOR table
hudi-bot commented on PR #8718: URL: https://github.com/apache/hudi/pull/8718#issuecomment-1666339426

## CI report:

* e15a030cdb08d00401b55e6d49cebcb6892f218e Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19087)
[GitHub] [hudi] hudi-bot commented on pull request #9359: [HUDI-6639] Rename hoodie.sql.write.operation to hoodie.spark.sql.insert.into.operation
hudi-bot commented on PR #9359: URL: https://github.com/apache/hudi/pull/9359#issuecomment-1666337335 ## CI report: * c695d8fc59ebdbf339f40ea31db465858ea83f98 Azure: [SUCCESS](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19085) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] xuzifu666 closed pull request #9364: [HUDI-6645] Relax the restriction for Spark MDT rollback
xuzifu666 closed pull request #9364: [HUDI-6645] Relax the restriction for Spark MDT rollback URL: https://github.com/apache/hudi/pull/9364 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] xuzifu666 commented on a diff in pull request #9364: [HUDI-6645] Relax the restriction for Spark MDT rollback
xuzifu666 commented on code in PR #9364: URL: https://github.com/apache/hudi/pull/9364#discussion_r1284932889 ## hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java: ## @@ -1022,22 +1020,6 @@ public void update(HoodieRollbackMetadata rollbackMetadata, String instantTime) } } - protected void validateRollback( Review Comment: this change is needed for compaction with Spark; otherwise it would report an error @yihua -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] danny0405 commented on a diff in pull request #9365: [HUDI-6646] Add default lock provider for spark offline compaction an…
danny0405 commented on code in PR #9365: URL: https://github.com/apache/hudi/pull/9365#discussion_r1284932402 ## hudi-utilities/src/main/java/org/apache/hudi/utilities/UtilHelpers.java: ## @@ -578,6 +580,20 @@ public static HoodieTableMetaClient createMetaClient( .build(); } + public static void addLockOptions(String basePath, TypedProperties props) { +if (!props.containsKey(HoodieLockConfig.LOCK_PROVIDER_CLASS_NAME.key())) { + HoodieLockConfig lockConfig = HoodieLockConfig.newBuilder() + .withLockProvider(FileSystemBasedLockProvider.class) + .withLockWaitTimeInMillis(2000L) // 2s + .withFileSystemLockExpire(1) // 1 minute + .withClientNumRetries(30) Review Comment: Not sure; I just kept the config the same as Flink's to make it more friendly to streaming jobs. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #9345: [HUDI-6649] Fix column stat based data filtering for MOR
hudi-bot commented on PR #9345: URL: https://github.com/apache/hudi/pull/9345#issuecomment-1666307069 ## CI report: * 600d19c601ec8b224643b7e4c0a7c0f7a2e1e290 Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19089) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] the-other-tim-brown commented on a diff in pull request #9367: [HUDI-6648] Support building table views from existing files
the-other-tim-brown commented on code in PR #9367: URL: https://github.com/apache/hudi/pull/9367#discussion_r1284923832 ## hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java: ## @@ -324,33 +325,45 @@ public static HoodieRecord createPartitionListRecord(List * @param partitionThe name of the partition * @param filesAdded Mapping of files to their sizes for files which have been added to this partition * @param filesDeleted List of files which have been deleted from this partition + * @param instantTime Commit time of the commit responsible for adding and/or deleting these files, will be empty during bootstrapping of the metadata table */ public static HoodieRecord createPartitionFilesRecord(String partition, - Option> filesAdded, - Option> filesDeleted) { -Map fileInfo = new HashMap<>(); -filesAdded.ifPresent(filesMap -> -fileInfo.putAll( -filesMap.entrySet().stream().collect( -Collectors.toMap(Map.Entry::getKey, (entry) -> { - long fileSize = entry.getValue(); - // Assert that the file-size of the file being added is positive, since Hudi - // should not be creating empty files - checkState(fileSize > 0); - return new HoodieMetadataFileInfo(fileSize, false); -}))) -); -filesDeleted.ifPresent(filesList -> -fileInfo.putAll( -filesList.stream().collect( -Collectors.toMap(Function.identity(), (ignored) -> new HoodieMetadataFileInfo(0L, true -); + Map filesAdded, + List filesDeleted, + Option instantTime) { +int size = filesAdded.size() + filesDeleted.size(); +Map fileInfo = new HashMap<>(size, 1); +filesAdded.forEach((fileName, fileSize) -> { + // Assert that the file-size of the file being added is positive, since Hudi + // should not be creating empty files + checkState(fileSize > 0); + fileInfo.put(handleFileName(fileName, instantTime), new HoodieMetadataFileInfo(fileSize, false)); +}); + +filesDeleted.forEach(fileName -> fileInfo.put(handleFileName(fileName, instantTime), DELETE_FILE_METADATA)); HoodieKey key = new HoodieKey(partition, MetadataPartitionType.FILES.getPartitionPath()); HoodieMetadataPayload payload = new HoodieMetadataPayload(key.getRecordKey(), METADATA_TYPE_FILE_LIST, fileInfo); return new HoodieAvroRecord<>(key, payload); } + /** + * In the case where a file was created by something other than a Hudi writer, the file name will not contain the commit time. We will prefix the file name with hudiext_[commitTime] before storing + * in the metadata table. The constructor for {@link org.apache.hudi.common.model.HoodieBaseFile} will properly handle this prefix. + * @param fileName incoming file name + * @param commitTime time of the commit (will be empty during bootstrap operations) + * @return file name with commit time prefix if the input file name does not contain the commit time, otherwise returns the original input + */ + private static String handleFileName(String fileName, Option commitTime) { Review Comment: Yes that's correct, see https://github.com/apache/hudi/pull/9367#discussion_r1284890274 for a slightly different approach. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] the-other-tim-brown commented on a diff in pull request #9367: [HUDI-6648] Support building table views from existing files
the-other-tim-brown commented on code in PR #9367: URL: https://github.com/apache/hudi/pull/9367#discussion_r1284923741 ## hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java: ## @@ -324,33 +325,45 @@ public static HoodieRecord createPartitionListRecord(List * @param partitionThe name of the partition * @param filesAdded Mapping of files to their sizes for files which have been added to this partition * @param filesDeleted List of files which have been deleted from this partition + * @param instantTime Commit time of the commit responsible for adding and/or deleting these files, will be empty during bootstrapping of the metadata table */ public static HoodieRecord createPartitionFilesRecord(String partition, - Option> filesAdded, - Option> filesDeleted) { -Map fileInfo = new HashMap<>(); -filesAdded.ifPresent(filesMap -> -fileInfo.putAll( -filesMap.entrySet().stream().collect( -Collectors.toMap(Map.Entry::getKey, (entry) -> { - long fileSize = entry.getValue(); - // Assert that the file-size of the file being added is positive, since Hudi - // should not be creating empty files - checkState(fileSize > 0); - return new HoodieMetadataFileInfo(fileSize, false); -}))) -); -filesDeleted.ifPresent(filesList -> -fileInfo.putAll( -filesList.stream().collect( -Collectors.toMap(Function.identity(), (ignored) -> new HoodieMetadataFileInfo(0L, true -); + Map filesAdded, + List filesDeleted, + Option instantTime) { +int size = filesAdded.size() + filesDeleted.size(); +Map fileInfo = new HashMap<>(size, 1); +filesAdded.forEach((fileName, fileSize) -> { + // Assert that the file-size of the file being added is positive, since Hudi + // should not be creating empty files + checkState(fileSize > 0); + fileInfo.put(handleFileName(fileName, instantTime), new HoodieMetadataFileInfo(fileSize, false)); +}); + +filesDeleted.forEach(fileName -> fileInfo.put(handleFileName(fileName, instantTime), DELETE_FILE_METADATA)); HoodieKey key = new HoodieKey(partition, MetadataPartitionType.FILES.getPartitionPath()); HoodieMetadataPayload payload = new HoodieMetadataPayload(key.getRecordKey(), METADATA_TYPE_FILE_LIST, fileInfo); return new HoodieAvroRecord<>(key, payload); } + /** + * In the case where a file was created by something other than a Hudi writer, the file name will not contain the commit time. We will prefix the file name with hudiext_[commitTime] before storing + * in the metadata table. The constructor for {@link org.apache.hudi.common.model.HoodieBaseFile} will properly handle this prefix. + * @param fileName incoming file name + * @param commitTime time of the commit (will be empty during bootstrap operations) + * @return file name with commit time prefix if the input file name does not contain the commit time, otherwise returns the original input + */ + private static String handleFileName(String fileName, Option commitTime) { +return commitTime.map(commit -> { + if (fileName.contains(commit) || FSUtils.isLogFile(fileName)) { Review Comment: Can you check out my notes on this and add to the conversation there? https://github.com/apache/hudi/pull/9367#discussion_r1284890274 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] the-other-tim-brown commented on a diff in pull request #9367: [HUDI-6648] Support building table views from existing files
the-other-tim-brown commented on code in PR #9367: URL: https://github.com/apache/hudi/pull/9367#discussion_r1284923366 ## hudi-common/src/main/java/org/apache/hudi/common/model/HoodieBaseFile.java: ## @@ -61,16 +62,39 @@ public HoodieBaseFile(String filePath) { public HoodieBaseFile(String filePath, BaseFile bootstrapBaseFile) { super(filePath); this.bootstrapBaseFile = Option.ofNullable(bootstrapBaseFile); -String[] fileIdAndCommitTime = getFileIdAndCommitTimeFromFileName(); +String[] fileIdAndCommitTime = getFileIdAndCommitTimeFromFileName(getFileName()); this.fileId = fileIdAndCommitTime[0]; this.commitTime = fileIdAndCommitTime[1]; } + public HoodieBaseFile(String filePath, String fileId, String commitTime, BaseFile bootstrapBaseFile) { +super(filePath); +this.bootstrapBaseFile = Option.ofNullable(bootstrapBaseFile); +this.fileId = fileId; +this.commitTime = commitTime; + } + + private HoodieBaseFile(FileStatus fileStatus, String[] fileIdAndCommitTime, BaseFile bootstrapBaseFile) { +this(fileStatus, fileIdAndCommitTime[0], fileIdAndCommitTime[1], bootstrapBaseFile); + } + + public HoodieBaseFile(FileStatus fileStatus, String fileId, String commitTime, BaseFile bootstrapBaseFile) { +super(handleExternallyGeneratedFileName(fileStatus, fileId)); Review Comment: No, you can see the usages in the PR are by other constructors and the DTO class. That means that a lot of paths will hit this code. We currently don't have any validations on file paths, just assumptions. Should we start adding that now? It should go in a separate patch if so. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
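For context on the HoodieBaseFile constructors discussed above: Hudi-written base files conventionally embed both identifiers in the name as <fileId>_<writeToken>_<instantTime>.<extension>, which is why fileId and commitTime can normally be recovered from the file name alone, and why externally created files cannot. A simplified sketch of that parse (the real logic in HoodieBaseFile/FSUtils handles write tokens and more edge cases):

    // Simplified illustration only, e.g. "a1b2c3_1-0-1_20230805103000.parquet"
    static String[] getFileIdAndCommitTimeFromFileName(String fileName) {
      String[] parts = fileName.split("_");
      String fileId = parts[0];
      String commitTime = parts[2].substring(0, parts[2].indexOf('.')); // strip ".parquet"
      return new String[] {fileId, commitTime};
    }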
[GitHub] [hudi] jonvex commented on a diff in pull request #9276: [HUDI-6635] Hudi Spark Integration Redesign MOR and Bootstrap reading
jonvex commented on code in PR #9276: URL: https://github.com/apache/hudi/pull/9276#discussion_r1284922051 ## hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/NewHoodieParquetFileFormat.scala: ## @@ -0,0 +1,353 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql.execution.datasources.parquet + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.hudi.DataSourceReadOptions.{REALTIME_PAYLOAD_COMBINE_OPT_VAL, REALTIME_SKIP_MERGE_OPT_VAL} +import org.apache.hudi.MergeOnReadSnapshotRelation.createPartitionedFile +import org.apache.hudi.common.fs.FSUtils +import org.apache.hudi.common.model.{BaseFile, FileSlice, HoodieLogFile, HoodieRecord} +import org.apache.hudi.common.util.ValidationUtils.checkState +import org.apache.hudi.{HoodieBaseRelation, HoodieSparkUtils, HoodieTableSchema, HoodieTableState, LogFileIterator, MergeOnReadSnapshotRelation, PartitionFileSliceMapping, RecordMergingFileIterator, SkipMergeIterator, SparkAdapterSupport} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.sql.HoodieCatalystExpressionUtils.generateUnsafeProjection +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.JoinedRow +import org.apache.spark.sql.execution.datasources.PartitionedFile +import org.apache.spark.sql.hudi.HoodieSqlCommonUtils.isMetaField +import org.apache.spark.sql.sources.Filter +import org.apache.spark.sql.types.{StructField, StructType} +import org.apache.spark.sql.vectorized.{ColumnVector, ColumnarBatch} +import org.apache.spark.util.SerializableConfiguration + +import scala.collection.mutable +import scala.jdk.CollectionConverters.asScalaIteratorConverter + +class NewHoodieParquetFileFormat(tableState: Broadcast[HoodieTableState], + tableSchema: Broadcast[HoodieTableSchema], + tableName: String, + mergeType: String, + mandatoryFields: Seq[String], + isMOR: Boolean, + isBootstrap: Boolean) extends ParquetFileFormat with SparkAdapterSupport { + + //Used so that the planner only projects once and does not stack overflow + var isProjected = false + + /** + * Support batch needs to remain consistent, even if one side of a bootstrap merge can support + * while the other side can't + */ + private var supportBatchCalled = false + private var supportBatchResult = false + override def supportBatch(sparkSession: SparkSession, schema: StructType): Boolean = { +if (!supportBatchCalled) { + supportBatchCalled = true + supportBatchResult = !isMOR && super.supportBatch(sparkSession, schema) +} +supportBatchResult + } + + override def buildReaderWithPartitionValues(sparkSession: SparkSession, + dataSchema: 
StructType, + partitionSchema: StructType, + requiredSchema: StructType, + filters: Seq[Filter], + options: Map[String, String], + hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = { + +val outputSchema = StructType(requiredSchema.fields ++ partitionSchema.fields) + +val requiredSchemaWithMandatory = if (!isMOR || MergeOnReadSnapshotRelation.isProjectionCompatible(tableState.value)) { + //add mandatory fields to required schema + val added: mutable.Buffer[StructField] = mutable.Buffer[StructField]() + for (field <- mandatoryFields) { +if (requiredSchema.getFieldIndex(field).isEmpty) { + val fieldToAdd = dataSchema.fields(dataSchema.getFieldIndex(field).get) + added.append(fieldToAdd) +} + } + val addedFields = StructType(added.toArray) + StructType(requiredSchema.toArray ++ addedFields.fields) +} else {
[GitHub] [hudi] jonvex commented on a diff in pull request #9276: [HUDI-6635] Hudi Spark Integration Redesign MOR and Bootstrap reading
jonvex commented on code in PR #9276: URL: https://github.com/apache/hudi/pull/9276#discussion_r1284921590 ## hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/LegacyHoodieParquetFileFormat.scala: ## @@ -23,12 +23,12 @@ import org.apache.hudi.{DataSourceReadOptions, HoodieSparkUtils, SparkAdapterSup import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.PartitionedFile -import org.apache.spark.sql.execution.datasources.parquet.HoodieParquetFileFormat.FILE_FORMAT_ID +import org.apache.spark.sql.execution.datasources.parquet.LegacyHoodieParquetFileFormat.FILE_FORMAT_ID import org.apache.spark.sql.sources.Filter import org.apache.spark.sql.types.{AtomicType, StructType} -class HoodieParquetFileFormat extends ParquetFileFormat with SparkAdapterSupport { +class LegacyHoodieParquetFileFormat extends ParquetFileFormat with SparkAdapterSupport { Review Comment: I think changes to the relation and especially rdd classes are more likely to require changes in the new file format than changes to LegacyHoodieParquetFileFormat. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #9366: [HUDI-6476][FOLLOW-UP] Restore FileStatus instead of Path to avoid additional fs request
hudi-bot commented on PR #9366: URL: https://github.com/apache/hudi/pull/9366#issuecomment-1666275223 ## CI report: * f26271b49440962d10ad12ba619d3caf12a6091d Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19083) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] yihua commented on pull request #9223: [HUDI-6553] Speedup column stats and bloom index creation on large datasets.
yihua commented on PR #9223: URL: https://github.com/apache/hudi/pull/9223#issuecomment-1666269468 Looks like a similar optimization has been merged before: #8856. Should this PR be rebased first before being ready for review again? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] yihua commented on a diff in pull request #9255: [HUDI-6503] Make TableServiceClient's txnManager consistent with Writ…
yihua commented on code in PR #9255: URL: https://github.com/apache/hudi/pull/9255#discussion_r1284902386 ## hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/SparkRDDTableServiceClient.java: ## @@ -67,8 +68,9 @@ public class SparkRDDTableServiceClient extends BaseHoodieTableServiceClient< protected SparkRDDTableServiceClient(HoodieEngineContext context, HoodieWriteConfig clientConfig, - Option timelineService) { -super(context, clientConfig, timelineService); + Option timelineService, + Option txnManager) { Review Comment: Not sure if it's a good idea to expose this in the constructor, as the client can escape the transaction manager and cause correctness issues in concurrency control. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
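To make the concern concrete, a hedged sketch under assumed API shapes (only the four-argument constructor appears in the diff; the getTransactionManager accessor below is hypothetical): with constructor injection, a caller could pass in any transaction manager, so two clients coordinating on the same table are only safe if they share the write client's instance.

    // Hypothetical accessor; illustrates the sharing the PR intends
    TransactionManager shared = writeClient.getTransactionManager();
    SparkRDDTableServiceClient serviceClient =
        new SparkRDDTableServiceClient(context, clientConfig, timelineService, Option.of(shared));
    // Passing Option.empty() (or a fresh manager) would give the table service
    // client a separate lock path, which is the correctness risk raised here.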
[GitHub] [hudi] yihua commented on pull request #9255: [HUDI-6503] Make TableServiceClient's txnManager consistent with Writ…
yihua commented on PR #9255: URL: https://github.com/apache/hudi/pull/9255#issuecomment-1666264355 > > Can you elaborate a little more why the table service client can hold a separate lock ? > > Because `InProcessLockProvider` is valid as long as it is in the same JVM process (see `static final Map LOCK_INSTANCE_PER_BASEPATH = new ConcurrentHashMap<>();`), other locks can not even be in the same JVM. Maybe i'm missing something. > > Of course, it is best to use the same lock manager, just like it used to be (before #6732). And CI seems to be stable now by the way. `LOCK_INSTANCE_PER_BASEPATH` is already static final, and multiple `InProcessLockProvider` instances using the same table base path in the same process should get the same underlying `ReentrantReadWriteLock` instance. Check this test which passes now: `TestInProcessLockProvider.testLockIdentity()`. There was a bug before in `InProcessLockProvider` which is fixed in #8658. Not sure if you hit that. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
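A self-contained illustration (not the actual Hudi source) of why two InProcessLockProvider instances on the same base path contend on one lock, per the static map quoted above:

    import java.util.Map;
    import java.util.concurrent.ConcurrentHashMap;
    import java.util.concurrent.locks.ReentrantReadWriteLock;

    class InProcessLockSketch {
      // static: shared across all provider instances in the JVM process
      static final Map<String, ReentrantReadWriteLock> LOCK_INSTANCE_PER_BASEPATH = new ConcurrentHashMap<>();

      static ReentrantReadWriteLock forBasePath(String basePath) {
        // computeIfAbsent returns the existing lock if one was already registered
        // for this base path, so every provider in the process sees the same instance
        return LOCK_INSTANCE_PER_BASEPATH.computeIfAbsent(basePath, p -> new ReentrantReadWriteLock());
      }
    }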
[GitHub] [hudi] nsivabalan commented on a diff in pull request #9367: [HUDI-6648] Support building table views from existing files
nsivabalan commented on code in PR #9367: URL: https://github.com/apache/hudi/pull/9367#discussion_r1284898855 ## hudi-common/src/main/java/org/apache/hudi/common/util/ExternalFilePathUtil.java: ## @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.util; + +public class ExternalFilePathUtil { + private static final String EXTERNAL_FILE_PREFIX = "hudiext_"; + + public static String prefixExternalFileWithCommitTime(String fileName, String commitTime) { Review Comment: naming: convertExternalFileToHudiFilePath ## hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java: ## @@ -324,33 +325,45 @@ public static HoodieRecord createPartitionListRecord(List * @param partitionThe name of the partition * @param filesAdded Mapping of files to their sizes for files which have been added to this partition * @param filesDeleted List of files which have been deleted from this partition + * @param instantTime Commit time of the commit responsible for adding and/or deleting these files, will be empty during bootstrapping of the metadata table */ public static HoodieRecord createPartitionFilesRecord(String partition, - Option> filesAdded, - Option> filesDeleted) { -Map fileInfo = new HashMap<>(); -filesAdded.ifPresent(filesMap -> -fileInfo.putAll( -filesMap.entrySet().stream().collect( -Collectors.toMap(Map.Entry::getKey, (entry) -> { - long fileSize = entry.getValue(); - // Assert that the file-size of the file being added is positive, since Hudi - // should not be creating empty files - checkState(fileSize > 0); - return new HoodieMetadataFileInfo(fileSize, false); -}))) -); -filesDeleted.ifPresent(filesList -> -fileInfo.putAll( -filesList.stream().collect( -Collectors.toMap(Function.identity(), (ignored) -> new HoodieMetadataFileInfo(0L, true -); + Map filesAdded, + List filesDeleted, + Option instantTime) { +int size = filesAdded.size() + filesDeleted.size(); +Map fileInfo = new HashMap<>(size, 1); +filesAdded.forEach((fileName, fileSize) -> { + // Assert that the file-size of the file being added is positive, since Hudi + // should not be creating empty files + checkState(fileSize > 0); + fileInfo.put(handleFileName(fileName, instantTime), new HoodieMetadataFileInfo(fileSize, false)); +}); + +filesDeleted.forEach(fileName -> fileInfo.put(handleFileName(fileName, instantTime), DELETE_FILE_METADATA)); HoodieKey key = new HoodieKey(partition, MetadataPartitionType.FILES.getPartitionPath()); HoodieMetadataPayload payload = new HoodieMetadataPayload(key.getRecordKey(), METADATA_TYPE_FILE_LIST, fileInfo); return new HoodieAvroRecord<>(key, payload); } + /** + * In the case where a file was created by something other than a Hudi writer, the file name will not contain the commit time. 
We will prefix the file name with hudiext_[commitTime] before storing Review Comment: Let's try to use the same terminology everywhere. If we standardize on "externally created files", let's use the same phrase everywhere to avoid confusion ## hudi-common/src/main/java/org/apache/hudi/common/config/HoodieMetadataConfig.java: ## @@ -312,6 +312,13 @@ public final class HoodieMetadataConfig extends HoodieConfig { .withDocumentation("Maximum size in bytes of a single log file. Larger log files can contain larger log blocks " + "thereby reducing the number of blocks to search for keys"); + public static final ConfigProperty<Boolean> DISABLE_FILE
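For reference, a hedged sketch of the prefix utility excerpted above; the method body is elided in the diff, so the exact layout (separator, ordering) is an assumption rather than something the PR confirms:

    public static String prefixExternalFileWithCommitTime(String fileName, String commitTime) {
      // assumed format, e.g. "hudiext_20230805103000_data.parquet"
      return EXTERNAL_FILE_PREFIX + commitTime + "_" + fileName;
    }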
[GitHub] [hudi] the-other-tim-brown commented on a diff in pull request #9367: [HUDI-6648] Support building table views from existing files
the-other-tim-brown commented on code in PR #9367: URL: https://github.com/apache/hudi/pull/9367#discussion_r1284897744 ## hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java: ## @@ -324,33 +325,45 @@ public static HoodieRecord createPartitionListRecord(List * @param partitionThe name of the partition * @param filesAdded Mapping of files to their sizes for files which have been added to this partition * @param filesDeleted List of files which have been deleted from this partition + * @param instantTime Commit time of the commit responsible for adding and/or deleting these files, will be empty during bootstrapping of the metadata table */ public static HoodieRecord createPartitionFilesRecord(String partition, - Option> filesAdded, - Option> filesDeleted) { -Map fileInfo = new HashMap<>(); -filesAdded.ifPresent(filesMap -> -fileInfo.putAll( -filesMap.entrySet().stream().collect( -Collectors.toMap(Map.Entry::getKey, (entry) -> { - long fileSize = entry.getValue(); - // Assert that the file-size of the file being added is positive, since Hudi - // should not be creating empty files - checkState(fileSize > 0); - return new HoodieMetadataFileInfo(fileSize, false); -}))) -); -filesDeleted.ifPresent(filesList -> -fileInfo.putAll( -filesList.stream().collect( -Collectors.toMap(Function.identity(), (ignored) -> new HoodieMetadataFileInfo(0L, true -); + Map filesAdded, + List filesDeleted, + Option instantTime) { +int size = filesAdded.size() + filesDeleted.size(); +Map fileInfo = new HashMap<>(size, 1); +filesAdded.forEach((fileName, fileSize) -> { + // Assert that the file-size of the file being added is positive, since Hudi + // should not be creating empty files + checkState(fileSize > 0); + fileInfo.put(handleFileName(fileName, instantTime), new HoodieMetadataFileInfo(fileSize, false)); +}); + +filesDeleted.forEach(fileName -> fileInfo.put(handleFileName(fileName, instantTime), DELETE_FILE_METADATA)); HoodieKey key = new HoodieKey(partition, MetadataPartitionType.FILES.getPartitionPath()); HoodieMetadataPayload payload = new HoodieMetadataPayload(key.getRecordKey(), METADATA_TYPE_FILE_LIST, fileInfo); return new HoodieAvroRecord<>(key, payload); } + /** + * In the case where a file was created by something other than a Hudi writer, the file name will not contain the commit time. We will prefix the file name with hudiext_[commitTime] before storing + * in the metadata table. The constructor for {@link org.apache.hudi.common.model.HoodieBaseFile} will properly handle this prefix. + * @param fileName incoming file name + * @param commitTime time of the commit (will be empty during bootstrap operations) + * @return file name with commit time prefix if the input file name does not contain the commit time, otherwise returns the original input + */ + private static String handleFileName(String fileName, Option commitTime) { Review Comment: I tested this out locally and this approach will work. The more I consider it, I think that it could be best to force the caller to pass in the modified path rather than trying to infer whether or not we should modify it here. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] yihua commented on pull request #9345: [HUDI-6649] Fix column stat based data filtering for MOR
yihua commented on PR #9345: URL: https://github.com/apache/hudi/pull/9345#issuecomment-1666254533 @lokeshj1703 Could you summarize the approach in the PR description? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] yihua commented on pull request #9276: [HUDI-6635] Hudi Spark Integration Redesign MOR and Bootstrap reading
yihua commented on PR #9276: URL: https://github.com/apache/hudi/pull/9276#issuecomment-1666254051 @jonvex Could you also update the PR description with details of the approach? Before merging this PR, let's create a new PR based on this patch with `hoodie.datasource.read.use.legacy.parquet.file.format=false` and make sure tests pass on MOR and bootstrap queries. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] nsivabalan commented on a diff in pull request #9367: [HUDI-6648] Support building table views from existing files
nsivabalan commented on code in PR #9367: URL: https://github.com/apache/hudi/pull/9367#discussion_r1284895611 ## hudi-common/src/main/java/org/apache/hudi/common/config/HoodieMetadataConfig.java: ## @@ -312,6 +312,13 @@ public final class HoodieMetadataConfig extends HoodieConfig { .withDocumentation("Maximum size in bytes of a single log file. Larger log files can contain larger log blocks " + "thereby reducing the number of blocks to search for keys"); + public static final ConfigProperty<Boolean> DISABLE_FILESYSTEM_BOOTSTRAP = ConfigProperty + .key(METADATA_PREFIX + ".filesystem.bootstrap.disabled") + .defaultValue(false) + .sinceVersion("0.14.0") + .withDocumentation("Disable bootstrapping metadata table from the file system when the table is first created. " Review Comment: Also, should we prefix it with "_" for now so that we keep it internal and avoid exposing it to general users? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] yihua commented on a diff in pull request #9364: [HUDI-6645] Relax the restriction for Spark MDT rollback
yihua commented on code in PR #9364: URL: https://github.com/apache/hudi/pull/9364#discussion_r1284895154 ## hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java: ## @@ -1022,22 +1020,6 @@ public void update(HoodieRollbackMetadata rollbackMetadata, String instantTime) } } - protected void validateRollback( Review Comment: Why should this be relaxed? Does not look safe for Spark. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] yihua commented on a diff in pull request #9365: [HUDI-6646] Add default lock provider for spark offline compaction an…
yihua commented on code in PR #9365: URL: https://github.com/apache/hudi/pull/9365#discussion_r1284893578 ## hudi-utilities/src/main/java/org/apache/hudi/utilities/UtilHelpers.java: ## @@ -578,6 +580,20 @@ public static HoodieTableMetaClient createMetaClient( .build(); } + public static void addLockOptions(String basePath, TypedProperties props) { +if (!props.containsKey(HoodieLockConfig.LOCK_PROVIDER_CLASS_NAME.key())) { + HoodieLockConfig lockConfig = HoodieLockConfig.newBuilder() + .withLockProvider(FileSystemBasedLockProvider.class) + .withLockWaitTimeInMillis(2000L) // 2s + .withFileSystemLockExpire(1) // 1 minute + .withClientNumRetries(30) Review Comment: Use the default values defined in the `ConfigProperty` in `HoodieLockConfig` instead of hard-coding the values? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
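A minimal sketch of this suggestion; the builder calls are taken from the diff above, but the ConfigProperty constant names (and the defaultValue() accessor on them) are assumptions for illustration, not verified fields of HoodieLockConfig:

    public static void addLockOptions(String basePath, TypedProperties props) {
      if (!props.containsKey(HoodieLockConfig.LOCK_PROVIDER_CLASS_NAME.key())) {
        HoodieLockConfig lockConfig = HoodieLockConfig.newBuilder()
            .withLockProvider(FileSystemBasedLockProvider.class)
            // assumed constants: each ConfigProperty carries its documented default
            .withLockWaitTimeInMillis(HoodieLockConfig.LOCK_ACQUIRE_WAIT_TIMEOUT_MS.defaultValue())
            .withFileSystemLockExpire(HoodieLockConfig.FILESYSTEM_LOCK_EXPIRE.defaultValue())
            .withClientNumRetries(HoodieLockConfig.LOCK_ACQUIRE_CLIENT_NUM_RETRIES.defaultValue())
            .build();
        props.putAll(lockConfig.getProps());
      }
    }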
[GitHub] [hudi] the-other-tim-brown commented on a diff in pull request #9367: [HUDI-6648] Support building table views from existing files
the-other-tim-brown commented on code in PR #9367: URL: https://github.com/apache/hudi/pull/9367#discussion_r1284890274 ## hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java: ## @@ -324,33 +325,45 @@ public static HoodieRecord createPartitionListRecord(List * @param partitionThe name of the partition * @param filesAdded Mapping of files to their sizes for files which have been added to this partition * @param filesDeleted List of files which have been deleted from this partition + * @param instantTime Commit time of the commit responsible for adding and/or deleting these files, will be empty during bootstrapping of the metadata table */ public static HoodieRecord createPartitionFilesRecord(String partition, - Option> filesAdded, - Option> filesDeleted) { -Map fileInfo = new HashMap<>(); -filesAdded.ifPresent(filesMap -> -fileInfo.putAll( -filesMap.entrySet().stream().collect( -Collectors.toMap(Map.Entry::getKey, (entry) -> { - long fileSize = entry.getValue(); - // Assert that the file-size of the file being added is positive, since Hudi - // should not be creating empty files - checkState(fileSize > 0); - return new HoodieMetadataFileInfo(fileSize, false); -}))) -); -filesDeleted.ifPresent(filesList -> -fileInfo.putAll( -filesList.stream().collect( -Collectors.toMap(Function.identity(), (ignored) -> new HoodieMetadataFileInfo(0L, true -); + Map filesAdded, + List filesDeleted, + Option instantTime) { +int size = filesAdded.size() + filesDeleted.size(); +Map fileInfo = new HashMap<>(size, 1); +filesAdded.forEach((fileName, fileSize) -> { + // Assert that the file-size of the file being added is positive, since Hudi + // should not be creating empty files + checkState(fileSize > 0); + fileInfo.put(handleFileName(fileName, instantTime), new HoodieMetadataFileInfo(fileSize, false)); +}); + +filesDeleted.forEach(fileName -> fileInfo.put(handleFileName(fileName, instantTime), DELETE_FILE_METADATA)); HoodieKey key = new HoodieKey(partition, MetadataPartitionType.FILES.getPartitionPath()); HoodieMetadataPayload payload = new HoodieMetadataPayload(key.getRecordKey(), METADATA_TYPE_FILE_LIST, fileInfo); return new HoodieAvroRecord<>(key, payload); } + /** + * In the case where a file was created by something other than a Hudi writer, the file name will not contain the commit time. We will prefix the file name with hudiext_[commitTime] before storing + * in the metadata table. The constructor for {@link org.apache.hudi.common.model.HoodieBaseFile} will properly handle this prefix. + * @param fileName incoming file name + * @param commitTime time of the commit (will be empty during bootstrap operations) + * @return file name with commit time prefix if the input file name does not contain the commit time, otherwise returns the original input + */ + private static String handleFileName(String fileName, Option commitTime) { Review Comment: If we want to avoid this call, we can try to send the prefixed filename through the WriteStatus instead so the fileName is a pass through at this point in the code. @vinothchandar and @nsivabalan, what are your thoughts? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
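A hedged sketch of the pass-through alternative raised here: prefix once at the point where the file is registered, so createPartitionFilesRecord stores names verbatim and handleFileName is no longer needed (the isExternallyCreatedFile flag is an assumed input, not something from the PR):

    String nameForMetadata = isExternallyCreatedFile
        ? ExternalFilePathUtil.prefixExternalFileWithCommitTime(fileName, commitTime)
        : fileName; // Hudi-written files already embed the commit time
    filesAdded.put(nameForMetadata, fileSize); // stored as-is downstream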
[GitHub] [hudi] hudi-bot commented on pull request #9367: [HUDI-6648] Support building table views from existing files
hudi-bot commented on PR #9367: URL: https://github.com/apache/hudi/pull/9367#issuecomment-1666241805 ## CI report: * e1af45ae2e4eb423ffa0bd14376583b881b64d3b Azure: [CANCELED](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19088) * 0e4813a189457ef09caf85dca562698f6ba46a4e Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19092) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] yihua commented on pull request #9276: [HUDI-6635] Hudi Spark Integration Redesign MOR and Bootstrap reading
yihua commented on PR #9276: URL: https://github.com/apache/hudi/pull/9276#issuecomment-1666241001 @jonvex let's track the follow-ups in JIRA. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] yihua commented on a diff in pull request #9276: [HUDI-6635] Hudi Spark Integration Redesign MOR and Bootstrap reading
yihua commented on code in PR #9276: URL: https://github.com/apache/hudi/pull/9276#discussion_r1284886934 ## hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestNewHoodieParquetFileFormat.java: ## @@ -0,0 +1,137 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.functional; + +import org.apache.hudi.DataSourceReadOptions; +import org.apache.hudi.common.model.HoodieTableType; + +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SaveMode; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +import java.util.Arrays; +import java.util.Map; +import java.util.stream.Stream; + +import static org.apache.hudi.common.model.HoodieTableType.COPY_ON_WRITE; +import static org.apache.hudi.common.model.HoodieTableType.MERGE_ON_READ; +import static org.junit.jupiter.api.Assertions.assertEquals; + +@Tag("functional") +public class TestNewHoodieParquetFileFormat extends TestBootstrapReadBase { + + private static Stream testArgs() { +Stream.Builder b = Stream.builder(); +HoodieTableType[] tableType = {COPY_ON_WRITE, MERGE_ON_READ}; +Integer[] nPartitions = {0, 1, 2}; +for (HoodieTableType tt : tableType) { + for (Integer n : nPartitions) { +b.add(Arguments.of(tt, n)); + } +} +return b.build(); + } + + @ParameterizedTest + @MethodSource("testArgs") + public void runTests(HoodieTableType tableType, Integer nPartitions) { +this.bootstrapType = nPartitions == 0 ? 
"metadata" : "mixed"; +this.dashPartitions = true; +this.tableType = tableType; +this.nPartitions = nPartitions; +setupDirs(); + +//do bootstrap +Map options = setBootstrapOptions(); +Dataset bootstrapDf = sparkSession.emptyDataFrame(); +bootstrapDf.write().format("hudi") +.options(options) +.mode(SaveMode.Overwrite) +.save(bootstrapTargetPath); +runComparisons(); + +options = basicOptions(); +doUpdate(options, "001"); +runComparisons(); + +doInsert(options, "002"); +runComparisons(); + +doDelete(options, "003"); +runComparisons(); + } + + protected void runComparisons() { +if (tableType.equals(MERGE_ON_READ)) { + runComparison(hudiBasePath); +} +runComparison(bootstrapTargetPath); + } + + protected void runComparison(String tableBasePath) { +testCount(tableBasePath); +runIndividualComparison(tableBasePath); +runIndividualComparison(tableBasePath, "partition_path"); +runIndividualComparison(tableBasePath, "_hoodie_record_key", "_hoodie_commit_time", "_hoodie_partition_path"); +runIndividualComparison(tableBasePath, "_hoodie_commit_time", "_hoodie_commit_seqno"); +runIndividualComparison(tableBasePath, "_hoodie_commit_time", "_hoodie_commit_seqno", "partition_path"); +runIndividualComparison(tableBasePath, "_row_key", "_hoodie_commit_seqno", "_hoodie_record_key", "_hoodie_partition_path"); +runIndividualComparison(tableBasePath, "_row_key", "partition_path", "_hoodie_is_deleted", "begin_lon"); + } + + protected void testCount(String tableBasePath) { +Dataset legacyDf = sparkSession.read().format("hudi") + .option(DataSourceReadOptions.USE_LEGACY_HUDI_PARQUET_FILE_FORMAT().key(),"true").load(tableBasePath); +Dataset fileFormatDf = sparkSession.read().format("hudi") + .option(DataSourceReadOptions.USE_LEGACY_HUDI_PARQUET_FILE_FORMAT().key(),"false").load(tableBasePath); +assertEquals(legacyDf.count(), fileFormatDf.count()); + } + + protected scala.collection.Seq seq(String... a) { +return scala.collection.JavaConverters.asScalaBufferConverter(Arrays.asList(a)).asScala().toSeq(); + } + + protected void runIndividualComparison(String tableBasePath) { +runIndividualComparison(tableBasePath, ""); + } + + protected void runIndividualComparison(String tableBasePath, String firstColumn, String... columns) { +Dataset legacyDf = sparkSession.read().format("hudi") + .option(DataSourceReadOptions.USE_LEGACY_HU
[GitHub] [hudi] yihua commented on a diff in pull request #9276: [HUDI-6635] Hudi Spark Integration Redesign MOR and Bootstrap reading
yihua commented on code in PR #9276: URL: https://github.com/apache/hudi/pull/9276#discussion_r1284886809 ## hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/NewHoodieParquetFileFormat.scala: ## @@ -0,0 +1,353 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql.execution.datasources.parquet + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.hudi.DataSourceReadOptions.{REALTIME_PAYLOAD_COMBINE_OPT_VAL, REALTIME_SKIP_MERGE_OPT_VAL} +import org.apache.hudi.MergeOnReadSnapshotRelation.createPartitionedFile +import org.apache.hudi.common.fs.FSUtils +import org.apache.hudi.common.model.{BaseFile, FileSlice, HoodieLogFile, HoodieRecord} +import org.apache.hudi.common.util.ValidationUtils.checkState +import org.apache.hudi.{HoodieBaseRelation, HoodieSparkUtils, HoodieTableSchema, HoodieTableState, LogFileIterator, MergeOnReadSnapshotRelation, PartitionFileSliceMapping, RecordMergingFileIterator, SkipMergeIterator, SparkAdapterSupport} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.sql.HoodieCatalystExpressionUtils.generateUnsafeProjection +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.JoinedRow +import org.apache.spark.sql.execution.datasources.PartitionedFile +import org.apache.spark.sql.hudi.HoodieSqlCommonUtils.isMetaField +import org.apache.spark.sql.sources.Filter +import org.apache.spark.sql.types.{StructField, StructType} +import org.apache.spark.sql.vectorized.{ColumnVector, ColumnarBatch} +import org.apache.spark.util.SerializableConfiguration + +import scala.collection.mutable +import scala.jdk.CollectionConverters.asScalaIteratorConverter + +class NewHoodieParquetFileFormat(tableState: Broadcast[HoodieTableState], + tableSchema: Broadcast[HoodieTableSchema], + tableName: String, + mergeType: String, + mandatoryFields: Seq[String], + isMOR: Boolean, + isBootstrap: Boolean) extends ParquetFileFormat with SparkAdapterSupport { + + //Used so that the planner only projects once and does not stack overflow + var isProjected = false + + /** + * Support batch needs to remain consistent, even if one side of a bootstrap merge can support + * while the other side can't + */ + private var supportBatchCalled = false + private var supportBatchResult = false + override def supportBatch(sparkSession: SparkSession, schema: StructType): Boolean = { +if (!supportBatchCalled) { + supportBatchCalled = true + supportBatchResult = !isMOR && super.supportBatch(sparkSession, schema) +} +supportBatchResult + } + + override def buildReaderWithPartitionValues(sparkSession: SparkSession, + dataSchema: 
StructType, + partitionSchema: StructType, + requiredSchema: StructType, + filters: Seq[Filter], + options: Map[String, String], + hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = { + +val outputSchema = StructType(requiredSchema.fields ++ partitionSchema.fields) + +val requiredSchemaWithMandatory = if (!isMOR || MergeOnReadSnapshotRelation.isProjectionCompatible(tableState.value)) { + //add mandatory fields to required schema + val added: mutable.Buffer[StructField] = mutable.Buffer[StructField]() + for (field <- mandatoryFields) { +if (requiredSchema.getFieldIndex(field).isEmpty) { + val fieldToAdd = dataSchema.fields(dataSchema.getFieldIndex(field).get) + added.append(fieldToAdd) +} + } + val addedFields = StructType(added.toArray) + StructType(requiredSchema.toArray ++ addedFields.fields) +} else { +
[GitHub] [hudi] nsivabalan commented on a diff in pull request #9367: [HUDI-6648] Support building table views from existing files
nsivabalan commented on code in PR #9367: URL: https://github.com/apache/hudi/pull/9367#discussion_r1284886054 ## hudi-common/src/main/java/org/apache/hudi/common/config/HoodieMetadataConfig.java: ## @@ -312,6 +312,13 @@ public final class HoodieMetadataConfig extends HoodieConfig { .withDocumentation("Maximum size in bytes of a single log file. Larger log files can contain larger log blocks " + "thereby reducing the number of blocks to search for keys"); + public static final ConfigProperty<Boolean> DISABLE_FILESYSTEM_BOOTSTRAP = ConfigProperty + .key(METADATA_PREFIX + ".filesystem.bootstrap.disabled") + .defaultValue(false) + .sinceVersion("0.14.0") + .withDocumentation("Disable bootstrapping metadata table from the file system when the table is first created. " Review Comment: Should we consider explicitly calling out external metadata sync in the config naming? The current name "hoodie.metadata.filesystem.bootstrap.disabled" is kind of abstract; I am sure an OSS dev may not understand what this config serves or when someone would enable it. Not too strong that we should change it, but just wanted to give it a second thought since the naming will stay forever. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
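For illustration, one rename in the direction suggested; the key and field name below are purely hypothetical and nothing here was agreed in the PR:

    public static final ConfigProperty<Boolean> DISABLE_EXTERNAL_METADATA_SYNC_BOOTSTRAP = ConfigProperty
        .key(METADATA_PREFIX + ".external.sync.bootstrap.disabled")
        .defaultValue(false)
        .sinceVersion("0.14.0")
        .withDocumentation("Disable bootstrapping the metadata table from existing files "
            + "when the table is first created by an external metadata sync.");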
[GitHub] [hudi] yihua commented on a diff in pull request #9276: [HUDI-6635] Hudi Spark Integration Redesign MOR and Bootstrap reading
yihua commented on code in PR #9276: URL: https://github.com/apache/hudi/pull/9276#discussion_r1284873987 ## hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala: ## @@ -87,6 +87,14 @@ object DataSourceReadOptions { s"payload implementation to merge (${REALTIME_PAYLOAD_COMBINE_OPT_VAL}) or skip merging altogether" + s"${REALTIME_SKIP_MERGE_OPT_VAL}") + val USE_LEGACY_HUDI_PARQUET_FILE_FORMAT: ConfigProperty[String] = ConfigProperty +.key("hoodie.datasource.read.use.legacy.parquet.file.format") +.defaultValue("true") +.markAdvanced() +.sinceVersion("0.14.0") +.withDocumentation("Read using the legacy Hudi parquet file format. The new Hudi parquet file format is " + + "introduced as an experimental feature in 0.14.0") Review Comment: Mention that this new file format applies to MOR and Bootstrap queries only, and full schema evaluation is not supported by the new file format (i.e., `hoodie.schema.on.read.enable=true`). ## hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DefaultSource.scala: ## @@ -247,6 +245,9 @@ object DefaultSource { Option(schema) } +val useMORBootstrapFF = parameters.getOrElse(MOR_BOOTSTRAP_FILE_READER.key, + MOR_BOOTSTRAP_FILE_READER.defaultValue).toBoolean && (globPaths == null || globPaths.isEmpty) Review Comment: Got it. Then let's leave this as a follow-up. The new file format should support this too for feature completeness. ## hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/Iterators.scala: ## @@ -83,8 +94,8 @@ class LogFileIterator(logFiles: List[HoodieLogFile], } .getOrElse(new TypedProperties()) - protected override val avroSchema: Schema = new Schema.Parser().parse(requiredSchema.avroSchemaStr) - protected override val structTypeSchema: StructType = requiredSchema.structTypeSchema + protected override val avroSchema: Schema = requiredAvroSchema Review Comment: Makes sense. ## hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/MergeOnReadSnapshotRelation.scala: ## @@ -45,13 +45,27 @@ case class MergeOnReadSnapshotRelation(override val sqlContext: SQLContext, private val globPaths: Seq[Path], private val userSchema: Option[StructType], private val prunedDataSchema: Option[StructType] = None) - extends BaseMergeOnReadSnapshotRelation(sqlContext, optParams, metaClient, globPaths, userSchema, prunedDataSchema) { + extends BaseMergeOnReadSnapshotRelation(sqlContext, optParams, metaClient, globPaths, userSchema, prunedDataSchema) with SparkAdapterSupport { override type Relation = MergeOnReadSnapshotRelation override def updatePrunedDataSchema(prunedSchema: StructType): MergeOnReadSnapshotRelation = this.copy(prunedDataSchema = Some(prunedSchema)) + def toHadoopFsRelation: HadoopFsRelation = { +fileIndex.shouldBroadcast = true +HadoopFsRelation( + location = fileIndex, + partitionSchema = fileIndex.partitionSchema, + dataSchema = fileIndex.dataSchema, + bucketSpec = None, + fileFormat = sparkAdapter.createMORBootstrapFileFormat(shouldExtractPartitionValuesFromPartitionPath, +sparkSession.sparkContext.broadcast(tableState), + sparkSession.sparkContext.broadcast(HoodieTableSchema(tableStructSchema, tableAvroSchema.toString, internalSchemaOpt)), +metaClient.getTableConfig.getTableName, mergeType, mandatoryFields, isMOR = true, isBootstrap = false).get, + optParams)(sparkSession) + } Review Comment: Yeah, sg -- This is an automated message from the Apache Git Service. 
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
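A minimal sketch of toggling the config proposed above on the read path, assuming a Hudi table already exists at `basePath` and an active `SparkSession` named `spark`; the key and default come from the diff, everything else is illustrative:
```
// Sketch: opt out of the legacy format to exercise the new (experimental) one.
// The default is "true", i.e. the legacy Hudi parquet file format.
val df = spark.read.format("hudi")
  .option("hoodie.datasource.read.use.legacy.parquet.file.format", "false")
  .load(basePath)
df.show()
```
Per the review comment above, the new format targets MOR and Bootstrap queries only and does not yet cover full schema evolution, so the legacy default remains the conservative choice.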
[GitHub] [hudi] yihua commented on a diff in pull request #9276: [HUDI-6635] Hudi Spark Integration Redesign MOR and Bootstrap reading
yihua commented on code in PR #9276: URL: https://github.com/apache/hudi/pull/9276#discussion_r1284872593 ## hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieBootstrapConfig.java: ## @@ -67,13 +67,6 @@ public class HoodieBootstrapConfig extends HoodieConfig { .sinceVersion("0.6.0") .withDocumentation("Selects the mode in which each file/partition in the bootstrapped dataset gets bootstrapped"); - public static final ConfigProperty DATA_QUERIES_ONLY = ConfigProperty - .key("hoodie.bootstrap.data.queries.only") - .defaultValue("false") - .markAdvanced() - .sinceVersion("0.14.0") - .withDocumentation("Improves query performance, but queries cannot use hudi metadata fields"); Review Comment: btw, the config doc update should be in a separate PR. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
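For context, a hedged sketch of how the config in the removed block is used on a bootstrapped table; the key and default come from the diff, while the path is hypothetical:
```
// Sketch: data-only bootstrap read. Faster, but the _hoodie_* meta fields
// are unavailable to the query (see the config documentation above).
val dataOnlyDf = spark.read.format("hudi")
  .option("hoodie.bootstrap.data.queries.only", "true")
  .load("/tmp/bootstrap_table") // hypothetical base path
```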
[GitHub] [hudi] hudi-bot commented on pull request #9367: [HUDI-6648] Support building table views from existing files
hudi-bot commented on PR #9367: URL: https://github.com/apache/hudi/pull/9367#issuecomment-1666213879 ## CI report: * e1af45ae2e4eb423ffa0bd14376583b881b64d3b Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19088) * 0e4813a189457ef09caf85dca562698f6ba46a4e UNKNOWN Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #9276: [HUDI-6635] Hudi Spark Integration Redesign MOR and Bootstrap reading
hudi-bot commented on PR #9276: URL: https://github.com/apache/hudi/pull/9276#issuecomment-1666213683 ## CI report: * 662f3b320ab6ea06462bad9a4448add1ec2f380a UNKNOWN * ef8eaadd4f817aa08253b938e19ab3fa61d27b5c Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19046) * def394b73203814bbb635841c5f07c216c0575cc Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19091) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] yihua commented on a diff in pull request #9276: [HUDI-6635] Hudi Spark Integration Redesign MOR and Bootstrap reading
yihua commented on code in PR #9276: URL: https://github.com/apache/hudi/pull/9276#discussion_r1284867759 ## hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieTypes.scala: ## @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi + +import org.apache.hudi.common.config.HoodieMetadataConfig +import org.apache.hudi.internal.schema.InternalSchema +import org.apache.spark.sql.types.StructType + +case class HoodieTableSchema(structTypeSchema: StructType, avroSchemaStr: String, internalSchema: Option[InternalSchema] = None) + +case class HoodieTableState(tablePath: String, Review Comment: Got it. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
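Since `HoodieTableSchema` pairs a Catalyst `StructType` with its Avro counterpart, a small construction sketch may help; both schemas are hypothetical and only illustrate the case class shown in the diff:
```
import org.apache.spark.sql.types._

// Hypothetical schemas for illustration
val structSchema = StructType(Seq(
  StructField("id", StringType, nullable = false),
  StructField("ts", LongType, nullable = false)))

val avroSchemaStr =
  """{"type":"record","name":"Record","fields":
    |[{"name":"id","type":"string"},{"name":"ts","type":"long"}]}""".stripMargin

// internalSchema defaults to None when schema-on-read is not in play
val tableSchema = HoodieTableSchema(structSchema, avroSchemaStr)
```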
[GitHub] [hudi] yihua commented on a diff in pull request #9276: [HUDI-6635] Hudi Spark Integration Redesign MOR and Bootstrap reading
yihua commented on code in PR #9276: URL: https://github.com/apache/hudi/pull/9276#discussion_r1284867523 ## hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieBootstrapConfig.java: ## @@ -67,13 +67,6 @@ public class HoodieBootstrapConfig extends HoodieConfig { .sinceVersion("0.6.0") .withDocumentation("Selects the mode in which each file/partition in the bootstrapped dataset gets bootstrapped"); - public static final ConfigProperty DATA_QUERIES_ONLY = ConfigProperty - .key("hoodie.bootstrap.data.queries.only") - .defaultValue("false") - .markAdvanced() - .sinceVersion("0.14.0") - .withDocumentation("Improves query performance, but queries cannot use hudi metadata fields"); Review Comment: Got it. Could you update the config documentation? As discussed, we'll keep this config since it can still be used when the existing file format and relations are used. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #9276: [HUDI-6635] Hudi Spark Integration Redesign MOR and Bootstrap reading
hudi-bot commented on PR #9276: URL: https://github.com/apache/hudi/pull/9276#issuecomment-1666208153 ## CI report: * 662f3b320ab6ea06462bad9a4448add1ec2f380a UNKNOWN * ef8eaadd4f817aa08253b938e19ab3fa61d27b5c Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19046) * def394b73203814bbb635841c5f07c216c0575cc UNKNOWN Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #9369: [HUDI-6386] Fix flakey multiwriter tests
hudi-bot commented on PR #9369: URL: https://github.com/apache/hudi/pull/9369#issuecomment-1666208352 ## CI report: * 68ba8ae97df89a261298dbb06c80f43c57159969 UNKNOWN * 2d7ced1d3723f9c6357378c3849ebca7e2db4904 Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19090) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] yihua commented on a diff in pull request #9276: [HUDI-6635] Hudi Spark Integration Redesign MOR and Bootstrap reading
yihua commented on code in PR #9276: URL: https://github.com/apache/hudi/pull/9276#discussion_r1284854120 ## hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/LegacyHoodieParquetFileFormat.scala: ## @@ -23,12 +23,12 @@ import org.apache.hudi.{DataSourceReadOptions, HoodieSparkUtils, SparkAdapterSup import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.PartitionedFile -import org.apache.spark.sql.execution.datasources.parquet.HoodieParquetFileFormat.FILE_FORMAT_ID +import org.apache.spark.sql.execution.datasources.parquet.LegacyHoodieParquetFileFormat.FILE_FORMAT_ID import org.apache.spark.sql.sources.Filter import org.apache.spark.sql.types.{AtomicType, StructType} -class HoodieParquetFileFormat extends ParquetFileFormat with SparkAdapterSupport { +class LegacyHoodieParquetFileFormat extends ParquetFileFormat with SparkAdapterSupport { Review Comment: Add docs here to link to the new file format implementation, so that any changes to this format implementation are also reflected in the new file format class? ## hudi-spark-datasource/hudi-spark2/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/Spark24LegacyHoodieParquetFileFormat.scala: ## @@ -50,7 +50,7 @@ import java.net.URI * Avoiding appending partition values to the rows read from the data file * */ -class Spark24HoodieParquetFileFormat(private val shouldAppendPartitionValues: Boolean) extends ParquetFileFormat { +class Spark24LegacyHoodieParquetFileFormat(private val shouldAppendPartitionValues: Boolean) extends ParquetFileFormat { Review Comment: Same here for the version-specific file format classes: add docs here to link to the new file format implementation, so that any changes to this format implementation are also reflected in the new file format class? ## hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/hudi/SparkAdapter.scala: ## @@ -165,7 +165,11 @@ trait SparkAdapter extends Serializable { /** * Create instance of [[ParquetFileFormat]] */ - def createHoodieParquetFileFormat(appendPartitionValues: Boolean): Option[ParquetFileFormat] + def createLegacyHoodieParquetFileFormat(appendPartitionValues: Boolean): Option[ParquetFileFormat] + + def getFilePath(file: PartitionedFile): Path Review Comment: There is an existing API with the same functionality: `HoodieSparkPartitionedFileUtils.getPathFromPartitionedFile` ## hudi-spark-datasource/hudi-spark-common/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister: ## @@ -17,4 +17,4 @@ org.apache.hudi.DefaultSource -org.apache.spark.sql.execution.datasources.parquet.HoodieParquetFileFormat \ No newline at end of file +org.apache.spark.sql.execution.datasources.parquet.LegacyHoodieParquetFileFormat Review Comment: Just curious; I don't have a clear answer. Since `createRelation` is overridden, it's OK functionality-wise. ## hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/NewHoodieParquetFileFormatUtils.scala: ## @@ -0,0 +1,215 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi + +import org.apache.avro.Schema +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.hadoop.mapred.JobConf +import org.apache.hudi.HoodieBaseRelation._ +import org.apache.hudi.HoodieConversionUtils.toScalaOption +import org.apache.hudi.common.config.ConfigProperty +import org.apache.hudi.common.fs.FSUtils +import org.apache.hudi.common.model.HoodieRecord +import org.apache.hudi.common.model.HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE_PREFIX +import org.apache.hudi.common.table.timeline.HoodieTimeline +import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient, TableSchemaResolver} +import org.apache.hudi.common.util.ValidationUtils.checkState +import org.apache.hudi.common.util.{ConfigUtils, StringUtils} +import org.apache.hudi.config.HoodieBootstrapConfig.DATA_QUERIES_ONLY +import org.apache.
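A sketch of the cross-link the first two comments above ask for; the wording is illustrative, not the committed Scaladoc:
```
/**
 * NOTE: this is the legacy Hudi parquet file format. Any change made here
 * should also be reflected in the new file format implementation.
 *
 * @see org.apache.spark.sql.execution.datasources.parquet.NewHoodieParquetFileFormat
 */
class LegacyHoodieParquetFileFormat extends ParquetFileFormat with SparkAdapterSupport {
  // ... existing implementation unchanged ...
}
```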
[GitHub] [hudi] jonvex commented on a diff in pull request #9276: [HUDI-6635] Hudi Spark Integration Redesign MOR and Bootstrap reading
jonvex commented on code in PR #9276: URL: https://github.com/apache/hudi/pull/9276#discussion_r1284863793 ## hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/NewHoodieParquetFileFormat.scala: ## @@ -0,0 +1,353 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql.execution.datasources.parquet + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.hudi.DataSourceReadOptions.{REALTIME_PAYLOAD_COMBINE_OPT_VAL, REALTIME_SKIP_MERGE_OPT_VAL} +import org.apache.hudi.MergeOnReadSnapshotRelation.createPartitionedFile +import org.apache.hudi.common.fs.FSUtils +import org.apache.hudi.common.model.{BaseFile, FileSlice, HoodieLogFile, HoodieRecord} +import org.apache.hudi.common.util.ValidationUtils.checkState +import org.apache.hudi.{HoodieBaseRelation, HoodieSparkUtils, HoodieTableSchema, HoodieTableState, LogFileIterator, MergeOnReadSnapshotRelation, PartitionFileSliceMapping, RecordMergingFileIterator, SkipMergeIterator, SparkAdapterSupport} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.sql.HoodieCatalystExpressionUtils.generateUnsafeProjection +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.JoinedRow +import org.apache.spark.sql.execution.datasources.PartitionedFile +import org.apache.spark.sql.hudi.HoodieSqlCommonUtils.isMetaField +import org.apache.spark.sql.sources.Filter +import org.apache.spark.sql.types.{StructField, StructType} +import org.apache.spark.sql.vectorized.{ColumnVector, ColumnarBatch} +import org.apache.spark.util.SerializableConfiguration + +import scala.collection.mutable +import scala.jdk.CollectionConverters.asScalaIteratorConverter + +class NewHoodieParquetFileFormat(tableState: Broadcast[HoodieTableState], + tableSchema: Broadcast[HoodieTableSchema], + tableName: String, + mergeType: String, + mandatoryFields: Seq[String], + isMOR: Boolean, + isBootstrap: Boolean) extends ParquetFileFormat with SparkAdapterSupport { + + //Used so that the planner only projects once and does not stack overflow + var isProjected = false + + /** + * Support batch needs to remain consistent, even if one side of a bootstrap merge can support + * while the other side can't + */ + private var supportBatchCalled = false + private var supportBatchResult = false + override def supportBatch(sparkSession: SparkSession, schema: StructType): Boolean = { +if (!supportBatchCalled) { + supportBatchCalled = true + supportBatchResult = !isMOR && super.supportBatch(sparkSession, schema) +} +supportBatchResult + } + + override def buildReaderWithPartitionValues(sparkSession: SparkSession, + dataSchema: 
StructType, + partitionSchema: StructType, + requiredSchema: StructType, + filters: Seq[Filter], + options: Map[String, String], + hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = { + +val outputSchema = StructType(requiredSchema.fields ++ partitionSchema.fields) + +val requiredSchemaWithMandatory = if (!isMOR || MergeOnReadSnapshotRelation.isProjectionCompatible(tableState.value)) { + //add mandatory fields to required schema + val added: mutable.Buffer[StructField] = mutable.Buffer[StructField]() + for (field <- mandatoryFields) { +if (requiredSchema.getFieldIndex(field).isEmpty) { + val fieldToAdd = dataSchema.fields(dataSchema.getFieldIndex(field).get) + added.append(fieldToAdd) +} + } + val addedFields = StructType(added.toArray) + StructType(requiredSchema.toArray ++ addedFields.fields) +} else {
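To make the projection handling above easier to follow, here is an equivalent standalone sketch of the mandatory-field logic, simplified from the (truncated) diff rather than copied from the committed code:
```
import org.apache.spark.sql.types.{StructField, StructType}

// Append any mandatory field the query did not project, pulling its
// definition from the full data schema.
def withMandatoryFields(requiredSchema: StructType,
                        dataSchema: StructType,
                        mandatoryFields: Seq[String]): StructType = {
  val missing: Seq[StructField] = mandatoryFields
    .filter(f => requiredSchema.getFieldIndex(f).isEmpty)
    .map(f => dataSchema.fields(dataSchema.getFieldIndex(f).get))
  StructType(requiredSchema.fields ++ missing)
}
```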
[GitHub] [hudi] hudi-bot commented on pull request #9369: [HUDI-6386] Fix flakey multiwriter tests
hudi-bot commented on PR #9369: URL: https://github.com/apache/hudi/pull/9369#issuecomment-1666202386 ## CI report: * 68ba8ae97df89a261298dbb06c80f43c57159969 UNKNOWN * 2d7ced1d3723f9c6357378c3849ebca7e2db4904 UNKNOWN Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] jonvex commented on a diff in pull request #9276: [HUDI-6635] Hudi Spark Integration Redesign MOR and Bootstrap reading
jonvex commented on code in PR #9276: URL: https://github.com/apache/hudi/pull/9276#discussion_r1284853617 ## hudi-spark-datasource/hudi-spark2/src/main/scala/org/apache/spark/sql/adapter/Spark2Adapter.scala: ## @@ -143,8 +146,8 @@ class Spark2Adapter extends SparkAdapter { partitions.toSeq } - override def createHoodieParquetFileFormat(appendPartitionValues: Boolean): Option[ParquetFileFormat] = { -Some(new Spark24HoodieParquetFileFormat(appendPartitionValues)) + override def createLegacyHoodieParquetFileFormat(appendPartitionValues: Boolean): Option[ParquetFileFormat] = { +Some(new Spark24LegacyHoodieParquetFileFormat(appendPartitionValues)) Review Comment: Yeah. We're going to have to figure out if there is a way to do schema evolution without porting code. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] jonvex commented on a diff in pull request #9276: [HUDI-6635] Hudi Spark Integration Redesign MOR and Bootstrap reading
jonvex commented on code in PR #9276: URL: https://github.com/apache/hudi/pull/9276#discussion_r1284852746 ## hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestNewHoodieParquetFileFormat.java: ## @@ -0,0 +1,137 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.functional; + +import org.apache.hudi.DataSourceReadOptions; +import org.apache.hudi.common.model.HoodieTableType; + +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SaveMode; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +import java.util.Arrays; +import java.util.Map; +import java.util.stream.Stream; + +import static org.apache.hudi.common.model.HoodieTableType.COPY_ON_WRITE; +import static org.apache.hudi.common.model.HoodieTableType.MERGE_ON_READ; +import static org.junit.jupiter.api.Assertions.assertEquals; + +@Tag("functional") +public class TestNewHoodieParquetFileFormat extends TestBootstrapReadBase { + + private static Stream testArgs() { +Stream.Builder b = Stream.builder(); +HoodieTableType[] tableType = {COPY_ON_WRITE, MERGE_ON_READ}; +Integer[] nPartitions = {0, 1, 2}; +for (HoodieTableType tt : tableType) { + for (Integer n : nPartitions) { +b.add(Arguments.of(tt, n)); + } +} +return b.build(); + } + + @ParameterizedTest + @MethodSource("testArgs") + public void runTests(HoodieTableType tableType, Integer nPartitions) { +this.bootstrapType = nPartitions == 0 ? 
"metadata" : "mixed"; +this.dashPartitions = true; +this.tableType = tableType; +this.nPartitions = nPartitions; +setupDirs(); + +//do bootstrap +Map options = setBootstrapOptions(); +Dataset bootstrapDf = sparkSession.emptyDataFrame(); +bootstrapDf.write().format("hudi") +.options(options) +.mode(SaveMode.Overwrite) +.save(bootstrapTargetPath); +runComparisons(); + +options = basicOptions(); +doUpdate(options, "001"); +runComparisons(); + +doInsert(options, "002"); +runComparisons(); + +doDelete(options, "003"); +runComparisons(); + } + + protected void runComparisons() { +if (tableType.equals(MERGE_ON_READ)) { + runComparison(hudiBasePath); +} +runComparison(bootstrapTargetPath); + } + + protected void runComparison(String tableBasePath) { +testCount(tableBasePath); +runIndividualComparison(tableBasePath); +runIndividualComparison(tableBasePath, "partition_path"); +runIndividualComparison(tableBasePath, "_hoodie_record_key", "_hoodie_commit_time", "_hoodie_partition_path"); +runIndividualComparison(tableBasePath, "_hoodie_commit_time", "_hoodie_commit_seqno"); +runIndividualComparison(tableBasePath, "_hoodie_commit_time", "_hoodie_commit_seqno", "partition_path"); +runIndividualComparison(tableBasePath, "_row_key", "_hoodie_commit_seqno", "_hoodie_record_key", "_hoodie_partition_path"); +runIndividualComparison(tableBasePath, "_row_key", "partition_path", "_hoodie_is_deleted", "begin_lon"); + } + + protected void testCount(String tableBasePath) { +Dataset legacyDf = sparkSession.read().format("hudi") + .option(DataSourceReadOptions.USE_LEGACY_HUDI_PARQUET_FILE_FORMAT().key(),"true").load(tableBasePath); +Dataset fileFormatDf = sparkSession.read().format("hudi") + .option(DataSourceReadOptions.USE_LEGACY_HUDI_PARQUET_FILE_FORMAT().key(),"false").load(tableBasePath); +assertEquals(legacyDf.count(), fileFormatDf.count()); + } + + protected scala.collection.Seq seq(String... a) { +return scala.collection.JavaConverters.asScalaBufferConverter(Arrays.asList(a)).asScala().toSeq(); + } + + protected void runIndividualComparison(String tableBasePath) { +runIndividualComparison(tableBasePath, ""); + } + + protected void runIndividualComparison(String tableBasePath, String firstColumn, String... columns) { +Dataset legacyDf = sparkSession.read().format("hudi") + .option(DataSourceReadOptions.USE_LEGACY_H
[GitHub] [hudi] yihua commented on a diff in pull request #9276: [HUDI-6635] Hudi Spark Integration Redesign MOR and Bootstrap reading
yihua commented on code in PR #9276: URL: https://github.com/apache/hudi/pull/9276#discussion_r1284822235 ## hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/NewHoodieParquetFileFormat.scala: ## @@ -0,0 +1,353 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql.execution.datasources.parquet + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.hudi.DataSourceReadOptions.{REALTIME_PAYLOAD_COMBINE_OPT_VAL, REALTIME_SKIP_MERGE_OPT_VAL} +import org.apache.hudi.MergeOnReadSnapshotRelation.createPartitionedFile +import org.apache.hudi.common.fs.FSUtils +import org.apache.hudi.common.model.{BaseFile, FileSlice, HoodieLogFile, HoodieRecord} +import org.apache.hudi.common.util.ValidationUtils.checkState +import org.apache.hudi.{HoodieBaseRelation, HoodieSparkUtils, HoodieTableSchema, HoodieTableState, LogFileIterator, MergeOnReadSnapshotRelation, PartitionFileSliceMapping, RecordMergingFileIterator, SkipMergeIterator, SparkAdapterSupport} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.sql.HoodieCatalystExpressionUtils.generateUnsafeProjection +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.JoinedRow +import org.apache.spark.sql.execution.datasources.PartitionedFile +import org.apache.spark.sql.hudi.HoodieSqlCommonUtils.isMetaField +import org.apache.spark.sql.sources.Filter +import org.apache.spark.sql.types.{StructField, StructType} +import org.apache.spark.sql.vectorized.{ColumnVector, ColumnarBatch} +import org.apache.spark.util.SerializableConfiguration + +import scala.collection.mutable +import scala.jdk.CollectionConverters.asScalaIteratorConverter + +class NewHoodieParquetFileFormat(tableState: Broadcast[HoodieTableState], + tableSchema: Broadcast[HoodieTableSchema], + tableName: String, + mergeType: String, + mandatoryFields: Seq[String], + isMOR: Boolean, + isBootstrap: Boolean) extends ParquetFileFormat with SparkAdapterSupport { + + //Used so that the planner only projects once and does not stack overflow + var isProjected = false + + /** + * Support batch needs to remain consistent, even if one side of a bootstrap merge can support + * while the other side can't + */ + private var supportBatchCalled = false + private var supportBatchResult = false + override def supportBatch(sparkSession: SparkSession, schema: StructType): Boolean = { +if (!supportBatchCalled) { + supportBatchCalled = true + supportBatchResult = !isMOR && super.supportBatch(sparkSession, schema) +} +supportBatchResult + } + + override def buildReaderWithPartitionValues(sparkSession: SparkSession, + dataSchema: 
StructType, + partitionSchema: StructType, + requiredSchema: StructType, + filters: Seq[Filter], + options: Map[String, String], + hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = { + +val outputSchema = StructType(requiredSchema.fields ++ partitionSchema.fields) + +val requiredSchemaWithMandatory = if (!isMOR || MergeOnReadSnapshotRelation.isProjectionCompatible(tableState.value)) { + //add mandatory fields to required schema + val added: mutable.Buffer[StructField] = mutable.Buffer[StructField]() + for (field <- mandatoryFields) { +if (requiredSchema.getFieldIndex(field).isEmpty) { + val fieldToAdd = dataSchema.fields(dataSchema.getFieldIndex(field).get) + added.append(fieldToAdd) +} + } + val addedFields = StructType(added.toArray) + StructType(requiredSchema.toArray ++ addedFields.fields) +} else { +
[GitHub] [hudi] jonvex commented on a diff in pull request #9276: [HUDI-6635] Hudi Spark Integration Redesign MOR and Bootstrap reading
jonvex commented on code in PR #9276: URL: https://github.com/apache/hudi/pull/9276#discussion_r1284845207 ## hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/NewHoodieParquetFileFormatUtils.scala: ## @@ -0,0 +1,215 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi + +import org.apache.avro.Schema +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.hadoop.mapred.JobConf +import org.apache.hudi.HoodieBaseRelation._ +import org.apache.hudi.HoodieConversionUtils.toScalaOption +import org.apache.hudi.common.config.ConfigProperty +import org.apache.hudi.common.fs.FSUtils +import org.apache.hudi.common.model.HoodieRecord +import org.apache.hudi.common.model.HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE_PREFIX +import org.apache.hudi.common.table.timeline.HoodieTimeline +import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient, TableSchemaResolver} +import org.apache.hudi.common.util.ValidationUtils.checkState +import org.apache.hudi.common.util.{ConfigUtils, StringUtils} +import org.apache.hudi.config.HoodieBootstrapConfig.DATA_QUERIES_ONLY +import org.apache.hudi.config.HoodieWriteConfig +import org.apache.hudi.internal.schema.convert.AvroInternalSchemaConverter +import org.apache.hudi.internal.schema.{HoodieSchemaException, InternalSchema} +import org.apache.spark.sql.catalyst.analysis.Resolver +import org.apache.spark.sql.execution.datasources.parquet.NewHoodieParquetFileFormat +import org.apache.spark.sql.execution.datasources.{FileStatusCache, HadoopFsRelation} +import org.apache.spark.sql.hudi.HoodieSqlCommonUtils +import org.apache.spark.sql.sources.BaseRelation +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.{SQLContext, SparkSession} + +import scala.collection.JavaConverters._ +import scala.util.{Failure, Success, Try} + +class NewHoodieParquetFileFormatUtils(val sqlContext: SQLContext, + val metaClient: HoodieTableMetaClient, + val optParamsInput: Map[String, String], + private val schemaSpec: Option[StructType]) extends SparkAdapterSupport { + protected val sparkSession: SparkSession = sqlContext.sparkSession + + protected val optParams: Map[String, String] = optParamsInput.filter(kv => !kv._1.equals(DATA_QUERIES_ONLY.key())) + protected def tableName: String = metaClient.getTableConfig.getTableName + + protected lazy val resolver: Resolver = sparkSession.sessionState.analyzer.resolver + + private lazy val metaFieldNames = HoodieRecord.HOODIE_META_COLUMNS.asScala.toSet + + protected lazy val fileIndex: HoodieFileIndex = +HoodieFileIndex(sparkSession, metaClient, Some(tableStructSchema), optParams, FileStatusCache.getOrCreate(sparkSession)) + + protected lazy val conf: Configuration = new 
Configuration(sqlContext.sparkContext.hadoopConfiguration) + protected lazy val jobConf = new JobConf(conf) + + protected lazy val tableConfig: HoodieTableConfig = metaClient.getTableConfig + + protected lazy val basePath: Path = metaClient.getBasePathV2 + + protected lazy val (tableAvroSchema: Schema, internalSchemaOpt: Option[InternalSchema]) = { Review Comment: Every single method here is from the base relation. You said not to use the relation, so I just copied over what I needed. It was much simpler before. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
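For context on what those copied members resolve, a hedged sketch of reading a table's Avro schema directly from the meta client; the builder calls mirror the imports in the diff, and the base path is hypothetical:
```
import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver}

// Assumes an existing Hudi table; error handling omitted.
val metaClient = HoodieTableMetaClient.builder()
  .setConf(spark.sparkContext.hadoopConfiguration)
  .setBasePath("/tmp/hudi_table") // hypothetical path
  .build()
val tableAvroSchema = new TableSchemaResolver(metaClient).getTableAvroSchema
```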
[GitHub] [hudi] jonvex commented on a diff in pull request #9276: [HUDI-6635] Hudi Spark Integration Redesign MOR and Bootstrap reading
jonvex commented on code in PR #9276: URL: https://github.com/apache/hudi/pull/9276#discussion_r1284844263 ## hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/Iterators.scala: ## @@ -83,8 +94,8 @@ class LogFileIterator(logFiles: List[HoodieLogFile], } .getOrElse(new TypedProperties()) - protected override val avroSchema: Schema = new Schema.Parser().parse(requiredSchema.avroSchemaStr) - protected override val structTypeSchema: StructType = requiredSchema.structTypeSchema + protected override val avroSchema: Schema = requiredAvroSchema Review Comment: There isn't any functional difference in this code. I just added an alternate constructor so that I didn't have to create another table schema just to deconstruct it:
```
def this(logFiles: List[HoodieLogFile],
         partitionPath: Path,
         tableSchema: HoodieTableSchema,
         requiredSchema: HoodieTableSchema,
         tableState: HoodieTableState,
         config: Configuration) {
  this(logFiles, partitionPath, tableSchema, requiredSchema.structTypeSchema,
    new Schema.Parser().parse(requiredSchema.avroSchemaStr), tableState, config)
}
```
-- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #9369: [HUDI-6386] Fix flakey multiwriter tests
hudi-bot commented on PR #9369: URL: https://github.com/apache/hudi/pull/9369#issuecomment-1666166562 ## CI report: * 68ba8ae97df89a261298dbb06c80f43c57159969 UNKNOWN Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #9345: [HUDI-6649] Fix column stat based data filtering for MOR
hudi-bot commented on PR #9345: URL: https://github.com/apache/hudi/pull/9345#issuecomment-1666166448 ## CI report: * 600d19c601ec8b224643b7e4c0a7c0f7a2e1e290 Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19089) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #9345: [HUDI-6649] Fix column stat based data filtering for MOR
hudi-bot commented on PR #9345: URL: https://github.com/apache/hudi/pull/9345#issuecomment-1666159747 ## CI report: * 600d19c601ec8b224643b7e4c0a7c0f7a2e1e290 UNKNOWN Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] jonvex opened a new pull request, #9369: [HUDI-6386] Fix flakey multiwriter tests
jonvex opened a new pull request, #9369: URL: https://github.com/apache/hudi/pull/9369 ### Change Logs getHoodieWriteClient was not thread safe, so I overrode the method so that it is, and more than one client can be used. ### Impact Tests are no longer flaky. ### Risk level (write none, low medium or high below) none ### Documentation Update _Describe any necessary documentation update if there is any new feature, config, or user-facing change_ N/A ### Contributor's checklist - [ ] Read through [contributor's guide](https://hudi.apache.org/contribute/how-to-contribute) - [ ] Change Logs and Impact were stated clearly - [ ] Adequate tests were added if applicable - [ ] CI passed -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
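A minimal sketch of the pattern described in the change log; the type parameter and factory stand in for the real write client and are illustrative only:
```
// Guard client creation with a lock so concurrent test threads each get a
// fully constructed client instead of racing on shared state.
class WriteClientFactory[C](create: () => C) {
  private var created: List[C] = Nil

  def getHoodieWriteClient(): C = this.synchronized {
    val client = create()
    created = client :: created // remembered so every client can be closed in teardown
    client
  }
}
```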
[jira] [Updated] (HUDI-6649) Fix column stat based data filtering for MOR
[ https://issues.apache.org/jira/browse/HUDI-6649?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] ASF GitHub Bot updated HUDI-6649: - Labels: pull-request-available (was: ) > Fix column stat based data filtering for MOR > > > Key: HUDI-6649 > URL: https://issues.apache.org/jira/browse/HUDI-6649 > Project: Apache Hudi > Issue Type: Bug > Components: index, writer-core >Reporter: Lokesh Jain >Assignee: Lokesh Jain >Priority: Major > Labels: pull-request-available > > Currently MOR snapshot relation does not use the column stats index for > pruning the files in its queries. The Jira aims to add support for pruning > the file slices based on column stats in case of MOR. > -- This message was sent by Atlassian Jira (v8.20.10#820010)
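For anyone verifying the fix, a hedged sketch of a read that exercises column-stats-based skipping on a MOR table; the config keys are the standard Hudi ones, while the path and predicate are hypothetical:
```
// Data skipping relies on the metadata table's column stats index.
val pruned = spark.read.format("hudi")
  .option("hoodie.metadata.enabled", "true")
  .option("hoodie.enable.data.skipping", "true")
  .load("/tmp/mor_table")    // hypothetical MOR table
  .filter("price > 100")     // predicate eligible for column-stats pruning
```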
[GitHub] [hudi] lokeshj1703 commented on pull request #9345: [HUDI-6649] Fix column stat based data filtering for MOR
lokeshj1703 commented on PR #9345: URL: https://github.com/apache/hudi/pull/9345#issuecomment-1666135202 I couldn't find a way to set up async compaction in the test. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[jira] [Updated] (HUDI-6649) Fix column stat based data filtering for MOR
[ https://issues.apache.org/jira/browse/HUDI-6649?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Lokesh Jain updated HUDI-6649: -- Description: Currently MOR snapshot relation does not use the column stats index for pruning the files in its queries. The Jira aims to add support for pruning the file slices based on column stats in case of MOR. was:Currently MOR snapshot and incremental relation does not use the column stats index for pruning the files in its queries. The Jira aims to add support for pruning the file slices based on column stats in case of MOR. > Fix column stat based data filtering for MOR > > > Key: HUDI-6649 > URL: https://issues.apache.org/jira/browse/HUDI-6649 > Project: Apache Hudi > Issue Type: Bug > Components: index, writer-core >Reporter: Lokesh Jain >Assignee: Lokesh Jain >Priority: Major > > Currently MOR snapshot relation does not use the column stats index for > pruning the files in its queries. The Jira aims to add support for pruning > the file slices based on column stats in case of MOR. > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Created] (HUDI-6649) Fix column stat based data filtering for MOR
Lokesh Jain created HUDI-6649: - Summary: Fix column stat based data filtering for MOR Key: HUDI-6649 URL: https://issues.apache.org/jira/browse/HUDI-6649 Project: Apache Hudi Issue Type: Bug Components: index, writer-core Reporter: Lokesh Jain Assignee: Lokesh Jain Currently MOR snapshot and incremental relation does not use the column stats index for pruning the files in its queries. The Jira aims to add support for pruning the file slices based on column stats. -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Updated] (HUDI-6649) Fix column stat based data filtering for MOR
[ https://issues.apache.org/jira/browse/HUDI-6649?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Lokesh Jain updated HUDI-6649: -- Description: Currently MOR snapshot and incremental relation does not use the column stats index for pruning the files in its queries. The Jira aims to add support for pruning the file slices based on column stats in case of MOR. (was: Currently MOR snapshot and incremental relation does not use the column stats index for pruning the files in its queries. The Jira aims to add support for pruning the file slices based on column stats.) > Fix column stat based data filtering for MOR > > > Key: HUDI-6649 > URL: https://issues.apache.org/jira/browse/HUDI-6649 > Project: Apache Hudi > Issue Type: Bug > Components: index, writer-core >Reporter: Lokesh Jain >Assignee: Lokesh Jain >Priority: Major > > Currently MOR snapshot and incremental relation does not use the column stats > index for pruning the files in its queries. The Jira aims to add support for > pruning the file slices based on column stats in case of MOR. -- This message was sent by Atlassian Jira (v8.20.10#820010)
[GitHub] [hudi] yihua commented on a diff in pull request #9276: [HUDI-6635] Hudi Spark Integration Redesign MOR and Bootstrap reading
yihua commented on code in PR #9276: URL: https://github.com/apache/hudi/pull/9276#discussion_r1284800578 ## hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/Iterators.scala: ## @@ -83,8 +94,8 @@ class LogFileIterator(logFiles: List[HoodieLogFile], } .getOrElse(new TypedProperties()) - protected override val avroSchema: Schema = new Schema.Parser().parse(requiredSchema.avroSchemaStr) - protected override val structTypeSchema: StructType = requiredSchema.structTypeSchema + protected override val avroSchema: Schema = requiredAvroSchema Review Comment: Is this for performance improvement? Should `requiredSchema: HoodieTableSchema` still be kept in the constructor for supporting schema evolution in the future? ## hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/Iterators.scala: ## @@ -181,17 +194,30 @@ class RecordMergingFileIterator(logFiles: List[HoodieLogFile], baseFileIterator: Iterator[InternalRow], readerSchema: StructType, dataSchema: HoodieTableSchema, -requiredSchema: HoodieTableSchema, +requiredStructTypeSchema: StructType, +requiredAvroSchema: Schema, tableState: HoodieTableState, config: Configuration) - extends LogFileIterator(logFiles, partitionPath, dataSchema, requiredSchema, tableState, config) { + extends LogFileIterator(logFiles, partitionPath, dataSchema, requiredStructTypeSchema, requiredAvroSchema, tableState, config) { + def this(logFiles: List[HoodieLogFile], + partitionPath: Path, + baseFileIterator: Iterator[InternalRow], + readerSchema: StructType, + dataSchema: HoodieTableSchema, + requiredSchema: HoodieTableSchema, + tableState: HoodieTableState, + config: Configuration) { +this(logFiles, partitionPath, baseFileIterator, readerSchema, dataSchema, requiredSchema.structTypeSchema, + new Schema.Parser().parse(requiredSchema.avroSchemaStr), tableState, config) + } def this(split: HoodieMergeOnReadFileSplit, baseFileReader: BaseFileReader, dataSchema: HoodieTableSchema, requiredSchema: HoodieTableSchema, tableState: HoodieTableState, config: Configuration) { this(split.logFiles, getPartitionPath(split), baseFileReader(split.dataFile.get), baseFileReader.schema, dataSchema, requiredSchema, tableState, config) } + Review Comment: nit: remove empty line ## hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/SparkHoodieTableFileIndex.scala: ## @@ -99,7 +96,7 @@ class SparkHoodieTableFileIndex(spark: SparkSession, AvroConversionUtils.convertAvroSchemaToStructType(schemaUtil.getTableAvroSchema) }) - protected lazy val shouldFastBootstrap = configProperties.getBoolean(DATA_QUERIES_ONLY.key, false) + protected lazy val shouldFastBootstrap: Boolean = configProperties.getBoolean(DATA_QUERIES_ONLY.key, false) Review Comment: nit: let's avoid cosmetic changes in this class. ## hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/NewHoodieParquetFileFormat.scala: ## @@ -0,0 +1,353 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql.execution.datasources.parquet + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.hudi.DataSourceReadOptions.{REALTIME_PAYLOAD_COMBINE_OPT_VAL, REALTIME_SKIP_MERGE_OPT_VAL} +import org.apache.hudi.MergeOnReadSnapshotRelation.createPartitionedFile +import org.apache.hudi.common.fs.FSUtils +import org.apache.hudi.common.model.{BaseFile, FileSlice, HoodieLogFile, HoodieRecord} +import org.apache.hudi.common.util.ValidationUtils.checkState +import org.apache.hudi.{HoodieBaseRelation, HoodieSparkUtils, HoodieTableSchema, HoodieTableState, LogFileIterator, MergeOnReadSnapshotRelation, PartitionFileSliceMapping, Re