[GitHub] [hudi] hudi-bot commented on pull request #9365: [HUDI-6646] Add default lock provider for spark offline compaction an…
hudi-bot commented on PR #9365: URL: https://github.com/apache/hudi/pull/9365#issuecomment-1666415439

## CI report:

* 5599728990aa2b2d7e6dae8ef5c54196e94c773e Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19096)
[GitHub] [hudi] hudi-bot commented on pull request #9371: [HUDI-6647] Expand Hudi Java Client Functionality
hudi-bot commented on PR #9371: URL: https://github.com/apache/hudi/pull/9371#issuecomment-1666415453

## CI report:

* 0e39684d85c18aef48131ba838ffc63e48b5fcf2 UNKNOWN
[GitHub] [hudi] hudi-bot commented on pull request #9261: [HUDI-6579] Adding support for upsert and deletes with spark datasource for pk less table
hudi-bot commented on PR #9261: URL: https://github.com/apache/hudi/pull/9261#issuecomment-1666415390

## CI report:

* 3968bf360001624911a6c04b9a3c44af36a4dbf7 Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19076)
* c1977e21cdf02d75158bd3dec3b335e27755915c Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19102)
[GitHub] [hudi] codope commented on a diff in pull request #9345: [HUDI-6649] Fix column stat based data filtering for MOR
codope commented on code in PR #9345: URL: https://github.com/apache/hudi/pull/9345#discussion_r1284982434

## hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieFileIndex.scala:

@@ -122,81 +125,120 @@ case class HoodieFileIndex(spark: SparkSession,
    * @return list of PartitionDirectory containing partition to base files mapping
    */
   override def listFiles(partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): Seq[PartitionDirectory] = {
+    // Prune the partition path by the partition filters
+    // NOTE: Non-partitioned tables are assumed to consist from a single partition
+    //       encompassing the whole table
+    val partitionsAndFileSlices = getFileSlicesForPrunedPartitions(partitionFilters)
+    val listedPartitions = filterFileSlices(dataFilters, partitionsAndFileSlices).map {
+      case (partition, fileSlices) =>
+        val allCandidateFiles: Seq[FileStatus] = fileSlices.flatMap(fs => {
+          val baseFileStatusOpt = getBaseFileStatus(Option.apply(fs.getBaseFile.orElse(null)))
+          val logFilesStatus = if (includeLogFiles) {
+            fs.getLogFiles.map[FileStatus](JFunction.toJavaFunction[HoodieLogFile, FileStatus](lf => lf.getFileStatus))
+          } else {
+            java.util.stream.Stream.empty()
+          }
+          val files = logFilesStatus.collect(Collectors.toList[FileStatus]).asScala
+          baseFileStatusOpt.foreach(f => files.append(f))
+          files
+        })
+
+        PartitionDirectory(InternalRow.fromSeq(partition.get.values), allCandidateFiles)
+    }
+
+    hasPushedDownPartitionPredicates = true
+
+    if (shouldReadAsPartitionedTable()) {
+      listedPartitions
+    } else {
+      Seq(PartitionDirectory(InternalRow.empty, listedPartitions.flatMap(_.files)))
+    }
+  }
+
+  def filterFileSlices(dataFilters: Seq[Expression], partitionAndFileSlices: Seq[(Option[BaseHoodieTableFileIndex.PartitionPath], Seq[FileSlice])])
+    : Seq[(Option[BaseHoodieTableFileIndex.PartitionPath], Seq[FileSlice])] = {
     // Look up candidate files names in the col-stats index, if all of the following conditions are true
     //    - Data-skipping is enabled
     //    - Col-Stats Index is present
     //    - List of predicates (filters) is present
     val candidateFilesNamesOpt: Option[Set[String]] =
-      lookupCandidateFilesInMetadataTable(dataFilters) match {
-        case Success(opt) => opt
-        case Failure(e) =>
-          logError("Failed to lookup candidate files in File Index", e)
-
-          spark.sqlContext.getConf(DataSkippingFailureMode.configName, DataSkippingFailureMode.Fallback.value) match {
-            case DataSkippingFailureMode.Fallback.value => Option.empty
-            case DataSkippingFailureMode.Strict.value => throw new HoodieException(e);
-          }
-      }
+      lookupCandidateFilesInMetadataTable(dataFilters) match {
+        case Success(opt) => opt
+        case Failure(e) =>
+          logError("Failed to lookup candidate files in File Index", e)
+
+          spark.sqlContext.getConf(DataSkippingFailureMode.configName, DataSkippingFailureMode.Fallback.value) match {
+            case DataSkippingFailureMode.Fallback.value => Option.empty
+            case DataSkippingFailureMode.Strict.value => throw new HoodieException(e);
+          }
+      }
 
     logDebug(s"Overlapping candidate files from Column Stats Index: ${candidateFilesNamesOpt.getOrElse(Set.empty)}")
 
-    var totalFileSize = 0
-    var candidateFileSize = 0
-
-    // Prune the partition path by the partition filters
-    // NOTE: Non-partitioned tables are assumed to consist from a single partition
-    //       encompassing the whole table
-    val prunedPartitions = listMatchingPartitionPaths(partitionFilters)
-    val listedPartitions = getInputFileSlices(prunedPartitions: _*).asScala.toSeq.map {
-      case (partition, fileSlices) =>
-        val baseFileStatuses: Seq[FileStatus] =
-          getBaseFileStatus(fileSlices
-            .asScala
-            .map(fs => fs.getBaseFile.orElse(null))
-            .filter(_ != null))
+    var totalFileSliceSize = 0
+    var candidateFileSliceSize = 0
+    val listedPartitions = partitionAndFileSlices.map {
+      case (partitionOpt, fileSlices) =>
         // Filter in candidate files based on the col-stats index lookup
-        val candidateFiles = baseFileStatuses.filter(fs =>
-          // NOTE: This predicate is true when {@code Option} is empty
-          candidateFilesNamesOpt.forall(_.contains(fs.getPath.getName)))
+        val candidateFileSlices: Seq[FileSlice] = {
+          fileSlices.filter(fs => {
+            val baseFileStatusOpt = getBaseFileStatus(Option.apply(fs.getBaseFile.orElse(null)))
+            val logFiles = fs.getLogFiles.collect(Collectors.toSet[HoodieLogFile]).asScala.toSet[HoodieLogFile]
+            // NOTE: This predicate is true when {@code Option} is empty
+            if (candidateFilesNamesOpt.forall(files =>
[jira] [Updated] (HUDI-6647) Expand Hudi Java Client Functionality
[ https://issues.apache.org/jira/browse/HUDI-6647?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

ASF GitHub Bot updated HUDI-6647:
---------------------------------
    Labels: pull-request-available  (was: )

> Expand Hudi Java Client Functionality
> -------------------------------------
>
>                 Key: HUDI-6647
>                 URL: https://issues.apache.org/jira/browse/HUDI-6647
>             Project: Apache Hudi
>          Issue Type: Improvement
>            Reporter: Timothy Brown
>            Assignee: Timothy Brown
>            Priority: Major
>              Labels: pull-request-available
>
> With recent improvements to the abstractions in the Hudi codebase we can
> expand the functionality in the java client with a lower amount of effort by
> moving common code into the base client and table services.
[GitHub] [hudi] the-other-tim-brown opened a new pull request, #9371: [HUDI-6647] Expand Hudi Java Client Functionality
the-other-tim-brown opened a new pull request, #9371:
URL: https://github.com/apache/hudi/pull/9371

### Change Logs

Moves common logic into BaseHoodieTableServiceClient and BaseHoodieWriteClient. Adds support for the simple index in the Java client by relying on existing classes and helpers.

### Impact

This enables other features for Java client users and moves us towards more shared code, which means fixes and improvements don't need to be applied across three clients in the future.

### Risk level (write none, low medium or high below)

low
[GitHub] [hudi] hudi-bot commented on pull request #9345: [HUDI-6649] Fix column stat based data filtering for MOR
hudi-bot commented on PR #9345: URL: https://github.com/apache/hudi/pull/9345#issuecomment-1666413441

## CI report:

* 08eeb5b214d2ac2fe81011fbcf990c513e06e1b7 Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19094)
[GitHub] [hudi] hudi-bot commented on pull request #9261: [HUDI-6579] Adding support for upsert and deletes with spark datasource for pk less table
hudi-bot commented on PR #9261: URL: https://github.com/apache/hudi/pull/9261#issuecomment-1666413395

## CI report:

* 3968bf360001624911a6c04b9a3c44af36a4dbf7 Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19076)
* c1977e21cdf02d75158bd3dec3b335e27755915c UNKNOWN
[GitHub] [hudi] lokeshj1703 commented on pull request #9345: [HUDI-6649] Fix column stat based data filtering for MOR
lokeshj1703 commented on PR #9345: URL: https://github.com/apache/hudi/pull/9345#issuecomment-1666408258

@yihua I have summarized the approach.
[GitHub] [hudi] hudi-bot commented on pull request #9367: [HUDI-6648] Support building table views from existing files
hudi-bot commented on PR #9367: URL: https://github.com/apache/hudi/pull/9367#issuecomment-1666403172

## CI report:

* 0e4813a189457ef09caf85dca562698f6ba46a4e Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19092)
* 3da7a8536e1c4bb4a7450d59b0ad32e9ed048c20 Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19101)
[GitHub] [hudi] hudi-bot commented on pull request #9367: [HUDI-6648] Support building table views from existing files
hudi-bot commented on PR #9367: URL: https://github.com/apache/hudi/pull/9367#issuecomment-1666400574

## CI report:

* 0e4813a189457ef09caf85dca562698f6ba46a4e Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19092)
* 3da7a8536e1c4bb4a7450d59b0ad32e9ed048c20 UNKNOWN
[GitHub] [hudi] codope commented on a diff in pull request #9345: [HUDI-6649] Fix column stat based data filtering for MOR
codope commented on code in PR #9345: URL: https://github.com/apache/hudi/pull/9345#discussion_r1284972391

## hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieFileIndex.scala:

@@ -122,86 +141,132 @@ case class HoodieFileIndex(spark: SparkSession,
    * @return list of PartitionDirectory containing partition to base files mapping
    */
   override def listFiles(partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): Seq[PartitionDirectory] = {
-    // Look up candidate files names in the col-stats index, if all of the following conditions are true
-    //    - Data-skipping is enabled
-    //    - Col-Stats Index is present
-    //    - List of predicates (filters) is present
-    val candidateFilesNamesOpt: Option[Set[String]] =
+    // Prune the partition path by the partition filters
+    // NOTE: Non-partitioned tables are assumed to consist from a single partition
+    //       encompassing the whole table
+    val partitionsAndFileSlices = getFileSlicesForPrunedPartitions(partitionFilters)
+    val partitionsAndFilteredFileSlices = filterFileSlices(dataFilters, partitionsAndFileSlices).map {
+      case (partitionOpt, fileSlices) =>
+        val allCandidateFiles: Seq[FileStatus] = fileSlices.flatMap(fs => {
+          val baseFileStatusOpt = getBaseFileStatus(Option.apply(fs.getBaseFile.orElse(null)))
+          val logFilesStatus = if (includeLogFiles) {
+            fs.getLogFiles.map[FileStatus](JFunction.toJavaFunction[HoodieLogFile, FileStatus](lf => lf.getFileStatus))
+          } else {
+            java.util.stream.Stream.empty()
+          }
+          val files = logFilesStatus.collect(Collectors.toList[FileStatus]).asScala
+          baseFileStatusOpt.foreach(f => files.append(f))
+          files
+        })
+
+        PartitionDirectory(InternalRow.fromSeq(partitionOpt.get.values), allCandidateFiles)
+    }
+
+    hasPushedDownPartitionPredicates = true
+
+    if (shouldReadAsPartitionedTable()) {
+      partitionsAndFilteredFileSlices
+    } else {
+      Seq(PartitionDirectory(InternalRow.empty, partitionsAndFilteredFileSlices.flatMap(_.files)))
+    }
+  }
+
+  def filterFileSlices(dataFilters: Seq[Expression], partitionAndFileSlices: Seq[(Option[BaseHoodieTableFileIndex.PartitionPath], Seq[FileSlice])])
+    : Seq[(Option[BaseHoodieTableFileIndex.PartitionPath], Seq[FileSlice])] = {
+    // If there are no data filters, return all the file slices.
+    // If there are no file slices, return empty list.
+    if (partitionAndFileSlices.isEmpty || dataFilters.isEmpty) {
+      partitionAndFileSlices
+    } else {
+      // Look up candidate files names in the col-stats index, if all of the following conditions are true
+      //    - Data-skipping is enabled
+      //    - Col-Stats Index is present
+      //    - List of predicates (filters) is present
+      val candidateFilesNamesOpt: Option[Set[String]] =

Review Comment:
Can you point me to where the file slices are being filtered based on column stats? The `prunedCandidateFileNames` in `lookupCandidateFilesInMetadataTable` will return a set of base files and log files. My understanding is that we then need to check whether any of these pruned files are part of the file slices returned by `getAllFiles()`. If so, take the complete file slice as a candidate; if none match, skip that file slice.
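To make the slice-level rule codope describes concrete: a file slice stays a candidate if any of its base or log files survives the col-stats pruning. The Java sketch below is hypothetical — the names and types are illustrative stand-ins, not Hudi's actual API:

```java
import java.util.List;
import java.util.Set;

// Illustrative stand-in for a Hudi FileSlice: one optional base file plus log files.
final class FileSliceSketch {
  final String baseFileName;       // null for log-only slices
  final List<String> logFileNames;

  FileSliceSketch(String baseFileName, List<String> logFileNames) {
    this.baseFileName = baseFileName;
    this.logFileNames = logFileNames;
  }

  // Keep the complete slice if any member file appears in the pruned candidate set.
  boolean isCandidate(Set<String> candidateFileNames) {
    if (baseFileName != null && candidateFileNames.contains(baseFileName)) {
      return true;
    }
    return logFileNames.stream().anyMatch(candidateFileNames::contains);
  }
}
```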
[GitHub] [hudi] codope commented on a diff in pull request #9370: [HUDI-6650] Do not set default preCombine field for Flink table config
codope commented on code in PR #9370: URL: https://github.com/apache/hudi/pull/9370#discussion_r1284971137

## hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/configuration/OptionsResolver.java:

@@ -126,6 +126,17 @@ public static String getPreCombineField(Configuration conf) {
     return preCombineField.equals(FlinkOptions.NO_PRE_COMBINE) ? null : preCombineField;
   }
 
+  /**
+   * Returns the preCombine field
+   * or null if the value is set as {@link FlinkOptions#NO_PRE_COMBINE} or the user does not config it explicitly.
+   */
+  public static String getPreCombineFieldNoDefaultValue(Configuration conf) {
+    if (!conf.contains(FlinkOptions.PRECOMBINE_FIELD)) {
+      return null;

Review Comment:
Better to return `Option`?
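For illustration, a minimal sketch of the `Option`-returning variant codope suggests, built on Hudi's `org.apache.hudi.common.util.Option` and the `FlinkOptions` constants from the diff above — a sketch, not the merged implementation:

```java
import org.apache.flink.configuration.Configuration;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.configuration.FlinkOptions;

public class OptionsResolverSketch {
  /** Returns the preCombine field, or Option.empty() if unset or explicitly disabled. */
  public static Option<String> getPreCombineFieldOpt(Configuration conf) {
    if (!conf.contains(FlinkOptions.PRECOMBINE_FIELD)) {
      // The user never configured it, so no default is applied.
      return Option.empty();
    }
    String preCombineField = conf.getString(FlinkOptions.PRECOMBINE_FIELD);
    return FlinkOptions.NO_PRE_COMBINE.equals(preCombineField)
        ? Option.empty()
        : Option.of(preCombineField);
  }
}
```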
[GitHub] [hudi] the-other-tim-brown commented on a diff in pull request #9367: [HUDI-6648] Support building table views from existing files
the-other-tim-brown commented on code in PR #9367: URL: https://github.com/apache/hudi/pull/9367#discussion_r1284971042

## hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java:

@@ -324,33 +325,45 @@ public static HoodieRecord createPartitionListRecord(List
    * @param partition    The name of the partition
    * @param filesAdded   Mapping of files to their sizes for files which have been added to this partition
    * @param filesDeleted List of files which have been deleted from this partition
+   * @param instantTime  Commit time of the commit responsible for adding and/or deleting these files, will be empty during bootstrapping of the metadata table
    */
   public static HoodieRecord createPartitionFilesRecord(String partition,
-                                                        Option<Map<String, Long>> filesAdded,
-                                                        Option<List<String>> filesDeleted) {
-    Map<String, HoodieMetadataFileInfo> fileInfo = new HashMap<>();
-    filesAdded.ifPresent(filesMap ->
-        fileInfo.putAll(
-            filesMap.entrySet().stream().collect(
-                Collectors.toMap(Map.Entry::getKey, (entry) -> {
-                  long fileSize = entry.getValue();
-                  // Assert that the file-size of the file being added is positive, since Hudi
-                  // should not be creating empty files
-                  checkState(fileSize > 0);
-                  return new HoodieMetadataFileInfo(fileSize, false);
-                })))
-    );
-    filesDeleted.ifPresent(filesList ->
-        fileInfo.putAll(
-            filesList.stream().collect(
-                Collectors.toMap(Function.identity(), (ignored) -> new HoodieMetadataFileInfo(0L, true))))
-    );
+                                                        Map<String, Long> filesAdded,
+                                                        List<String> filesDeleted,
+                                                        Option<String> instantTime) {
+    int size = filesAdded.size() + filesDeleted.size();
+    Map<String, HoodieMetadataFileInfo> fileInfo = new HashMap<>(size, 1);
+    filesAdded.forEach((fileName, fileSize) -> {
+      // Assert that the file-size of the file being added is positive, since Hudi
+      // should not be creating empty files
+      checkState(fileSize > 0);
+      fileInfo.put(handleFileName(fileName, instantTime), new HoodieMetadataFileInfo(fileSize, false));
+    });
+
+    filesDeleted.forEach(fileName -> fileInfo.put(handleFileName(fileName, instantTime), DELETE_FILE_METADATA));
 
     HoodieKey key = new HoodieKey(partition, MetadataPartitionType.FILES.getPartitionPath());
     HoodieMetadataPayload payload = new HoodieMetadataPayload(key.getRecordKey(), METADATA_TYPE_FILE_LIST, fileInfo);
     return new HoodieAvroRecord<>(key, payload);
   }
 
+  /**
+   * In the case where a file was created by something other than a Hudi writer, the file name will not contain the commit time. We will prefix the file name with hudiext_[commitTime] before storing
+   * in the metadata table. The constructor for {@link org.apache.hudi.common.model.HoodieBaseFile} will properly handle this prefix.
+   * @param fileName incoming file name
+   * @param commitTime time of the commit (will be empty during bootstrap operations)
+   * @return file name with commit time prefix if the input file name does not contain the commit time, otherwise returns the original input
+   */
+  private static String handleFileName(String fileName, Option<String> commitTime) {
+    return commitTime.map(commit -> {
+      if (fileName.contains(commit) || FSUtils.isLogFile(fileName)) {

Review Comment:
I've decided to pursue the approach in the other comment.
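For context, the truncated `handleFileName` above could plausibly complete as below; the `orElse` branch and the delegation to `ExternalFilePathUtil` are inferred from the javadoc, not taken from the merged code:

```java
private static String handleFileName(String fileName, Option<String> commitTime) {
  return commitTime.map(commit -> {
    // Hudi-written files already embed the commit time, and log files are
    // recognized by FSUtils, so both pass through unchanged.
    if (fileName.contains(commit) || FSUtils.isLogFile(fileName)) {
      return fileName;
    }
    return ExternalFilePathUtil.prefixExternalFileWithCommitTime(fileName, commit);
  }).orElse(fileName); // bootstrap: no commit time known, store the name as-is
}
```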
[GitHub] [hudi] codope closed pull request #8933: [HUDI-5329] Spark reads table error when Flink creates table without record key and primary key
codope closed pull request #8933: [HUDI-5329] Spark reads table error when Flink creates table without record key and primary key
URL: https://github.com/apache/hudi/pull/8933
[GitHub] [hudi] the-other-tim-brown commented on a diff in pull request #9367: [HUDI-6648] Support building table views from existing files
the-other-tim-brown commented on code in PR #9367: URL: https://github.com/apache/hudi/pull/9367#discussion_r1284970820

## hudi-common/src/main/java/org/apache/hudi/common/util/ExternalFilePathUtil.java:

@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.common.util;
+
+public class ExternalFilePathUtil {

Review Comment:
Added
[hudi] branch master updated: [HUDI-6639] Rename hoodie.sql.write.operation to hoodie.spark.sql.insert.into.operation (#9359)
This is an automated email from the ASF dual-hosted git repository.

codope pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git

The following commit(s) were added to refs/heads/master by this push:
     new 2fd72c9c2f5  [HUDI-6639] Rename hoodie.sql.write.operation to hoodie.spark.sql.insert.into.operation (#9359)

2fd72c9c2f5 is described below

commit 2fd72c9c2f5d65bbe8aec20f425815e580773f66
Author: Amrish Lal
AuthorDate: Fri Aug 4 22:19:01 2023 -0700

    [HUDI-6639] Rename hoodie.sql.write.operation to hoodie.spark.sql.insert.into.operation (#9359)
---
 .../scala/org/apache/hudi/DataSourceOptions.scala  | 14 ++--
 .../spark/sql/hudi/ProvidesHoodieConfig.scala      | 10 -
 .../apache/spark/sql/hudi/TestCDCForSparkSQL.scala |  5 +++--
 .../apache/spark/sql/hudi/TestCreateTable.scala    |  4 ++--
 .../sql/hudi/TestHoodieTableValuedFunction.scala   |  5 +++--
 .../apache/spark/sql/hudi/TestInsertTable.scala    | 26 +++---
 .../org/apache/spark/sql/hudi/TestSpark3DDL.scala  | 18 +++
 7 files changed, 42 insertions(+), 40 deletions(-)

diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala
index e8c247a7c03..59c2a60a3ad 100644
--- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala
+++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala
@@ -441,7 +441,7 @@ object DataSourceWriteOptions {
     .markAdvanced()
     .deprecatedAfter("0.14.0")
     .withDocumentation("When set to true, the sql insert statement will use bulk insert. " +
-      "This config is deprecated as of 0.14.0. Please use hoodie.sql.write.operation instead.")
+      "This config is deprecated as of 0.14.0. Please use hoodie.spark.sql.insert.into.operation instead.")
 
   @Deprecated
   val SQL_INSERT_MODE: ConfigProperty[String] = ConfigProperty
@@ -452,7 +452,7 @@
       "For upsert mode, insert statement do the upsert operation for the pk-table which will update the duplicate record." +
       "For strict mode, insert statement will keep the primary key uniqueness constraint which do not allow duplicate record." +
       "While for non-strict mode, hudi just do the insert operation for the pk-table. This config is deprecated as of 0.14.0. Please use " +
-      "hoodie.sql.write.operation and hoodie.datasource.insert.dup.policy as you see fit.")
+      "hoodie.spark.sql.insert.into.operation and hoodie.datasource.insert.dup.policy as you see fit.")
 
   val COMMIT_METADATA_KEYPREFIX: ConfigProperty[String] = ConfigProperty
     .key("hoodie.datasource.write.commitmeta.key.prefix")
@@ -528,13 +528,13 @@ object DataSourceWriteOptions {
   val MAKE_NEW_COLUMNS_NULLABLE: ConfigProperty[java.lang.Boolean] = HoodieCommonConfig.MAKE_NEW_COLUMNS_NULLABLE
 
-  val SQL_WRITE_OPERATION: ConfigProperty[String] = ConfigProperty
-    .key("hoodie.sql.write.operation")
-    .defaultValue("insert")
-    .withValidValues("bulk_insert","insert","upsert")
+  val SPARK_SQL_INSERT_INTO_OPERATION: ConfigProperty[String] = ConfigProperty
+    .key("hoodie.spark.sql.insert.into.operation")
+    .defaultValue(WriteOperationType.INSERT.value())
+    .withValidValues(WriteOperationType.BULK_INSERT.value(), WriteOperationType.INSERT.value(), WriteOperationType.UPSERT.value())
     .withDocumentation("Sql write operation to use with INSERT_INTO spark sql command. This comes with 3 possible values, bulk_insert, " +
       "insert and upsert. bulk_insert is generally meant for initial loads and is known to be performant compared to insert. But bulk_insert may not " +
-      "do small file managmeent. If you prefer hudi to automatically managee small files, then you can go with \"insert\". There is no precombine " +
+      "do small file management. If you prefer hudi to automatically manage small files, then you can go with \"insert\". There is no precombine " +
       "(if there are duplicates within the same batch being ingested, same dups will be ingested) with bulk_insert and insert and there is no index " +
       "look up as well. If you may use INSERT_INTO for mutable dataset, then you may have to set this config value to \"upsert\". With upsert, you will " +
       "get both precombine and updates to existing records on storage is also honored. If not, you may see duplicates. ")

diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/ProvidesHoodieConfig.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/ProvidesHoodieConfig.scala
index c66dcc19549..f85032790dd 100644
--- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/ProvidesHoodieConfig.scala
+++ b/hudi-spark-datasource
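For readers following the rename, a minimal, hedged usage sketch of the new key from the Spark Java API (the table name and values are illustrative):

```java
import org.apache.spark.sql.SparkSession;

public class InsertIntoOperationExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("hudi-insert-into-operation")
        .master("local[*]")
        .getOrCreate();
    // Choose which write operation INSERT INTO maps to: bulk_insert, insert, or upsert.
    spark.sql("SET hoodie.spark.sql.insert.into.operation=upsert");
    spark.sql("INSERT INTO hudi_table VALUES (1, 'a', 1000)");
    spark.stop();
  }
}
```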
[GitHub] [hudi] codope merged pull request #9359: [HUDI-6639] Rename hoodie.sql.write.operation to hoodie.spark.sql.insert.into.operation
codope merged PR #9359: URL: https://github.com/apache/hudi/pull/9359
[GitHub] [hudi] the-other-tim-brown commented on a diff in pull request #9367: [HUDI-6648] Support building table views from existing files
the-other-tim-brown commented on code in PR #9367: URL: https://github.com/apache/hudi/pull/9367#discussion_r1284969268

## hudi-common/src/main/java/org/apache/hudi/common/util/ExternalFilePathUtil.java:

@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.common.util;
+
+public class ExternalFilePathUtil {
+  private static final String EXTERNAL_FILE_PREFIX = "hudiext_";
+
+  public static String prefixExternalFileWithCommitTime(String fileName, String commitTime) {

Review Comment:
I think that may be misleading since we're not converting this to a traditional Hudi file path and this is only a fileName (not a full path).
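A plausible completion of the method body truncated above: the PR's javadoc says external files are prefixed with hudiext_[commitTime], but the exact separator between the commit time and the file name is an assumption here, not confirmed by the thread:

```java
public static String prefixExternalFileWithCommitTime(String fileName, String commitTime) {
  // Assumed layout: e.g. ("data.parquet", "20230805093000000")
  //   -> "hudiext_20230805093000000_data.parquet"
  return EXTERNAL_FILE_PREFIX + commitTime + "_" + fileName;
}
```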
[GitHub] [hudi] hudi-bot commented on pull request #9370: [HUDI-6650] Do not set default preCombine field for Flink table config
hudi-bot commented on PR #9370: URL: https://github.com/apache/hudi/pull/9370#issuecomment-1666389367

## CI report:

* 241f148dd3740636568416a5f7d37a9cd329e951 Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19099)
[GitHub] [hudi] hudi-bot commented on pull request #9370: [HUDI-6650] Do not set default preCombine field for Flink table config
hudi-bot commented on PR #9370: URL: https://github.com/apache/hudi/pull/9370#issuecomment-1666387643

## CI report:

* 241f148dd3740636568416a5f7d37a9cd329e951 UNKNOWN
[GitHub] [hudi] hudi-bot commented on pull request #9366: [HUDI-6476][FOLLOW-UP] Path filter by FileStatus to avoid additional fs request
hudi-bot commented on PR #9366: URL: https://github.com/apache/hudi/pull/9366#issuecomment-1666387633

## CI report:

* f26271b49440962d10ad12ba619d3caf12a6091d Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19083)
* 849fd2cea9047912250cddea3b5bad288b4f1661 Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19097)
[GitHub] [hudi] hudi-bot commented on pull request #9365: [HUDI-6646] Add default lock provider for spark offline compaction an…
hudi-bot commented on PR #9365: URL: https://github.com/apache/hudi/pull/9365#issuecomment-1666385863

## CI report:

* 24a796b7d68ed9b0fced46a0bf297ac1ebf21c13 Azure: [CANCELED](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19095)
* 5599728990aa2b2d7e6dae8ef5c54196e94c773e Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19096)
[GitHub] [hudi] hudi-bot commented on pull request #9366: [HUDI-6476][FOLLOW-UP] Path filter by FileStatus to avoid additional fs request
hudi-bot commented on PR #9366: URL: https://github.com/apache/hudi/pull/9366#issuecomment-1666385872

## CI report:

* f26271b49440962d10ad12ba619d3caf12a6091d Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19083)
* 849fd2cea9047912250cddea3b5bad288b4f1661 UNKNOWN
[GitHub] [hudi] hudi-bot commented on pull request #9367: [HUDI-6648] Support building table views from existing files
hudi-bot commented on PR #9367: URL: https://github.com/apache/hudi/pull/9367#issuecomment-1666385880

## CI report:

* 0e4813a189457ef09caf85dca562698f6ba46a4e Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19092)
[hudi] branch asf-site updated: [DOCS][HUDI-6536] Add docs on table version change in 0.11.0 (#9206)
This is an automated email from the ASF dual-hosted git repository.

sivabalan pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/hudi.git

The following commit(s) were added to refs/heads/asf-site by this push:
     new c44da42339d  [DOCS][HUDI-6536] Add docs on table version change in 0.11.0 (#9206)

c44da42339d is described below

commit c44da42339dbe513e1c32fc1aaa3de8bc8f61309
Author: Y Ethan Guo
AuthorDate: Fri Aug 4 21:36:43 2023 -0700

    [DOCS][HUDI-6536] Add docs on table version change in 0.11.0 (#9206)
---
 website/releases/release-0.11.0.md | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/website/releases/release-0.11.0.md b/website/releases/release-0.11.0.md
index 96d39035453..9e11f1f5861 100644
--- a/website/releases/release-0.11.0.md
+++ b/website/releases/release-0.11.0.md
@@ -12,6 +12,12 @@ import TabItem from '@theme/TabItem';
 
 ## Migration Guide
 
+With 0.11.0, we have added a checksum mechanism for validating the `hoodie.proerties`, which introduces a new table version, `4`.
+  Whenever a Hudi job is launched with this release on a table with older table version, an upgrade step is executed automatically to upgrade the table to table version `4`.
+  This automatic upgrade step happens just once per Hudi table as the hoodie.table.version will be updated in property file after upgrade is completed.
+  Similarly, a command line tool for Downgrading (command - downgrade) is added if in case some users want to downgrade Hudi
+  from table version `4` to `3` or move from Hudi 0.11.0 to pre 0.11.0. This needs to be executed from a 0.11.0 hudi-cli binary/script.
+
 ### Bundle usage updates
 
 - Spark bundle for 3.0.x is no longer officially supported. Users are encouraged to upgrade to Spark 3.2 or 3.1.
[GitHub] [hudi] nsivabalan merged pull request #9206: [DOCS][HUDI-6536] Add docs on table version change in 0.11.0
nsivabalan merged PR #9206: URL: https://github.com/apache/hudi/pull/9206
[hudi] branch master updated (35be9bbbc7e -> 84030beb863)
This is an automated email from the ASF dual-hosted git repository.

sivabalan pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git

    from 35be9bbbc7e  [HUDI-6324] Fixing deleting of MDT index (#9248)
     add 84030beb863  [MINOR] Fix typo in config docs (#9295)

No new revisions were added by this update.

Summary of changes:
 .../src/main/java/org/apache/hudi/common/config/ConfigGroups.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
[GitHub] [hudi] nsivabalan merged pull request #9295: [MINOR] Fix typo in config docs
nsivabalan merged PR #9295: URL: https://github.com/apache/hudi/pull/9295
[hudi] branch asf-site updated: [DOCS] Improve Developer Setup page (#9208)
This is an automated email from the ASF dual-hosted git repository.

sivabalan pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/hudi.git

The following commit(s) were added to refs/heads/asf-site by this push:
     new ca8c6879f75  [DOCS] Improve Developer Setup page (#9208)

ca8c6879f75 is described below

commit ca8c6879f75c597257e6078f6fd6fc94311fa540
Author: Y Ethan Guo
AuthorDate: Fri Aug 4 21:35:13 2023 -0700

    [DOCS] Improve Developer Setup page (#9208)
---
 website/contribute/developer-setup.md | 18 ++
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/website/contribute/developer-setup.md b/website/contribute/developer-setup.md
index 5ca759250d6..4b80cee3e15 100644
--- a/website/contribute/developer-setup.md
+++ b/website/contribute/developer-setup.md
@@ -20,32 +20,34 @@ To contribute code, you need
 
 - (Recommended) Join our dev mailing list & slack channel, listed on [community](/community/get-involved) page.
 
-## IDE Setup
+## IntelliJ Setup
 
-To contribute, you would need to do the following
+IntelliJ is the recommended IDE for developing Hudi. To contribute, you would need to do the following
 
-- Fork the Hudi code on Github & then clone your own fork locally. Once cloned, we recommend building as per instructions on [spark quickstart](/docs/quick-start-guide) or [flink quickstart](/docs/flink-quick-start-guide)
+- Fork the Hudi code on Github & then clone your own fork locally. Once cloned, we recommend building as per instructions on [spark quickstart](/docs/quick-start-guide) or [flink quickstart](/docs/flink-quick-start-guide).
+
+- In IntelliJ, select `File` > `New` > `Project from Existing Sources...` and select the `pom.xml` file under your local Hudi source folder.
 
-- In `Project Structure`, select Java 1.8 as the Project SDK
+- In `Project Structure`, select Java 1.8 as the Project SDK.
 
 ![IDE setup java](/assets/images/contributing/IDE_setup_java.png)
 
 - Make the following configuration in `Preferences` or `Settings` in newer IntelliJ so the Hudi code can compile in the IDE:
-  * Enable annotation processing in compiler
+  * Enable annotation processing in compiler.
 
 ![IDE setup annotation processing](/assets/images/contributing/IDE_setup_annotation.png)
 
-  * Configure Maven *NOT* to delegate IDE build/run actions to Maven so you can run tests in IntelliJ directly
+  * Configure Maven *NOT* to delegate IDE build/run actions to Maven so you can run tests in IntelliJ directly.
 
 ![IDE setup maven 1](/assets/images/contributing/IDE_setup_maven_1.png)
 ![IDE setup maven 2](/assets/images/contributing/IDE_setup_maven_2.png)
 
 - If you switch maven build profile, e.g., from Spark 3.2 to Spark 3.3, you need to first build Hudi in the command line first and `Reload All Maven Projects` in IntelliJ like below,
-so that IntelliJ re-indexes the code
+so that IntelliJ re-indexes the code.
 
 ![IDE setup reload](/assets/images/contributing/IDE_setup_reload.png)
 
 - [Recommended] We have embraced the code style largely based on [google format](https://google.github.io/styleguide/javaguide.html). Please set up your IDE with style files from [/style/](https://github.com/apache/hudi/tree/master/style). These instructions have been tested on IntelliJ.
 
-* Open Project Preferences in IDE
+* Open Project Preferences in IntelliJ
 * Install and activate CheckStyle plugin
 
 ![IDE_setup_checkstyle_1](/assets/images/contributing/IDE_setup_checkstyle_1.png)
[GitHub] [hudi] nsivabalan merged pull request #9208: [DOCS] Improve Developer Setup page
nsivabalan merged PR #9208: URL: https://github.com/apache/hudi/pull/9208
[jira] [Updated] (HUDI-6650) Do not set default preCombine field for Flink table config
[ https://issues.apache.org/jira/browse/HUDI-6650?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

ASF GitHub Bot updated HUDI-6650:
---------------------------------
    Labels: pull-request-available  (was: )

> Do not set default preCombine field for Flink table config
> -----------------------------------------------------------
>
>                 Key: HUDI-6650
>                 URL: https://issues.apache.org/jira/browse/HUDI-6650
>             Project: Apache Hudi
>          Issue Type: Bug
>          Components: flink
>            Reporter: Danny Chen
>            Priority: Major
>              Labels: pull-request-available
>             Fix For: 0.14.0
>
[GitHub] [hudi] danny0405 opened a new pull request, #9370: [HUDI-6650] Do not set default preCombine field for Flink table config
danny0405 opened a new pull request, #9370:
URL: https://github.com/apache/hudi/pull/9370

### Change Logs

Do not set the preCombine field when the user does not set it up explicitly. This behavior is kept in line with Spark.

### Impact

none.

### Risk level (write none, low medium or high below)

none
[GitHub] [hudi] hudi-bot commented on pull request #9369: [HUDI-6386] Fix flakey multiwriter tests
hudi-bot commented on PR #9369: URL: https://github.com/apache/hudi/pull/9369#issuecomment-1666376792

## CI report:

* 68ba8ae97df89a261298dbb06c80f43c57159969 UNKNOWN
* 2d7ced1d3723f9c6357378c3849ebca7e2db4904 Azure: [SUCCESS](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19090)
[GitHub] [hudi] hudi-bot commented on pull request #9345: [HUDI-6649] Fix column stat based data filtering for MOR
hudi-bot commented on PR #9345: URL: https://github.com/apache/hudi/pull/9345#issuecomment-1666376768

## CI report:

* 600d19c601ec8b224643b7e4c0a7c0f7a2e1e290 Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19089)
* 08eeb5b214d2ac2fe81011fbcf990c513e06e1b7 Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19094)
[GitHub] [hudi] hudi-bot commented on pull request #9365: [HUDI-6646] Add default lock provider for spark offline compaction an…
hudi-bot commented on PR #9365: URL: https://github.com/apache/hudi/pull/9365#issuecomment-1666376785

## CI report:

* 0aed2250937742799e04e42c511e7fa08a5b92a2 Azure: [CANCELED](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19093)
* 24a796b7d68ed9b0fced46a0bf297ac1ebf21c13 Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19095)
* 5599728990aa2b2d7e6dae8ef5c54196e94c773e UNKNOWN
[jira] [Created] (HUDI-6650) Do not set default preCombine field for Flink table config
Danny Chen created HUDI-6650:
--------------------------------

             Summary: Do not set default preCombine field for Flink table config
                 Key: HUDI-6650
                 URL: https://issues.apache.org/jira/browse/HUDI-6650
             Project: Apache Hudi
          Issue Type: Bug
          Components: flink
            Reporter: Danny Chen
             Fix For: 0.14.0
[GitHub] [hudi] hudi-bot commented on pull request #9365: [HUDI-6646] Add default lock provider for spark offline compaction an…
hudi-bot commented on PR #9365: URL: https://github.com/apache/hudi/pull/9365#issuecomment-1666375333

## CI report:

* d6ffde7bdc33f53357e131703071c5040793b715 Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19072)
* 0aed2250937742799e04e42c511e7fa08a5b92a2 Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19093)
* 24a796b7d68ed9b0fced46a0bf297ac1ebf21c13 UNKNOWN
[GitHub] [hudi] hudi-bot commented on pull request #9345: [HUDI-6649] Fix column stat based data filtering for MOR
hudi-bot commented on PR #9345: URL: https://github.com/apache/hudi/pull/9345#issuecomment-1666375320

## CI report:

* 600d19c601ec8b224643b7e4c0a7c0f7a2e1e290 Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19089)
* 08eeb5b214d2ac2fe81011fbcf990c513e06e1b7 UNKNOWN
[GitHub] [hudi] hudi-bot commented on pull request #9276: [HUDI-6635] Hudi Spark Integration Redesign MOR and Bootstrap reading
hudi-bot commented on PR #9276: URL: https://github.com/apache/hudi/pull/9276#issuecomment-1666373712

## CI report:

* 662f3b320ab6ea06462bad9a4448add1ec2f380a UNKNOWN
* def394b73203814bbb635841c5f07c216c0575cc Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19091)
[GitHub] [hudi] hudi-bot commented on pull request #9336: [HUDI-6629] - Changes for s3/gcs IncrSource job to taken into sourceLimit during ingestion
hudi-bot commented on PR #9336: URL: https://github.com/apache/hudi/pull/9336#issuecomment-1666358274

## CI report:

* 77d7b455ee5cd668a005f6f7e6f04135608f2b7a UNKNOWN
* ae20a8550d647dbaec9428f11bfac0751a2c871c Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19086)
[GitHub] [hudi] hudi-bot commented on pull request #9365: [HUDI-6646] Add default lock provider for spark offline compaction an…
hudi-bot commented on PR #9365: URL: https://github.com/apache/hudi/pull/9365#issuecomment-1666344460

## CI report:

* d6ffde7bdc33f53357e131703071c5040793b715 Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19072)
* 0aed2250937742799e04e42c511e7fa08a5b92a2 Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19093)
[GitHub] [hudi] hudi-bot commented on pull request #9365: [HUDI-6646] Add default lock provider for spark offline compaction an…
hudi-bot commented on PR #9365: URL: https://github.com/apache/hudi/pull/9365#issuecomment-1666340015

## CI report:

* d6ffde7bdc33f53357e131703071c5040793b715 Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19072)
* 0aed2250937742799e04e42c511e7fa08a5b92a2 UNKNOWN
[GitHub] [hudi] hudi-bot commented on pull request #8718: [HUDI-6214] Enabling compaction by default for batch writes with MOR table
hudi-bot commented on PR #8718: URL: https://github.com/apache/hudi/pull/8718#issuecomment-1666339426

## CI report:

* e15a030cdb08d00401b55e6d49cebcb6892f218e Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19087)
[GitHub] [hudi] hudi-bot commented on pull request #9359: [HUDI-6639] Rename hoodie.sql.write.operation to hoodie.spark.sql.insert.into.operation
hudi-bot commented on PR #9359: URL: https://github.com/apache/hudi/pull/9359#issuecomment-1666337335 ## CI report: * c695d8fc59ebdbf339f40ea31db465858ea83f98 Azure: [SUCCESS](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19085) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] xuzifu666 closed pull request #9364: [HUDI-6645] Relax the restriction for Spark MDT rollback
xuzifu666 closed pull request #9364: [HUDI-6645] Relax the restriction for Spark MDT rollback URL: https://github.com/apache/hudi/pull/9364 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] xuzifu666 commented on a diff in pull request #9364: [HUDI-6645] Relax the restriction for Spark MDT rollback
xuzifu666 commented on code in PR #9364: URL: https://github.com/apache/hudi/pull/9364#discussion_r1284932889 ## hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java: ## @@ -1022,22 +1020,6 @@ public void update(HoodieRollbackMetadata rollbackMetadata, String instantTime) } } - protected void validateRollback( Review Comment: this change is needed for compaction with Spark; otherwise it would report an error @yihua -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] danny0405 commented on a diff in pull request #9365: [HUDI-6646] Add default lock provider for spark offline compaction an…
danny0405 commented on code in PR #9365: URL: https://github.com/apache/hudi/pull/9365#discussion_r1284932402 ## hudi-utilities/src/main/java/org/apache/hudi/utilities/UtilHelpers.java: ## @@ -578,6 +580,20 @@ public static HoodieTableMetaClient createMetaClient( .build(); } + public static void addLockOptions(String basePath, TypedProperties props) { +if (!props.containsKey(HoodieLockConfig.LOCK_PROVIDER_CLASS_NAME.key())) { + HoodieLockConfig lockConfig = HoodieLockConfig.newBuilder() + .withLockProvider(FileSystemBasedLockProvider.class) + .withLockWaitTimeInMillis(2000L) // 2s + .withFileSystemLockExpire(1) // 1 minute + .withClientNumRetries(30) Review Comment: Not sure; I just kept the config the same as Flink's to make it more friendly to streaming jobs. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #9345: [HUDI-6649] Fix column stat based data filtering for MOR
hudi-bot commented on PR #9345: URL: https://github.com/apache/hudi/pull/9345#issuecomment-1666307069 ## CI report: * 600d19c601ec8b224643b7e4c0a7c0f7a2e1e290 Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19089) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] the-other-tim-brown commented on a diff in pull request #9367: [HUDI-6648] Support building table views from existing files
the-other-tim-brown commented on code in PR #9367: URL: https://github.com/apache/hudi/pull/9367#discussion_r1284923832 ## hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java: ## @@ -324,33 +325,45 @@ public static HoodieRecord createPartitionListRecord(List * @param partitionThe name of the partition * @param filesAdded Mapping of files to their sizes for files which have been added to this partition * @param filesDeleted List of files which have been deleted from this partition + * @param instantTime Commit time of the commit responsible for adding and/or deleting these files, will be empty during bootstrapping of the metadata table */ public static HoodieRecord createPartitionFilesRecord(String partition, - Option> filesAdded, - Option> filesDeleted) { -Map fileInfo = new HashMap<>(); -filesAdded.ifPresent(filesMap -> -fileInfo.putAll( -filesMap.entrySet().stream().collect( -Collectors.toMap(Map.Entry::getKey, (entry) -> { - long fileSize = entry.getValue(); - // Assert that the file-size of the file being added is positive, since Hudi - // should not be creating empty files - checkState(fileSize > 0); - return new HoodieMetadataFileInfo(fileSize, false); -}))) -); -filesDeleted.ifPresent(filesList -> -fileInfo.putAll( -filesList.stream().collect( -Collectors.toMap(Function.identity(), (ignored) -> new HoodieMetadataFileInfo(0L, true -); + Map filesAdded, + List filesDeleted, + Option instantTime) { +int size = filesAdded.size() + filesDeleted.size(); +Map fileInfo = new HashMap<>(size, 1); +filesAdded.forEach((fileName, fileSize) -> { + // Assert that the file-size of the file being added is positive, since Hudi + // should not be creating empty files + checkState(fileSize > 0); + fileInfo.put(handleFileName(fileName, instantTime), new HoodieMetadataFileInfo(fileSize, false)); +}); + +filesDeleted.forEach(fileName -> fileInfo.put(handleFileName(fileName, instantTime), DELETE_FILE_METADATA)); HoodieKey key = new HoodieKey(partition, MetadataPartitionType.FILES.getPartitionPath()); HoodieMetadataPayload payload = new HoodieMetadataPayload(key.getRecordKey(), METADATA_TYPE_FILE_LIST, fileInfo); return new HoodieAvroRecord<>(key, payload); } + /** + * In the case where a file was created by something other than a Hudi writer, the file name will not contain the commit time. We will prefix the file name with hudiext_[commitTime] before storing + * in the metadata table. The constructor for {@link org.apache.hudi.common.model.HoodieBaseFile} will properly handle this prefix. + * @param fileName incoming file name + * @param commitTime time of the commit (will be empty during bootstrap operations) + * @return file name with commit time prefix if the input file name does not contain the commit time, otherwise returns the original input + */ + private static String handleFileName(String fileName, Option commitTime) { Review Comment: Yes that's correct, see https://github.com/apache/hudi/pull/9367#discussion_r1284890274 for a slightly different approach. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] the-other-tim-brown commented on a diff in pull request #9367: [HUDI-6648] Support building table views from existing files
the-other-tim-brown commented on code in PR #9367: URL: https://github.com/apache/hudi/pull/9367#discussion_r1284923741 ## hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java: ## @@ -324,33 +325,45 @@ public static HoodieRecord createPartitionListRecord(List * @param partitionThe name of the partition * @param filesAdded Mapping of files to their sizes for files which have been added to this partition * @param filesDeleted List of files which have been deleted from this partition + * @param instantTime Commit time of the commit responsible for adding and/or deleting these files, will be empty during bootstrapping of the metadata table */ public static HoodieRecord createPartitionFilesRecord(String partition, - Option> filesAdded, - Option> filesDeleted) { -Map fileInfo = new HashMap<>(); -filesAdded.ifPresent(filesMap -> -fileInfo.putAll( -filesMap.entrySet().stream().collect( -Collectors.toMap(Map.Entry::getKey, (entry) -> { - long fileSize = entry.getValue(); - // Assert that the file-size of the file being added is positive, since Hudi - // should not be creating empty files - checkState(fileSize > 0); - return new HoodieMetadataFileInfo(fileSize, false); -}))) -); -filesDeleted.ifPresent(filesList -> -fileInfo.putAll( -filesList.stream().collect( -Collectors.toMap(Function.identity(), (ignored) -> new HoodieMetadataFileInfo(0L, true -); + Map filesAdded, + List filesDeleted, + Option instantTime) { +int size = filesAdded.size() + filesDeleted.size(); +Map fileInfo = new HashMap<>(size, 1); +filesAdded.forEach((fileName, fileSize) -> { + // Assert that the file-size of the file being added is positive, since Hudi + // should not be creating empty files + checkState(fileSize > 0); + fileInfo.put(handleFileName(fileName, instantTime), new HoodieMetadataFileInfo(fileSize, false)); +}); + +filesDeleted.forEach(fileName -> fileInfo.put(handleFileName(fileName, instantTime), DELETE_FILE_METADATA)); HoodieKey key = new HoodieKey(partition, MetadataPartitionType.FILES.getPartitionPath()); HoodieMetadataPayload payload = new HoodieMetadataPayload(key.getRecordKey(), METADATA_TYPE_FILE_LIST, fileInfo); return new HoodieAvroRecord<>(key, payload); } + /** + * In the case where a file was created by something other than a Hudi writer, the file name will not contain the commit time. We will prefix the file name with hudiext_[commitTime] before storing + * in the metadata table. The constructor for {@link org.apache.hudi.common.model.HoodieBaseFile} will properly handle this prefix. + * @param fileName incoming file name + * @param commitTime time of the commit (will be empty during bootstrap operations) + * @return file name with commit time prefix if the input file name does not contain the commit time, otherwise returns the original input + */ + private static String handleFileName(String fileName, Option commitTime) { +return commitTime.map(commit -> { + if (fileName.contains(commit) || FSUtils.isLogFile(fileName)) { Review Comment: Can you check out my notes on this and add to the conversation there? https://github.com/apache/hudi/pull/9367#discussion_r1284890274 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] the-other-tim-brown commented on a diff in pull request #9367: [HUDI-6648] Support building table views from existing files
the-other-tim-brown commented on code in PR #9367: URL: https://github.com/apache/hudi/pull/9367#discussion_r1284923366 ## hudi-common/src/main/java/org/apache/hudi/common/model/HoodieBaseFile.java: ## @@ -61,16 +62,39 @@ public HoodieBaseFile(String filePath) { public HoodieBaseFile(String filePath, BaseFile bootstrapBaseFile) { super(filePath); this.bootstrapBaseFile = Option.ofNullable(bootstrapBaseFile); -String[] fileIdAndCommitTime = getFileIdAndCommitTimeFromFileName(); +String[] fileIdAndCommitTime = getFileIdAndCommitTimeFromFileName(getFileName()); this.fileId = fileIdAndCommitTime[0]; this.commitTime = fileIdAndCommitTime[1]; } + public HoodieBaseFile(String filePath, String fileId, String commitTime, BaseFile bootstrapBaseFile) { +super(filePath); +this.bootstrapBaseFile = Option.ofNullable(bootstrapBaseFile); +this.fileId = fileId; +this.commitTime = commitTime; + } + + private HoodieBaseFile(FileStatus fileStatus, String[] fileIdAndCommitTime, BaseFile bootstrapBaseFile) { +this(fileStatus, fileIdAndCommitTime[0], fileIdAndCommitTime[1], bootstrapBaseFile); + } + + public HoodieBaseFile(FileStatus fileStatus, String fileId, String commitTime, BaseFile bootstrapBaseFile) { +super(handleExternallyGeneratedFileName(fileStatus, fileId)); Review Comment: No, you can see the usages in the PR are by other constructors and the DTO class. That means that a lot of paths will hit this code. We currently don't have any validations on file paths, just assumptions. Should we start adding that now? It should go in a separate patch if so. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
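For context on the HoodieBaseFile constructors discussed above: Hudi-written base files conventionally embed both identifiers in the name as <fileId>_<writeToken>_<instantTime>.<extension>, which is why fileId and commitTime can normally be recovered from the file name alone, and why externally created files cannot. A simplified sketch of that parse (the real logic in HoodieBaseFile/FSUtils handles write tokens and more edge cases):

    // Simplified illustration only, e.g. "a1b2c3_1-0-1_20230805103000.parquet"
    static String[] getFileIdAndCommitTimeFromFileName(String fileName) {
      String[] parts = fileName.split("_");
      String fileId = parts[0];
      String commitTime = parts[2].substring(0, parts[2].indexOf('.')); // strip ".parquet"
      return new String[] {fileId, commitTime};
    }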
[GitHub] [hudi] jonvex commented on a diff in pull request #9276: [HUDI-6635] Hudi Spark Integration Redesign MOR and Bootstrap reading
jonvex commented on code in PR #9276: URL: https://github.com/apache/hudi/pull/9276#discussion_r1284922051 ## hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/NewHoodieParquetFileFormat.scala: ## @@ -0,0 +1,353 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql.execution.datasources.parquet + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.hudi.DataSourceReadOptions.{REALTIME_PAYLOAD_COMBINE_OPT_VAL, REALTIME_SKIP_MERGE_OPT_VAL} +import org.apache.hudi.MergeOnReadSnapshotRelation.createPartitionedFile +import org.apache.hudi.common.fs.FSUtils +import org.apache.hudi.common.model.{BaseFile, FileSlice, HoodieLogFile, HoodieRecord} +import org.apache.hudi.common.util.ValidationUtils.checkState +import org.apache.hudi.{HoodieBaseRelation, HoodieSparkUtils, HoodieTableSchema, HoodieTableState, LogFileIterator, MergeOnReadSnapshotRelation, PartitionFileSliceMapping, RecordMergingFileIterator, SkipMergeIterator, SparkAdapterSupport} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.sql.HoodieCatalystExpressionUtils.generateUnsafeProjection +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.JoinedRow +import org.apache.spark.sql.execution.datasources.PartitionedFile +import org.apache.spark.sql.hudi.HoodieSqlCommonUtils.isMetaField +import org.apache.spark.sql.sources.Filter +import org.apache.spark.sql.types.{StructField, StructType} +import org.apache.spark.sql.vectorized.{ColumnVector, ColumnarBatch} +import org.apache.spark.util.SerializableConfiguration + +import scala.collection.mutable +import scala.jdk.CollectionConverters.asScalaIteratorConverter + +class NewHoodieParquetFileFormat(tableState: Broadcast[HoodieTableState], + tableSchema: Broadcast[HoodieTableSchema], + tableName: String, + mergeType: String, + mandatoryFields: Seq[String], + isMOR: Boolean, + isBootstrap: Boolean) extends ParquetFileFormat with SparkAdapterSupport { + + //Used so that the planner only projects once and does not stack overflow + var isProjected = false + + /** + * Support batch needs to remain consistent, even if one side of a bootstrap merge can support + * while the other side can't + */ + private var supportBatchCalled = false + private var supportBatchResult = false + override def supportBatch(sparkSession: SparkSession, schema: StructType): Boolean = { +if (!supportBatchCalled) { + supportBatchCalled = true + supportBatchResult = !isMOR && super.supportBatch(sparkSession, schema) +} +supportBatchResult + } + + override def buildReaderWithPartitionValues(sparkSession: SparkSession, + dataSchema: 
StructType, + partitionSchema: StructType, + requiredSchema: StructType, + filters: Seq[Filter], + options: Map[String, String], + hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = { + +val outputSchema = StructType(requiredSchema.fields ++ partitionSchema.fields) + +val requiredSchemaWithMandatory = if (!isMOR || MergeOnReadSnapshotRelation.isProjectionCompatible(tableState.value)) { + //add mandatory fields to required schema + val added: mutable.Buffer[StructField] = mutable.Buffer[StructField]() + for (field <- mandatoryFields) { +if (requiredSchema.getFieldIndex(field).isEmpty) { + val fieldToAdd = dataSchema.fields(dataSchema.getFieldIndex(field).get) + added.append(fieldToAdd) +} + } + val addedFields = StructType(added.toArray) + StructType(requiredSchema.toArray ++ addedFields.fields) +} else {
[GitHub] [hudi] jonvex commented on a diff in pull request #9276: [HUDI-6635] Hudi Spark Integration Redesign MOR and Bootstrap reading
jonvex commented on code in PR #9276: URL: https://github.com/apache/hudi/pull/9276#discussion_r1284921590 ## hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/LegacyHoodieParquetFileFormat.scala: ## @@ -23,12 +23,12 @@ import org.apache.hudi.{DataSourceReadOptions, HoodieSparkUtils, SparkAdapterSup import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.PartitionedFile -import org.apache.spark.sql.execution.datasources.parquet.HoodieParquetFileFormat.FILE_FORMAT_ID +import org.apache.spark.sql.execution.datasources.parquet.LegacyHoodieParquetFileFormat.FILE_FORMAT_ID import org.apache.spark.sql.sources.Filter import org.apache.spark.sql.types.{AtomicType, StructType} -class HoodieParquetFileFormat extends ParquetFileFormat with SparkAdapterSupport { +class LegacyHoodieParquetFileFormat extends ParquetFileFormat with SparkAdapterSupport { Review Comment: I think changes to the relation and especially rdd classes are more likely to require changes in the new file format than changes to LegacyHoodieParquetFileFormat. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #9366: [HUDI-6476][FOLLOW-UP] Restore FileStatus instead of Path to avoid additional fs request
hudi-bot commented on PR #9366: URL: https://github.com/apache/hudi/pull/9366#issuecomment-1666275223 ## CI report: * f26271b49440962d10ad12ba619d3caf12a6091d Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19083) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] yihua commented on pull request #9223: [HUDI-6553] Speedup column stats and bloom index creation on large datasets.
yihua commented on PR #9223: URL: https://github.com/apache/hudi/pull/9223#issuecomment-1666269468 Looks like a similar optimization has been merged before: #8856. Should this PR be rebased first before being ready for review again? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] yihua commented on a diff in pull request #9255: [HUDI-6503] Make TableServiceClient's txnManager consistent with Writ…
yihua commented on code in PR #9255: URL: https://github.com/apache/hudi/pull/9255#discussion_r1284902386 ## hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/SparkRDDTableServiceClient.java: ## @@ -67,8 +68,9 @@ public class SparkRDDTableServiceClient extends BaseHoodieTableServiceClient< protected SparkRDDTableServiceClient(HoodieEngineContext context, HoodieWriteConfig clientConfig, - Option timelineService) { -super(context, clientConfig, timelineService); + Option timelineService, + Option txnManager) { Review Comment: Not sure if it's a good idea to expose this in the constructor, as the client can escape the transaction manager and cause correctness issues in concurrency control. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
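To make the concern concrete, a hedged sketch under assumed API shapes (only the four-argument constructor appears in the diff; the getTransactionManager accessor below is hypothetical): with constructor injection, a caller could pass in any transaction manager, so two clients coordinating on the same table are only safe if they share the write client's instance.

    // Hypothetical accessor; illustrates the sharing the PR intends
    TransactionManager shared = writeClient.getTransactionManager();
    SparkRDDTableServiceClient serviceClient =
        new SparkRDDTableServiceClient(context, clientConfig, timelineService, Option.of(shared));
    // Passing Option.empty() (or a fresh manager) would give the table service
    // client a separate lock path, which is the correctness risk raised here.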
[GitHub] [hudi] yihua commented on pull request #9255: [HUDI-6503] Make TableServiceClient's txnManager consistent with Writ…
yihua commented on PR #9255: URL: https://github.com/apache/hudi/pull/9255#issuecomment-1666264355 > > Can you elaborate a little more why the table service client can hold a separate lock ? > > Because `InProcessLockProvider` is valid as long as it is in the same JVM process (see `static final Map LOCK_INSTANCE_PER_BASEPATH = new ConcurrentHashMap<>();`), other locks can not even be in the same JVM. Maybe i'm missing something. > > Of course, it is best to use the same lock manager, just like it used to be (before #6732). And CI seems to be stable now by the way. `LOCK_INSTANCE_PER_BASEPATH` is already static final, and multiple `InProcessLockProvider` instances using the same table base path in the same process should get the same underlying `ReentrantReadWriteLock` instance. Check this test which passes now: `TestInProcessLockProvider.testLockIdentity()`. There was a bug before in `InProcessLockProvider` which is fixed in #8658. Not sure if you hit that. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
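A self-contained illustration (not the actual Hudi source) of why two InProcessLockProvider instances on the same base path contend on one lock, per the static map quoted above:

    import java.util.Map;
    import java.util.concurrent.ConcurrentHashMap;
    import java.util.concurrent.locks.ReentrantReadWriteLock;

    class InProcessLockSketch {
      // static: shared across all provider instances in the JVM process
      static final Map<String, ReentrantReadWriteLock> LOCK_INSTANCE_PER_BASEPATH = new ConcurrentHashMap<>();

      static ReentrantReadWriteLock forBasePath(String basePath) {
        // computeIfAbsent returns the existing lock if one was already registered
        // for this base path, so every provider in the process sees the same instance
        return LOCK_INSTANCE_PER_BASEPATH.computeIfAbsent(basePath, p -> new ReentrantReadWriteLock());
      }
    }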
[GitHub] [hudi] nsivabalan commented on a diff in pull request #9367: [HUDI-6648] Support building table views from existing files
nsivabalan commented on code in PR #9367: URL: https://github.com/apache/hudi/pull/9367#discussion_r1284898855 ## hudi-common/src/main/java/org/apache/hudi/common/util/ExternalFilePathUtil.java: ## @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.util; + +public class ExternalFilePathUtil { + private static final String EXTERNAL_FILE_PREFIX = "hudiext_"; + + public static String prefixExternalFileWithCommitTime(String fileName, String commitTime) { Review Comment: naming: convertExternalFileToHudiFilePath ## hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java: ## @@ -324,33 +325,45 @@ public static HoodieRecord createPartitionListRecord(List * @param partitionThe name of the partition * @param filesAdded Mapping of files to their sizes for files which have been added to this partition * @param filesDeleted List of files which have been deleted from this partition + * @param instantTime Commit time of the commit responsible for adding and/or deleting these files, will be empty during bootstrapping of the metadata table */ public static HoodieRecord createPartitionFilesRecord(String partition, - Option> filesAdded, - Option> filesDeleted) { -Map fileInfo = new HashMap<>(); -filesAdded.ifPresent(filesMap -> -fileInfo.putAll( -filesMap.entrySet().stream().collect( -Collectors.toMap(Map.Entry::getKey, (entry) -> { - long fileSize = entry.getValue(); - // Assert that the file-size of the file being added is positive, since Hudi - // should not be creating empty files - checkState(fileSize > 0); - return new HoodieMetadataFileInfo(fileSize, false); -}))) -); -filesDeleted.ifPresent(filesList -> -fileInfo.putAll( -filesList.stream().collect( -Collectors.toMap(Function.identity(), (ignored) -> new HoodieMetadataFileInfo(0L, true -); + Map filesAdded, + List filesDeleted, + Option instantTime) { +int size = filesAdded.size() + filesDeleted.size(); +Map fileInfo = new HashMap<>(size, 1); +filesAdded.forEach((fileName, fileSize) -> { + // Assert that the file-size of the file being added is positive, since Hudi + // should not be creating empty files + checkState(fileSize > 0); + fileInfo.put(handleFileName(fileName, instantTime), new HoodieMetadataFileInfo(fileSize, false)); +}); + +filesDeleted.forEach(fileName -> fileInfo.put(handleFileName(fileName, instantTime), DELETE_FILE_METADATA)); HoodieKey key = new HoodieKey(partition, MetadataPartitionType.FILES.getPartitionPath()); HoodieMetadataPayload payload = new HoodieMetadataPayload(key.getRecordKey(), METADATA_TYPE_FILE_LIST, fileInfo); return new HoodieAvroRecord<>(key, payload); } + /** + * In the case where a file was created by something other than a Hudi writer, the file name will not contain the commit time. 
We will prefix the file name with hudiext_[commitTime] before storing Review Comment: Let's try to use the same terminology everywhere. If we standardize on "externally created files", let's use the same phrase everywhere to avoid confusion ## hudi-common/src/main/java/org/apache/hudi/common/config/HoodieMetadataConfig.java: ## @@ -312,6 +312,13 @@ public final class HoodieMetadataConfig extends HoodieConfig { .withDocumentation("Maximum size in bytes of a single log file. Larger log files can contain larger log blocks " + "thereby reducing the number of blocks to search for keys"); + public static final ConfigProperty<Boolean> DISABLE_FILE
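For reference, a hedged sketch of the prefix utility excerpted above; the method body is elided in the diff, so the exact layout (separator, ordering) is an assumption rather than something the PR confirms:

    public static String prefixExternalFileWithCommitTime(String fileName, String commitTime) {
      // assumed format, e.g. "hudiext_20230805103000_data.parquet"
      return EXTERNAL_FILE_PREFIX + commitTime + "_" + fileName;
    }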
[GitHub] [hudi] the-other-tim-brown commented on a diff in pull request #9367: [HUDI-6648] Support building table views from existing files
the-other-tim-brown commented on code in PR #9367: URL: https://github.com/apache/hudi/pull/9367#discussion_r1284897744 ## hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java: ## @@ -324,33 +325,45 @@ public static HoodieRecord createPartitionListRecord(List * @param partitionThe name of the partition * @param filesAdded Mapping of files to their sizes for files which have been added to this partition * @param filesDeleted List of files which have been deleted from this partition + * @param instantTime Commit time of the commit responsible for adding and/or deleting these files, will be empty during bootstrapping of the metadata table */ public static HoodieRecord createPartitionFilesRecord(String partition, - Option> filesAdded, - Option> filesDeleted) { -Map fileInfo = new HashMap<>(); -filesAdded.ifPresent(filesMap -> -fileInfo.putAll( -filesMap.entrySet().stream().collect( -Collectors.toMap(Map.Entry::getKey, (entry) -> { - long fileSize = entry.getValue(); - // Assert that the file-size of the file being added is positive, since Hudi - // should not be creating empty files - checkState(fileSize > 0); - return new HoodieMetadataFileInfo(fileSize, false); -}))) -); -filesDeleted.ifPresent(filesList -> -fileInfo.putAll( -filesList.stream().collect( -Collectors.toMap(Function.identity(), (ignored) -> new HoodieMetadataFileInfo(0L, true -); + Map filesAdded, + List filesDeleted, + Option instantTime) { +int size = filesAdded.size() + filesDeleted.size(); +Map fileInfo = new HashMap<>(size, 1); +filesAdded.forEach((fileName, fileSize) -> { + // Assert that the file-size of the file being added is positive, since Hudi + // should not be creating empty files + checkState(fileSize > 0); + fileInfo.put(handleFileName(fileName, instantTime), new HoodieMetadataFileInfo(fileSize, false)); +}); + +filesDeleted.forEach(fileName -> fileInfo.put(handleFileName(fileName, instantTime), DELETE_FILE_METADATA)); HoodieKey key = new HoodieKey(partition, MetadataPartitionType.FILES.getPartitionPath()); HoodieMetadataPayload payload = new HoodieMetadataPayload(key.getRecordKey(), METADATA_TYPE_FILE_LIST, fileInfo); return new HoodieAvroRecord<>(key, payload); } + /** + * In the case where a file was created by something other than a Hudi writer, the file name will not contain the commit time. We will prefix the file name with hudiext_[commitTime] before storing + * in the metadata table. The constructor for {@link org.apache.hudi.common.model.HoodieBaseFile} will properly handle this prefix. + * @param fileName incoming file name + * @param commitTime time of the commit (will be empty during bootstrap operations) + * @return file name with commit time prefix if the input file name does not contain the commit time, otherwise returns the original input + */ + private static String handleFileName(String fileName, Option commitTime) { Review Comment: I tested this out locally and this approach will work. The more I consider it, I think that it could be best to force the caller to pass in the modified path rather than trying to infer whether or not we should modify it here. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] yihua commented on pull request #9345: [HUDI-6649] Fix column stat based data filtering for MOR
yihua commented on PR #9345: URL: https://github.com/apache/hudi/pull/9345#issuecomment-1666254533 @lokeshj1703 Could you summarize the approach in the PR description? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] yihua commented on pull request #9276: [HUDI-6635] Hudi Spark Integration Redesign MOR and Bootstrap reading
yihua commented on PR #9276: URL: https://github.com/apache/hudi/pull/9276#issuecomment-1666254051 @jonvex Could you also update the PR description with details of the approach? Before merging this PR, let's create a new PR based on this patch with `hoodie.datasource.read.use.legacy.parquet.file.format=false` and make sure tests pass on MOR and bootstrap queries. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] nsivabalan commented on a diff in pull request #9367: [HUDI-6648] Support building table views from existing files
nsivabalan commented on code in PR #9367: URL: https://github.com/apache/hudi/pull/9367#discussion_r1284895611 ## hudi-common/src/main/java/org/apache/hudi/common/config/HoodieMetadataConfig.java: ## @@ -312,6 +312,13 @@ public final class HoodieMetadataConfig extends HoodieConfig { .withDocumentation("Maximum size in bytes of a single log file. Larger log files can contain larger log blocks " + "thereby reducing the number of blocks to search for keys"); + public static final ConfigProperty<Boolean> DISABLE_FILESYSTEM_BOOTSTRAP = ConfigProperty + .key(METADATA_PREFIX + ".filesystem.bootstrap.disabled") + .defaultValue(false) + .sinceVersion("0.14.0") + .withDocumentation("Disable bootstrapping metadata table from the file system when the table is first created. " Review Comment: Also, should we prefix it with "_" for now so that we keep it internal and avoid exposing it to general users? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] yihua commented on a diff in pull request #9364: [HUDI-6645] Relax the restriction for Spark MDT rollback
yihua commented on code in PR #9364: URL: https://github.com/apache/hudi/pull/9364#discussion_r1284895154 ## hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java: ## @@ -1022,22 +1020,6 @@ public void update(HoodieRollbackMetadata rollbackMetadata, String instantTime) } } - protected void validateRollback( Review Comment: Why should this be relaxed? Does not look safe for Spark. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] yihua commented on a diff in pull request #9365: [HUDI-6646] Add default lock provider for spark offline compaction an…
yihua commented on code in PR #9365: URL: https://github.com/apache/hudi/pull/9365#discussion_r1284893578 ## hudi-utilities/src/main/java/org/apache/hudi/utilities/UtilHelpers.java: ## @@ -578,6 +580,20 @@ public static HoodieTableMetaClient createMetaClient( .build(); } + public static void addLockOptions(String basePath, TypedProperties props) { +if (!props.containsKey(HoodieLockConfig.LOCK_PROVIDER_CLASS_NAME.key())) { + HoodieLockConfig lockConfig = HoodieLockConfig.newBuilder() + .withLockProvider(FileSystemBasedLockProvider.class) + .withLockWaitTimeInMillis(2000L) // 2s + .withFileSystemLockExpire(1) // 1 minute + .withClientNumRetries(30) Review Comment: Use the default values defined in the `ConfigProperty` in `HoodieLockConfig` instead of hard-coding the values? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
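A minimal sketch of this suggestion; the builder calls are taken from the diff above, but the ConfigProperty constant names (and the defaultValue() accessor on them) are assumptions for illustration, not verified fields of HoodieLockConfig:

    public static void addLockOptions(String basePath, TypedProperties props) {
      if (!props.containsKey(HoodieLockConfig.LOCK_PROVIDER_CLASS_NAME.key())) {
        HoodieLockConfig lockConfig = HoodieLockConfig.newBuilder()
            .withLockProvider(FileSystemBasedLockProvider.class)
            // assumed constants: each ConfigProperty carries its documented default
            .withLockWaitTimeInMillis(HoodieLockConfig.LOCK_ACQUIRE_WAIT_TIMEOUT_MS.defaultValue())
            .withFileSystemLockExpire(HoodieLockConfig.FILESYSTEM_LOCK_EXPIRE.defaultValue())
            .withClientNumRetries(HoodieLockConfig.LOCK_ACQUIRE_CLIENT_NUM_RETRIES.defaultValue())
            .build();
        props.putAll(lockConfig.getProps());
      }
    }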
[GitHub] [hudi] the-other-tim-brown commented on a diff in pull request #9367: [HUDI-6648] Support building table views from existing files
the-other-tim-brown commented on code in PR #9367: URL: https://github.com/apache/hudi/pull/9367#discussion_r1284890274 ## hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java: ## @@ -324,33 +325,45 @@ public static HoodieRecord createPartitionListRecord(List * @param partitionThe name of the partition * @param filesAdded Mapping of files to their sizes for files which have been added to this partition * @param filesDeleted List of files which have been deleted from this partition + * @param instantTime Commit time of the commit responsible for adding and/or deleting these files, will be empty during bootstrapping of the metadata table */ public static HoodieRecord createPartitionFilesRecord(String partition, - Option> filesAdded, - Option> filesDeleted) { -Map fileInfo = new HashMap<>(); -filesAdded.ifPresent(filesMap -> -fileInfo.putAll( -filesMap.entrySet().stream().collect( -Collectors.toMap(Map.Entry::getKey, (entry) -> { - long fileSize = entry.getValue(); - // Assert that the file-size of the file being added is positive, since Hudi - // should not be creating empty files - checkState(fileSize > 0); - return new HoodieMetadataFileInfo(fileSize, false); -}))) -); -filesDeleted.ifPresent(filesList -> -fileInfo.putAll( -filesList.stream().collect( -Collectors.toMap(Function.identity(), (ignored) -> new HoodieMetadataFileInfo(0L, true -); + Map filesAdded, + List filesDeleted, + Option instantTime) { +int size = filesAdded.size() + filesDeleted.size(); +Map fileInfo = new HashMap<>(size, 1); +filesAdded.forEach((fileName, fileSize) -> { + // Assert that the file-size of the file being added is positive, since Hudi + // should not be creating empty files + checkState(fileSize > 0); + fileInfo.put(handleFileName(fileName, instantTime), new HoodieMetadataFileInfo(fileSize, false)); +}); + +filesDeleted.forEach(fileName -> fileInfo.put(handleFileName(fileName, instantTime), DELETE_FILE_METADATA)); HoodieKey key = new HoodieKey(partition, MetadataPartitionType.FILES.getPartitionPath()); HoodieMetadataPayload payload = new HoodieMetadataPayload(key.getRecordKey(), METADATA_TYPE_FILE_LIST, fileInfo); return new HoodieAvroRecord<>(key, payload); } + /** + * In the case where a file was created by something other than a Hudi writer, the file name will not contain the commit time. We will prefix the file name with hudiext_[commitTime] before storing + * in the metadata table. The constructor for {@link org.apache.hudi.common.model.HoodieBaseFile} will properly handle this prefix. + * @param fileName incoming file name + * @param commitTime time of the commit (will be empty during bootstrap operations) + * @return file name with commit time prefix if the input file name does not contain the commit time, otherwise returns the original input + */ + private static String handleFileName(String fileName, Option commitTime) { Review Comment: If we want to avoid this call, we can try to send the prefixed filename through the WriteStatus instead so the fileName is a pass through at this point in the code. @vinothchandar and @nsivabalan, what are your thoughts? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
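A hedged sketch of the pass-through alternative raised here: prefix once at the point where the file is registered, so createPartitionFilesRecord stores names verbatim and handleFileName is no longer needed (the isExternallyCreatedFile flag is an assumed input, not something from the PR):

    String nameForMetadata = isExternallyCreatedFile
        ? ExternalFilePathUtil.prefixExternalFileWithCommitTime(fileName, commitTime)
        : fileName; // Hudi-written files already embed the commit time
    filesAdded.put(nameForMetadata, fileSize); // stored as-is downstream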
[GitHub] [hudi] hudi-bot commented on pull request #9367: [HUDI-6648] Support building table views from existing files
hudi-bot commented on PR #9367: URL: https://github.com/apache/hudi/pull/9367#issuecomment-1666241805 ## CI report: * e1af45ae2e4eb423ffa0bd14376583b881b64d3b Azure: [CANCELED](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19088) * 0e4813a189457ef09caf85dca562698f6ba46a4e Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19092) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] yihua commented on pull request #9276: [HUDI-6635] Hudi Spark Integration Redesign MOR and Bootstrap reading
yihua commented on PR #9276: URL: https://github.com/apache/hudi/pull/9276#issuecomment-1666241001 @jonvex let's track the follow-ups in JIRA. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] yihua commented on a diff in pull request #9276: [HUDI-6635] Hudi Spark Integration Redesign MOR and Bootstrap reading
yihua commented on code in PR #9276: URL: https://github.com/apache/hudi/pull/9276#discussion_r1284886934 ## hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestNewHoodieParquetFileFormat.java: ## @@ -0,0 +1,137 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.functional; + +import org.apache.hudi.DataSourceReadOptions; +import org.apache.hudi.common.model.HoodieTableType; + +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SaveMode; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +import java.util.Arrays; +import java.util.Map; +import java.util.stream.Stream; + +import static org.apache.hudi.common.model.HoodieTableType.COPY_ON_WRITE; +import static org.apache.hudi.common.model.HoodieTableType.MERGE_ON_READ; +import static org.junit.jupiter.api.Assertions.assertEquals; + +@Tag("functional") +public class TestNewHoodieParquetFileFormat extends TestBootstrapReadBase { + + private static Stream testArgs() { +Stream.Builder b = Stream.builder(); +HoodieTableType[] tableType = {COPY_ON_WRITE, MERGE_ON_READ}; +Integer[] nPartitions = {0, 1, 2}; +for (HoodieTableType tt : tableType) { + for (Integer n : nPartitions) { +b.add(Arguments.of(tt, n)); + } +} +return b.build(); + } + + @ParameterizedTest + @MethodSource("testArgs") + public void runTests(HoodieTableType tableType, Integer nPartitions) { +this.bootstrapType = nPartitions == 0 ? 
"metadata" : "mixed"; +this.dashPartitions = true; +this.tableType = tableType; +this.nPartitions = nPartitions; +setupDirs(); + +//do bootstrap +Map options = setBootstrapOptions(); +Dataset bootstrapDf = sparkSession.emptyDataFrame(); +bootstrapDf.write().format("hudi") +.options(options) +.mode(SaveMode.Overwrite) +.save(bootstrapTargetPath); +runComparisons(); + +options = basicOptions(); +doUpdate(options, "001"); +runComparisons(); + +doInsert(options, "002"); +runComparisons(); + +doDelete(options, "003"); +runComparisons(); + } + + protected void runComparisons() { +if (tableType.equals(MERGE_ON_READ)) { + runComparison(hudiBasePath); +} +runComparison(bootstrapTargetPath); + } + + protected void runComparison(String tableBasePath) { +testCount(tableBasePath); +runIndividualComparison(tableBasePath); +runIndividualComparison(tableBasePath, "partition_path"); +runIndividualComparison(tableBasePath, "_hoodie_record_key", "_hoodie_commit_time", "_hoodie_partition_path"); +runIndividualComparison(tableBasePath, "_hoodie_commit_time", "_hoodie_commit_seqno"); +runIndividualComparison(tableBasePath, "_hoodie_commit_time", "_hoodie_commit_seqno", "partition_path"); +runIndividualComparison(tableBasePath, "_row_key", "_hoodie_commit_seqno", "_hoodie_record_key", "_hoodie_partition_path"); +runIndividualComparison(tableBasePath, "_row_key", "partition_path", "_hoodie_is_deleted", "begin_lon"); + } + + protected void testCount(String tableBasePath) { +Dataset legacyDf = sparkSession.read().format("hudi") + .option(DataSourceReadOptions.USE_LEGACY_HUDI_PARQUET_FILE_FORMAT().key(),"true").load(tableBasePath); +Dataset fileFormatDf = sparkSession.read().format("hudi") + .option(DataSourceReadOptions.USE_LEGACY_HUDI_PARQUET_FILE_FORMAT().key(),"false").load(tableBasePath); +assertEquals(legacyDf.count(), fileFormatDf.count()); + } + + protected scala.collection.Seq seq(String... a) { +return scala.collection.JavaConverters.asScalaBufferConverter(Arrays.asList(a)).asScala().toSeq(); + } + + protected void runIndividualComparison(String tableBasePath) { +runIndividualComparison(tableBasePath, ""); + } + + protected void runIndividualComparison(String tableBasePath, String firstColumn, String... columns) { +Dataset legacyDf = sparkSession.read().format("hudi") + .option(DataSourceReadOptions.USE_LEGACY_HU
[GitHub] [hudi] yihua commented on a diff in pull request #9276: [HUDI-6635] Hudi Spark Integration Redesign MOR and Bootstrap reading
yihua commented on code in PR #9276: URL: https://github.com/apache/hudi/pull/9276#discussion_r1284886809 ## hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/NewHoodieParquetFileFormat.scala: ## @@ -0,0 +1,353 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql.execution.datasources.parquet + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.hudi.DataSourceReadOptions.{REALTIME_PAYLOAD_COMBINE_OPT_VAL, REALTIME_SKIP_MERGE_OPT_VAL} +import org.apache.hudi.MergeOnReadSnapshotRelation.createPartitionedFile +import org.apache.hudi.common.fs.FSUtils +import org.apache.hudi.common.model.{BaseFile, FileSlice, HoodieLogFile, HoodieRecord} +import org.apache.hudi.common.util.ValidationUtils.checkState +import org.apache.hudi.{HoodieBaseRelation, HoodieSparkUtils, HoodieTableSchema, HoodieTableState, LogFileIterator, MergeOnReadSnapshotRelation, PartitionFileSliceMapping, RecordMergingFileIterator, SkipMergeIterator, SparkAdapterSupport} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.sql.HoodieCatalystExpressionUtils.generateUnsafeProjection +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.JoinedRow +import org.apache.spark.sql.execution.datasources.PartitionedFile +import org.apache.spark.sql.hudi.HoodieSqlCommonUtils.isMetaField +import org.apache.spark.sql.sources.Filter +import org.apache.spark.sql.types.{StructField, StructType} +import org.apache.spark.sql.vectorized.{ColumnVector, ColumnarBatch} +import org.apache.spark.util.SerializableConfiguration + +import scala.collection.mutable +import scala.jdk.CollectionConverters.asScalaIteratorConverter + +class NewHoodieParquetFileFormat(tableState: Broadcast[HoodieTableState], + tableSchema: Broadcast[HoodieTableSchema], + tableName: String, + mergeType: String, + mandatoryFields: Seq[String], + isMOR: Boolean, + isBootstrap: Boolean) extends ParquetFileFormat with SparkAdapterSupport { + + //Used so that the planner only projects once and does not stack overflow + var isProjected = false + + /** + * Support batch needs to remain consistent, even if one side of a bootstrap merge can support + * while the other side can't + */ + private var supportBatchCalled = false + private var supportBatchResult = false + override def supportBatch(sparkSession: SparkSession, schema: StructType): Boolean = { +if (!supportBatchCalled) { + supportBatchCalled = true + supportBatchResult = !isMOR && super.supportBatch(sparkSession, schema) +} +supportBatchResult + } + + override def buildReaderWithPartitionValues(sparkSession: SparkSession, + dataSchema: 
StructType, + partitionSchema: StructType, + requiredSchema: StructType, + filters: Seq[Filter], + options: Map[String, String], + hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = { + +val outputSchema = StructType(requiredSchema.fields ++ partitionSchema.fields) + +val requiredSchemaWithMandatory = if (!isMOR || MergeOnReadSnapshotRelation.isProjectionCompatible(tableState.value)) { + //add mandatory fields to required schema + val added: mutable.Buffer[StructField] = mutable.Buffer[StructField]() + for (field <- mandatoryFields) { +if (requiredSchema.getFieldIndex(field).isEmpty) { + val fieldToAdd = dataSchema.fields(dataSchema.getFieldIndex(field).get) + added.append(fieldToAdd) +} + } + val addedFields = StructType(added.toArray) + StructType(requiredSchema.toArray ++ addedFields.fields) +} else { +
[GitHub] [hudi] nsivabalan commented on a diff in pull request #9367: [HUDI-6648] Support building table views from existing files
nsivabalan commented on code in PR #9367: URL: https://github.com/apache/hudi/pull/9367#discussion_r1284886054 ## hudi-common/src/main/java/org/apache/hudi/common/config/HoodieMetadataConfig.java: ## @@ -312,6 +312,13 @@ public final class HoodieMetadataConfig extends HoodieConfig { .withDocumentation("Maximum size in bytes of a single log file. Larger log files can contain larger log blocks " + "thereby reducing the number of blocks to search for keys"); + public static final ConfigProperty<Boolean> DISABLE_FILESYSTEM_BOOTSTRAP = ConfigProperty + .key(METADATA_PREFIX + ".filesystem.bootstrap.disabled") + .defaultValue(false) + .sinceVersion("0.14.0") + .withDocumentation("Disable bootstrapping metadata table from the file system when the table is first created. " Review Comment: Should we consider explicitly calling out external metadata sync in the config naming? The current name "hoodie.metadata.filesystem.bootstrap.disabled" is kind of abstract; I am sure an OSS dev may not understand what this config serves or when someone would enable it. Not too strong that we should change it, but just wanted to give it a second thought since the naming will stay forever. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
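For illustration, one rename in the direction suggested; the key and field name below are purely hypothetical and nothing here was agreed in the PR:

    public static final ConfigProperty<Boolean> DISABLE_EXTERNAL_METADATA_SYNC_BOOTSTRAP = ConfigProperty
        .key(METADATA_PREFIX + ".external.sync.bootstrap.disabled")
        .defaultValue(false)
        .sinceVersion("0.14.0")
        .withDocumentation("Disable bootstrapping the metadata table from existing files "
            + "when the table is first created by an external metadata sync.");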
[GitHub] [hudi] yihua commented on a diff in pull request #9276: [HUDI-6635] Hudi Spark Integration Redesign MOR and Bootstrap reading
yihua commented on code in PR #9276: URL: https://github.com/apache/hudi/pull/9276#discussion_r1284873987 ## hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala: ## @@ -87,6 +87,14 @@ object DataSourceReadOptions { s"payload implementation to merge (${REALTIME_PAYLOAD_COMBINE_OPT_VAL}) or skip merging altogether" + s"${REALTIME_SKIP_MERGE_OPT_VAL}") + val USE_LEGACY_HUDI_PARQUET_FILE_FORMAT: ConfigProperty[String] = ConfigProperty +.key("hoodie.datasource.read.use.legacy.parquet.file.format") +.defaultValue("true") +.markAdvanced() +.sinceVersion("0.14.0") +.withDocumentation("Read using the legacy Hudi parquet file format. The new Hudi parquet file format is " + + "introduced as an experimental feature in 0.14.0") Review Comment: Mention that this new file format applies to MOR and Bootstrap queries only, and full schema evaluation is not supported by the new file format (i.e., `hoodie.schema.on.read.enable=true`). ## hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DefaultSource.scala: ## @@ -247,6 +245,9 @@ object DefaultSource { Option(schema) } +val useMORBootstrapFF = parameters.getOrElse(MOR_BOOTSTRAP_FILE_READER.key, + MOR_BOOTSTRAP_FILE_READER.defaultValue).toBoolean && (globPaths == null || globPaths.isEmpty) Review Comment: Got it. Then let's leave this as a follow-up. The new file format should support this too for feature completeness. ## hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/Iterators.scala: ## @@ -83,8 +94,8 @@ class LogFileIterator(logFiles: List[HoodieLogFile], } .getOrElse(new TypedProperties()) - protected override val avroSchema: Schema = new Schema.Parser().parse(requiredSchema.avroSchemaStr) - protected override val structTypeSchema: StructType = requiredSchema.structTypeSchema + protected override val avroSchema: Schema = requiredAvroSchema Review Comment: Makes sense. ## hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/MergeOnReadSnapshotRelation.scala: ## @@ -45,13 +45,27 @@ case class MergeOnReadSnapshotRelation(override val sqlContext: SQLContext, private val globPaths: Seq[Path], private val userSchema: Option[StructType], private val prunedDataSchema: Option[StructType] = None) - extends BaseMergeOnReadSnapshotRelation(sqlContext, optParams, metaClient, globPaths, userSchema, prunedDataSchema) { + extends BaseMergeOnReadSnapshotRelation(sqlContext, optParams, metaClient, globPaths, userSchema, prunedDataSchema) with SparkAdapterSupport { override type Relation = MergeOnReadSnapshotRelation override def updatePrunedDataSchema(prunedSchema: StructType): MergeOnReadSnapshotRelation = this.copy(prunedDataSchema = Some(prunedSchema)) + def toHadoopFsRelation: HadoopFsRelation = { +fileIndex.shouldBroadcast = true +HadoopFsRelation( + location = fileIndex, + partitionSchema = fileIndex.partitionSchema, + dataSchema = fileIndex.dataSchema, + bucketSpec = None, + fileFormat = sparkAdapter.createMORBootstrapFileFormat(shouldExtractPartitionValuesFromPartitionPath, +sparkSession.sparkContext.broadcast(tableState), + sparkSession.sparkContext.broadcast(HoodieTableSchema(tableStructSchema, tableAvroSchema.toString, internalSchemaOpt)), +metaClient.getTableConfig.getTableName, mergeType, mandatoryFields, isMOR = true, isBootstrap = false).get, + optParams)(sparkSession) + } Review Comment: Yeah, sg -- This is an automated message from the Apache Git Service. 
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
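A minimal sketch of toggling the config proposed above on the read path, assuming a Hudi table already exists at `basePath` and an active `SparkSession` named `spark`; the key and default come from the diff, everything else is illustrative:
```
// Sketch: opt out of the legacy format to exercise the new (experimental) one.
// The default is "true", i.e. the legacy Hudi parquet file format.
val df = spark.read.format("hudi")
  .option("hoodie.datasource.read.use.legacy.parquet.file.format", "false")
  .load(basePath)
df.show()
```
Per the review comment above, the new format targets MOR and Bootstrap queries only and does not yet cover full schema evolution, so the legacy default remains the conservative choice.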
[GitHub] [hudi] yihua commented on a diff in pull request #9276: [HUDI-6635] Hudi Spark Integration Redesign MOR and Bootstrap reading
yihua commented on code in PR #9276: URL: https://github.com/apache/hudi/pull/9276#discussion_r1284872593 ## hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieBootstrapConfig.java: ## @@ -67,13 +67,6 @@ public class HoodieBootstrapConfig extends HoodieConfig { .sinceVersion("0.6.0") .withDocumentation("Selects the mode in which each file/partition in the bootstrapped dataset gets bootstrapped"); - public static final ConfigProperty DATA_QUERIES_ONLY = ConfigProperty - .key("hoodie.bootstrap.data.queries.only") - .defaultValue("false") - .markAdvanced() - .sinceVersion("0.14.0") - .withDocumentation("Improves query performance, but queries cannot use hudi metadata fields"); Review Comment: btw, the config doc update should be in a separate PR. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
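For context, a hedged sketch of how the config in the removed block is used on a bootstrapped table; the key and default come from the diff, while the path is hypothetical:
```
// Sketch: data-only bootstrap read. Faster, but the _hoodie_* meta fields
// are unavailable to the query (see the config documentation above).
val dataOnlyDf = spark.read.format("hudi")
  .option("hoodie.bootstrap.data.queries.only", "true")
  .load("/tmp/bootstrap_table") // hypothetical base path
```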
[GitHub] [hudi] hudi-bot commented on pull request #9367: [HUDI-6648] Support building table views from existing files
hudi-bot commented on PR #9367: URL: https://github.com/apache/hudi/pull/9367#issuecomment-1666213879 ## CI report: * e1af45ae2e4eb423ffa0bd14376583b881b64d3b Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19088) * 0e4813a189457ef09caf85dca562698f6ba46a4e UNKNOWN Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #9276: [HUDI-6635] Hudi Spark Integration Redesign MOR and Bootstrap reading
hudi-bot commented on PR #9276: URL: https://github.com/apache/hudi/pull/9276#issuecomment-1666213683 ## CI report: * 662f3b320ab6ea06462bad9a4448add1ec2f380a UNKNOWN * ef8eaadd4f817aa08253b938e19ab3fa61d27b5c Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19046) * def394b73203814bbb635841c5f07c216c0575cc Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19091) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] yihua commented on a diff in pull request #9276: [HUDI-6635] Hudi Spark Integration Redesign MOR and Bootstrap reading
yihua commented on code in PR #9276: URL: https://github.com/apache/hudi/pull/9276#discussion_r1284867759 ## hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieTypes.scala: ## @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi + +import org.apache.hudi.common.config.HoodieMetadataConfig +import org.apache.hudi.internal.schema.InternalSchema +import org.apache.spark.sql.types.StructType + +case class HoodieTableSchema(structTypeSchema: StructType, avroSchemaStr: String, internalSchema: Option[InternalSchema] = None) + +case class HoodieTableState(tablePath: String, Review Comment: Got it. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
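Since `HoodieTableSchema` pairs a Catalyst `StructType` with its Avro counterpart, a small construction sketch may help; both schemas are hypothetical and only illustrate the case class shown in the diff:
```
import org.apache.spark.sql.types._

// Hypothetical schemas for illustration
val structSchema = StructType(Seq(
  StructField("id", StringType, nullable = false),
  StructField("ts", LongType, nullable = false)))

val avroSchemaStr =
  """{"type":"record","name":"Record","fields":
    |[{"name":"id","type":"string"},{"name":"ts","type":"long"}]}""".stripMargin

// internalSchema defaults to None when schema-on-read is not in play
val tableSchema = HoodieTableSchema(structSchema, avroSchemaStr)
```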
[GitHub] [hudi] yihua commented on a diff in pull request #9276: [HUDI-6635] Hudi Spark Integration Redesign MOR and Bootstrap reading
yihua commented on code in PR #9276: URL: https://github.com/apache/hudi/pull/9276#discussion_r1284867523 ## hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieBootstrapConfig.java: ## @@ -67,13 +67,6 @@ public class HoodieBootstrapConfig extends HoodieConfig { .sinceVersion("0.6.0") .withDocumentation("Selects the mode in which each file/partition in the bootstrapped dataset gets bootstrapped"); - public static final ConfigProperty DATA_QUERIES_ONLY = ConfigProperty - .key("hoodie.bootstrap.data.queries.only") - .defaultValue("false") - .markAdvanced() - .sinceVersion("0.14.0") - .withDocumentation("Improves query performance, but queries cannot use hudi metadata fields"); Review Comment: Got it. Could you update the config documentation? As discussed, we'll keep this config since it can still be used when the existing file format and relations are used. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #9276: [HUDI-6635] Hudi Spark Integration Redesign MOR and Bootstrap reading
hudi-bot commented on PR #9276: URL: https://github.com/apache/hudi/pull/9276#issuecomment-1666208153 ## CI report: * 662f3b320ab6ea06462bad9a4448add1ec2f380a UNKNOWN * ef8eaadd4f817aa08253b938e19ab3fa61d27b5c Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19046) * def394b73203814bbb635841c5f07c216c0575cc UNKNOWN Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #9369: [HUDI-6386] Fix flakey multiwriter tests
hudi-bot commented on PR #9369: URL: https://github.com/apache/hudi/pull/9369#issuecomment-1666208352 ## CI report: * 68ba8ae97df89a261298dbb06c80f43c57159969 UNKNOWN * 2d7ced1d3723f9c6357378c3849ebca7e2db4904 Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19090) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] yihua commented on a diff in pull request #9276: [HUDI-6635] Hudi Spark Integration Redesign MOR and Bootstrap reading
yihua commented on code in PR #9276: URL: https://github.com/apache/hudi/pull/9276#discussion_r1284854120 ## hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/LegacyHoodieParquetFileFormat.scala: ## @@ -23,12 +23,12 @@ import org.apache.hudi.{DataSourceReadOptions, HoodieSparkUtils, SparkAdapterSup import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.PartitionedFile -import org.apache.spark.sql.execution.datasources.parquet.HoodieParquetFileFormat.FILE_FORMAT_ID +import org.apache.spark.sql.execution.datasources.parquet.LegacyHoodieParquetFileFormat.FILE_FORMAT_ID import org.apache.spark.sql.sources.Filter import org.apache.spark.sql.types.{AtomicType, StructType} -class HoodieParquetFileFormat extends ParquetFileFormat with SparkAdapterSupport { +class LegacyHoodieParquetFileFormat extends ParquetFileFormat with SparkAdapterSupport { Review Comment: Add docs here to link to the new file format implementation, so that any changes to this format implementation are also reflected in the new file format class? ## hudi-spark-datasource/hudi-spark2/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/Spark24LegacyHoodieParquetFileFormat.scala: ## @@ -50,7 +50,7 @@ import java.net.URI * Avoiding appending partition values to the rows read from the data file * */ -class Spark24HoodieParquetFileFormat(private val shouldAppendPartitionValues: Boolean) extends ParquetFileFormat { +class Spark24LegacyHoodieParquetFileFormat(private val shouldAppendPartitionValues: Boolean) extends ParquetFileFormat { Review Comment: Same here for the version-specific file format classes: add docs here to link to the new file format implementation, so that any changes to this format implementation are also reflected in the new file format class? ## hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/hudi/SparkAdapter.scala: ## @@ -165,7 +165,11 @@ trait SparkAdapter extends Serializable { /** * Create instance of [[ParquetFileFormat]] */ - def createHoodieParquetFileFormat(appendPartitionValues: Boolean): Option[ParquetFileFormat] + def createLegacyHoodieParquetFileFormat(appendPartitionValues: Boolean): Option[ParquetFileFormat] + + def getFilePath(file: PartitionedFile): Path Review Comment: There is an existing API with the same functionality: `HoodieSparkPartitionedFileUtils.getPathFromPartitionedFile` ## hudi-spark-datasource/hudi-spark-common/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister: ## @@ -17,4 +17,4 @@ org.apache.hudi.DefaultSource -org.apache.spark.sql.execution.datasources.parquet.HoodieParquetFileFormat \ No newline at end of file +org.apache.spark.sql.execution.datasources.parquet.LegacyHoodieParquetFileFormat Review Comment: Just curious; I don't have a clear answer. Since `createRelation` is overridden, it's OK functionality-wise. ## hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/NewHoodieParquetFileFormatUtils.scala: ## @@ -0,0 +1,215 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi + +import org.apache.avro.Schema +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.hadoop.mapred.JobConf +import org.apache.hudi.HoodieBaseRelation._ +import org.apache.hudi.HoodieConversionUtils.toScalaOption +import org.apache.hudi.common.config.ConfigProperty +import org.apache.hudi.common.fs.FSUtils +import org.apache.hudi.common.model.HoodieRecord +import org.apache.hudi.common.model.HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE_PREFIX +import org.apache.hudi.common.table.timeline.HoodieTimeline +import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient, TableSchemaResolver} +import org.apache.hudi.common.util.ValidationUtils.checkState +import org.apache.hudi.common.util.{ConfigUtils, StringUtils} +import org.apache.hudi.config.HoodieBootstrapConfig.DATA_QUERIES_ONLY +import org.apache.
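A sketch of the cross-link the first two comments above ask for; the wording is illustrative, not the committed Scaladoc:
```
/**
 * NOTE: this is the legacy Hudi parquet file format. Any change made here
 * should also be reflected in the new file format implementation.
 *
 * @see org.apache.spark.sql.execution.datasources.parquet.NewHoodieParquetFileFormat
 */
class LegacyHoodieParquetFileFormat extends ParquetFileFormat with SparkAdapterSupport {
  // ... existing implementation unchanged ...
}
```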
[GitHub] [hudi] jonvex commented on a diff in pull request #9276: [HUDI-6635] Hudi Spark Integration Redesign MOR and Bootstrap reading
jonvex commented on code in PR #9276: URL: https://github.com/apache/hudi/pull/9276#discussion_r1284863793 ## hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/NewHoodieParquetFileFormat.scala: ## @@ -0,0 +1,353 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql.execution.datasources.parquet + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.hudi.DataSourceReadOptions.{REALTIME_PAYLOAD_COMBINE_OPT_VAL, REALTIME_SKIP_MERGE_OPT_VAL} +import org.apache.hudi.MergeOnReadSnapshotRelation.createPartitionedFile +import org.apache.hudi.common.fs.FSUtils +import org.apache.hudi.common.model.{BaseFile, FileSlice, HoodieLogFile, HoodieRecord} +import org.apache.hudi.common.util.ValidationUtils.checkState +import org.apache.hudi.{HoodieBaseRelation, HoodieSparkUtils, HoodieTableSchema, HoodieTableState, LogFileIterator, MergeOnReadSnapshotRelation, PartitionFileSliceMapping, RecordMergingFileIterator, SkipMergeIterator, SparkAdapterSupport} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.sql.HoodieCatalystExpressionUtils.generateUnsafeProjection +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.JoinedRow +import org.apache.spark.sql.execution.datasources.PartitionedFile +import org.apache.spark.sql.hudi.HoodieSqlCommonUtils.isMetaField +import org.apache.spark.sql.sources.Filter +import org.apache.spark.sql.types.{StructField, StructType} +import org.apache.spark.sql.vectorized.{ColumnVector, ColumnarBatch} +import org.apache.spark.util.SerializableConfiguration + +import scala.collection.mutable +import scala.jdk.CollectionConverters.asScalaIteratorConverter + +class NewHoodieParquetFileFormat(tableState: Broadcast[HoodieTableState], + tableSchema: Broadcast[HoodieTableSchema], + tableName: String, + mergeType: String, + mandatoryFields: Seq[String], + isMOR: Boolean, + isBootstrap: Boolean) extends ParquetFileFormat with SparkAdapterSupport { + + //Used so that the planner only projects once and does not stack overflow + var isProjected = false + + /** + * Support batch needs to remain consistent, even if one side of a bootstrap merge can support + * while the other side can't + */ + private var supportBatchCalled = false + private var supportBatchResult = false + override def supportBatch(sparkSession: SparkSession, schema: StructType): Boolean = { +if (!supportBatchCalled) { + supportBatchCalled = true + supportBatchResult = !isMOR && super.supportBatch(sparkSession, schema) +} +supportBatchResult + } + + override def buildReaderWithPartitionValues(sparkSession: SparkSession, + dataSchema: 
StructType, + partitionSchema: StructType, + requiredSchema: StructType, + filters: Seq[Filter], + options: Map[String, String], + hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = { + +val outputSchema = StructType(requiredSchema.fields ++ partitionSchema.fields) + +val requiredSchemaWithMandatory = if (!isMOR || MergeOnReadSnapshotRelation.isProjectionCompatible(tableState.value)) { + //add mandatory fields to required schema + val added: mutable.Buffer[StructField] = mutable.Buffer[StructField]() + for (field <- mandatoryFields) { +if (requiredSchema.getFieldIndex(field).isEmpty) { + val fieldToAdd = dataSchema.fields(dataSchema.getFieldIndex(field).get) + added.append(fieldToAdd) +} + } + val addedFields = StructType(added.toArray) + StructType(requiredSchema.toArray ++ addedFields.fields) +} else {
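To make the projection handling above easier to follow, here is an equivalent standalone sketch of the mandatory-field logic, simplified from the (truncated) diff rather than copied from the committed code:
```
import org.apache.spark.sql.types.{StructField, StructType}

// Append any mandatory field the query did not project, pulling its
// definition from the full data schema.
def withMandatoryFields(requiredSchema: StructType,
                        dataSchema: StructType,
                        mandatoryFields: Seq[String]): StructType = {
  val missing: Seq[StructField] = mandatoryFields
    .filter(f => requiredSchema.getFieldIndex(f).isEmpty)
    .map(f => dataSchema.fields(dataSchema.getFieldIndex(f).get))
  StructType(requiredSchema.fields ++ missing)
}
```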
[GitHub] [hudi] hudi-bot commented on pull request #9369: [HUDI-6386] Fix flakey multiwriter tests
hudi-bot commented on PR #9369: URL: https://github.com/apache/hudi/pull/9369#issuecomment-1666202386 ## CI report: * 68ba8ae97df89a261298dbb06c80f43c57159969 UNKNOWN * 2d7ced1d3723f9c6357378c3849ebca7e2db4904 UNKNOWN Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] jonvex commented on a diff in pull request #9276: [HUDI-6635] Hudi Spark Integration Redesign MOR and Bootstrap reading
jonvex commented on code in PR #9276: URL: https://github.com/apache/hudi/pull/9276#discussion_r1284853617 ## hudi-spark-datasource/hudi-spark2/src/main/scala/org/apache/spark/sql/adapter/Spark2Adapter.scala: ## @@ -143,8 +146,8 @@ class Spark2Adapter extends SparkAdapter { partitions.toSeq } - override def createHoodieParquetFileFormat(appendPartitionValues: Boolean): Option[ParquetFileFormat] = { -Some(new Spark24HoodieParquetFileFormat(appendPartitionValues)) + override def createLegacyHoodieParquetFileFormat(appendPartitionValues: Boolean): Option[ParquetFileFormat] = { +Some(new Spark24LegacyHoodieParquetFileFormat(appendPartitionValues)) Review Comment: Yeah. We're going to have to figure out if there is a way to do schema evolution without porting code. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] jonvex commented on a diff in pull request #9276: [HUDI-6635] Hudi Spark Integration Redesign MOR and Bootstrap reading
jonvex commented on code in PR #9276: URL: https://github.com/apache/hudi/pull/9276#discussion_r1284852746 ## hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestNewHoodieParquetFileFormat.java: ## @@ -0,0 +1,137 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.functional; + +import org.apache.hudi.DataSourceReadOptions; +import org.apache.hudi.common.model.HoodieTableType; + +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SaveMode; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +import java.util.Arrays; +import java.util.Map; +import java.util.stream.Stream; + +import static org.apache.hudi.common.model.HoodieTableType.COPY_ON_WRITE; +import static org.apache.hudi.common.model.HoodieTableType.MERGE_ON_READ; +import static org.junit.jupiter.api.Assertions.assertEquals; + +@Tag("functional") +public class TestNewHoodieParquetFileFormat extends TestBootstrapReadBase { + + private static Stream testArgs() { +Stream.Builder b = Stream.builder(); +HoodieTableType[] tableType = {COPY_ON_WRITE, MERGE_ON_READ}; +Integer[] nPartitions = {0, 1, 2}; +for (HoodieTableType tt : tableType) { + for (Integer n : nPartitions) { +b.add(Arguments.of(tt, n)); + } +} +return b.build(); + } + + @ParameterizedTest + @MethodSource("testArgs") + public void runTests(HoodieTableType tableType, Integer nPartitions) { +this.bootstrapType = nPartitions == 0 ? 
"metadata" : "mixed"; +this.dashPartitions = true; +this.tableType = tableType; +this.nPartitions = nPartitions; +setupDirs(); + +//do bootstrap +Map options = setBootstrapOptions(); +Dataset bootstrapDf = sparkSession.emptyDataFrame(); +bootstrapDf.write().format("hudi") +.options(options) +.mode(SaveMode.Overwrite) +.save(bootstrapTargetPath); +runComparisons(); + +options = basicOptions(); +doUpdate(options, "001"); +runComparisons(); + +doInsert(options, "002"); +runComparisons(); + +doDelete(options, "003"); +runComparisons(); + } + + protected void runComparisons() { +if (tableType.equals(MERGE_ON_READ)) { + runComparison(hudiBasePath); +} +runComparison(bootstrapTargetPath); + } + + protected void runComparison(String tableBasePath) { +testCount(tableBasePath); +runIndividualComparison(tableBasePath); +runIndividualComparison(tableBasePath, "partition_path"); +runIndividualComparison(tableBasePath, "_hoodie_record_key", "_hoodie_commit_time", "_hoodie_partition_path"); +runIndividualComparison(tableBasePath, "_hoodie_commit_time", "_hoodie_commit_seqno"); +runIndividualComparison(tableBasePath, "_hoodie_commit_time", "_hoodie_commit_seqno", "partition_path"); +runIndividualComparison(tableBasePath, "_row_key", "_hoodie_commit_seqno", "_hoodie_record_key", "_hoodie_partition_path"); +runIndividualComparison(tableBasePath, "_row_key", "partition_path", "_hoodie_is_deleted", "begin_lon"); + } + + protected void testCount(String tableBasePath) { +Dataset legacyDf = sparkSession.read().format("hudi") + .option(DataSourceReadOptions.USE_LEGACY_HUDI_PARQUET_FILE_FORMAT().key(),"true").load(tableBasePath); +Dataset fileFormatDf = sparkSession.read().format("hudi") + .option(DataSourceReadOptions.USE_LEGACY_HUDI_PARQUET_FILE_FORMAT().key(),"false").load(tableBasePath); +assertEquals(legacyDf.count(), fileFormatDf.count()); + } + + protected scala.collection.Seq seq(String... a) { +return scala.collection.JavaConverters.asScalaBufferConverter(Arrays.asList(a)).asScala().toSeq(); + } + + protected void runIndividualComparison(String tableBasePath) { +runIndividualComparison(tableBasePath, ""); + } + + protected void runIndividualComparison(String tableBasePath, String firstColumn, String... columns) { +Dataset legacyDf = sparkSession.read().format("hudi") + .option(DataSourceReadOptions.USE_LEGACY_H
[GitHub] [hudi] yihua commented on a diff in pull request #9276: [HUDI-6635] Hudi Spark Integration Redesign MOR and Bootstrap reading
yihua commented on code in PR #9276: URL: https://github.com/apache/hudi/pull/9276#discussion_r1284822235 ## hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/NewHoodieParquetFileFormat.scala: ## @@ -0,0 +1,353 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql.execution.datasources.parquet + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.hudi.DataSourceReadOptions.{REALTIME_PAYLOAD_COMBINE_OPT_VAL, REALTIME_SKIP_MERGE_OPT_VAL} +import org.apache.hudi.MergeOnReadSnapshotRelation.createPartitionedFile +import org.apache.hudi.common.fs.FSUtils +import org.apache.hudi.common.model.{BaseFile, FileSlice, HoodieLogFile, HoodieRecord} +import org.apache.hudi.common.util.ValidationUtils.checkState +import org.apache.hudi.{HoodieBaseRelation, HoodieSparkUtils, HoodieTableSchema, HoodieTableState, LogFileIterator, MergeOnReadSnapshotRelation, PartitionFileSliceMapping, RecordMergingFileIterator, SkipMergeIterator, SparkAdapterSupport} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.sql.HoodieCatalystExpressionUtils.generateUnsafeProjection +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.JoinedRow +import org.apache.spark.sql.execution.datasources.PartitionedFile +import org.apache.spark.sql.hudi.HoodieSqlCommonUtils.isMetaField +import org.apache.spark.sql.sources.Filter +import org.apache.spark.sql.types.{StructField, StructType} +import org.apache.spark.sql.vectorized.{ColumnVector, ColumnarBatch} +import org.apache.spark.util.SerializableConfiguration + +import scala.collection.mutable +import scala.jdk.CollectionConverters.asScalaIteratorConverter + +class NewHoodieParquetFileFormat(tableState: Broadcast[HoodieTableState], + tableSchema: Broadcast[HoodieTableSchema], + tableName: String, + mergeType: String, + mandatoryFields: Seq[String], + isMOR: Boolean, + isBootstrap: Boolean) extends ParquetFileFormat with SparkAdapterSupport { + + //Used so that the planner only projects once and does not stack overflow + var isProjected = false + + /** + * Support batch needs to remain consistent, even if one side of a bootstrap merge can support + * while the other side can't + */ + private var supportBatchCalled = false + private var supportBatchResult = false + override def supportBatch(sparkSession: SparkSession, schema: StructType): Boolean = { +if (!supportBatchCalled) { + supportBatchCalled = true + supportBatchResult = !isMOR && super.supportBatch(sparkSession, schema) +} +supportBatchResult + } + + override def buildReaderWithPartitionValues(sparkSession: SparkSession, + dataSchema: 
StructType, + partitionSchema: StructType, + requiredSchema: StructType, + filters: Seq[Filter], + options: Map[String, String], + hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = { + +val outputSchema = StructType(requiredSchema.fields ++ partitionSchema.fields) + +val requiredSchemaWithMandatory = if (!isMOR || MergeOnReadSnapshotRelation.isProjectionCompatible(tableState.value)) { + //add mandatory fields to required schema + val added: mutable.Buffer[StructField] = mutable.Buffer[StructField]() + for (field <- mandatoryFields) { +if (requiredSchema.getFieldIndex(field).isEmpty) { + val fieldToAdd = dataSchema.fields(dataSchema.getFieldIndex(field).get) + added.append(fieldToAdd) +} + } + val addedFields = StructType(added.toArray) + StructType(requiredSchema.toArray ++ addedFields.fields) +} else { +
[GitHub] [hudi] jonvex commented on a diff in pull request #9276: [HUDI-6635] Hudi Spark Integration Redesign MOR and Bootstrap reading
jonvex commented on code in PR #9276: URL: https://github.com/apache/hudi/pull/9276#discussion_r1284845207 ## hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/NewHoodieParquetFileFormatUtils.scala: ## @@ -0,0 +1,215 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi + +import org.apache.avro.Schema +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.hadoop.mapred.JobConf +import org.apache.hudi.HoodieBaseRelation._ +import org.apache.hudi.HoodieConversionUtils.toScalaOption +import org.apache.hudi.common.config.ConfigProperty +import org.apache.hudi.common.fs.FSUtils +import org.apache.hudi.common.model.HoodieRecord +import org.apache.hudi.common.model.HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE_PREFIX +import org.apache.hudi.common.table.timeline.HoodieTimeline +import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient, TableSchemaResolver} +import org.apache.hudi.common.util.ValidationUtils.checkState +import org.apache.hudi.common.util.{ConfigUtils, StringUtils} +import org.apache.hudi.config.HoodieBootstrapConfig.DATA_QUERIES_ONLY +import org.apache.hudi.config.HoodieWriteConfig +import org.apache.hudi.internal.schema.convert.AvroInternalSchemaConverter +import org.apache.hudi.internal.schema.{HoodieSchemaException, InternalSchema} +import org.apache.spark.sql.catalyst.analysis.Resolver +import org.apache.spark.sql.execution.datasources.parquet.NewHoodieParquetFileFormat +import org.apache.spark.sql.execution.datasources.{FileStatusCache, HadoopFsRelation} +import org.apache.spark.sql.hudi.HoodieSqlCommonUtils +import org.apache.spark.sql.sources.BaseRelation +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.{SQLContext, SparkSession} + +import scala.collection.JavaConverters._ +import scala.util.{Failure, Success, Try} + +class NewHoodieParquetFileFormatUtils(val sqlContext: SQLContext, + val metaClient: HoodieTableMetaClient, + val optParamsInput: Map[String, String], + private val schemaSpec: Option[StructType]) extends SparkAdapterSupport { + protected val sparkSession: SparkSession = sqlContext.sparkSession + + protected val optParams: Map[String, String] = optParamsInput.filter(kv => !kv._1.equals(DATA_QUERIES_ONLY.key())) + protected def tableName: String = metaClient.getTableConfig.getTableName + + protected lazy val resolver: Resolver = sparkSession.sessionState.analyzer.resolver + + private lazy val metaFieldNames = HoodieRecord.HOODIE_META_COLUMNS.asScala.toSet + + protected lazy val fileIndex: HoodieFileIndex = +HoodieFileIndex(sparkSession, metaClient, Some(tableStructSchema), optParams, FileStatusCache.getOrCreate(sparkSession)) + + protected lazy val conf: Configuration = new 
Configuration(sqlContext.sparkContext.hadoopConfiguration) + protected lazy val jobConf = new JobConf(conf) + + protected lazy val tableConfig: HoodieTableConfig = metaClient.getTableConfig + + protected lazy val basePath: Path = metaClient.getBasePathV2 + + protected lazy val (tableAvroSchema: Schema, internalSchemaOpt: Option[InternalSchema]) = { Review Comment: Every single method here is from the base relation. You said not to use the relation, so I just copied over what I needed. It was much simpler before. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
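For context on what those copied members resolve, a hedged sketch of reading a table's Avro schema directly from the meta client; the builder calls mirror the imports in the diff, and the base path is hypothetical:
```
import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver}

// Assumes an existing Hudi table; error handling omitted.
val metaClient = HoodieTableMetaClient.builder()
  .setConf(spark.sparkContext.hadoopConfiguration)
  .setBasePath("/tmp/hudi_table") // hypothetical path
  .build()
val tableAvroSchema = new TableSchemaResolver(metaClient).getTableAvroSchema
```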
[GitHub] [hudi] jonvex commented on a diff in pull request #9276: [HUDI-6635] Hudi Spark Integration Redesign MOR and Bootstrap reading
jonvex commented on code in PR #9276: URL: https://github.com/apache/hudi/pull/9276#discussion_r1284844263 ## hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/Iterators.scala: ## @@ -83,8 +94,8 @@ class LogFileIterator(logFiles: List[HoodieLogFile], } .getOrElse(new TypedProperties()) - protected override val avroSchema: Schema = new Schema.Parser().parse(requiredSchema.avroSchemaStr) - protected override val structTypeSchema: StructType = requiredSchema.structTypeSchema + protected override val avroSchema: Schema = requiredAvroSchema Review Comment: There isn't any functional difference in this code. I just added an alternate constructor so that I didn't have to create another table schema just to deconstruct it:
```
def this(logFiles: List[HoodieLogFile],
         partitionPath: Path,
         tableSchema: HoodieTableSchema,
         requiredSchema: HoodieTableSchema,
         tableState: HoodieTableState,
         config: Configuration) {
  this(logFiles, partitionPath, tableSchema, requiredSchema.structTypeSchema,
    new Schema.Parser().parse(requiredSchema.avroSchemaStr), tableState, config)
}
```
-- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #9369: [HUDI-6386] Fix flakey multiwriter tests
hudi-bot commented on PR #9369: URL: https://github.com/apache/hudi/pull/9369#issuecomment-1666166562 ## CI report: * 68ba8ae97df89a261298dbb06c80f43c57159969 UNKNOWN Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #9345: [HUDI-6649] Fix column stat based data filtering for MOR
hudi-bot commented on PR #9345: URL: https://github.com/apache/hudi/pull/9345#issuecomment-1666166448 ## CI report: * 600d19c601ec8b224643b7e4c0a7c0f7a2e1e290 Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19089) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #9345: [HUDI-6649] Fix column stat based data filtering for MOR
hudi-bot commented on PR #9345: URL: https://github.com/apache/hudi/pull/9345#issuecomment-1666159747 ## CI report: * 600d19c601ec8b224643b7e4c0a7c0f7a2e1e290 UNKNOWN Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] jonvex opened a new pull request, #9369: [HUDI-6386] Fix flakey multiwriter tests
jonvex opened a new pull request, #9369: URL: https://github.com/apache/hudi/pull/9369 ### Change Logs getHoodieWriteClient was not thread safe, so I overrode the method so that it is, and more than one client can be used. ### Impact Tests are no longer flaky. ### Risk level (write none, low medium or high below) none ### Documentation Update _Describe any necessary documentation update if there is any new feature, config, or user-facing change_ N/A ### Contributor's checklist - [ ] Read through [contributor's guide](https://hudi.apache.org/contribute/how-to-contribute) - [ ] Change Logs and Impact were stated clearly - [ ] Adequate tests were added if applicable - [ ] CI passed -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
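A minimal sketch of the pattern described in the change log; the type parameter and factory stand in for the real write client and are illustrative only:
```
// Guard client creation with a lock so concurrent test threads each get a
// fully constructed client instead of racing on shared state.
class WriteClientFactory[C](create: () => C) {
  private var created: List[C] = Nil

  def getHoodieWriteClient(): C = this.synchronized {
    val client = create()
    created = client :: created // remembered so every client can be closed in teardown
    client
  }
}
```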
[jira] [Updated] (HUDI-6649) Fix column stat based data filtering for MOR
[ https://issues.apache.org/jira/browse/HUDI-6649?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] ASF GitHub Bot updated HUDI-6649: - Labels: pull-request-available (was: ) > Fix column stat based data filtering for MOR > > > Key: HUDI-6649 > URL: https://issues.apache.org/jira/browse/HUDI-6649 > Project: Apache Hudi > Issue Type: Bug > Components: index, writer-core >Reporter: Lokesh Jain >Assignee: Lokesh Jain >Priority: Major > Labels: pull-request-available > > Currently MOR snapshot relation does not use the column stats index for > pruning the files in its queries. The Jira aims to add support for pruning > the file slices based on column stats in case of MOR. > -- This message was sent by Atlassian Jira (v8.20.10#820010)
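For anyone verifying the fix, a hedged sketch of a read that exercises column-stats-based skipping on a MOR table; the config keys are the standard Hudi ones, while the path and predicate are hypothetical:
```
// Data skipping relies on the metadata table's column stats index.
val pruned = spark.read.format("hudi")
  .option("hoodie.metadata.enabled", "true")
  .option("hoodie.enable.data.skipping", "true")
  .load("/tmp/mor_table")    // hypothetical MOR table
  .filter("price > 100")     // predicate eligible for column-stats pruning
```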
[GitHub] [hudi] lokeshj1703 commented on pull request #9345: [HUDI-6649] Fix column stat based data filtering for MOR
lokeshj1703 commented on PR #9345: URL: https://github.com/apache/hudi/pull/9345#issuecomment-1666135202 I couldn't find a way to set up async compaction in the test. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[jira] [Updated] (HUDI-6649) Fix column stat based data filtering for MOR
[ https://issues.apache.org/jira/browse/HUDI-6649?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Lokesh Jain updated HUDI-6649: -- Description: Currently MOR snapshot relation does not use the column stats index for pruning the files in its queries. The Jira aims to add support for pruning the file slices based on column stats in case of MOR. was:Currently MOR snapshot and incremental relation does not use the column stats index for pruning the files in its queries. The Jira aims to add support for pruning the file slices based on column stats in case of MOR. > Fix column stat based data filtering for MOR > > > Key: HUDI-6649 > URL: https://issues.apache.org/jira/browse/HUDI-6649 > Project: Apache Hudi > Issue Type: Bug > Components: index, writer-core >Reporter: Lokesh Jain >Assignee: Lokesh Jain >Priority: Major > > Currently MOR snapshot relation does not use the column stats index for > pruning the files in its queries. The Jira aims to add support for pruning > the file slices based on column stats in case of MOR. > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Created] (HUDI-6649) Fix column stat based data filtering for MOR
Lokesh Jain created HUDI-6649: - Summary: Fix column stat based data filtering for MOR Key: HUDI-6649 URL: https://issues.apache.org/jira/browse/HUDI-6649 Project: Apache Hudi Issue Type: Bug Components: index, writer-core Reporter: Lokesh Jain Assignee: Lokesh Jain Currently MOR snapshot and incremental relation does not use the column stats index for pruning the files in its queries. The Jira aims to add support for pruning the file slices based on column stats. -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Updated] (HUDI-6649) Fix column stat based data filtering for MOR
[ https://issues.apache.org/jira/browse/HUDI-6649?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Lokesh Jain updated HUDI-6649: -- Description: Currently MOR snapshot and incremental relation does not use the column stats index for pruning the files in its queries. The Jira aims to add support for pruning the file slices based on column stats in case of MOR. (was: Currently MOR snapshot and incremental relation does not use the column stats index for pruning the files in its queries. The Jira aims to add support for pruning the file slices based on column stats.) > Fix column stat based data filtering for MOR > > > Key: HUDI-6649 > URL: https://issues.apache.org/jira/browse/HUDI-6649 > Project: Apache Hudi > Issue Type: Bug > Components: index, writer-core >Reporter: Lokesh Jain >Assignee: Lokesh Jain >Priority: Major > > Currently MOR snapshot and incremental relation does not use the column stats > index for pruning the files in its queries. The Jira aims to add support for > pruning the file slices based on column stats in case of MOR. -- This message was sent by Atlassian Jira (v8.20.10#820010)
[GitHub] [hudi] yihua commented on a diff in pull request #9276: [HUDI-6635] Hudi Spark Integration Redesign MOR and Bootstrap reading
yihua commented on code in PR #9276: URL: https://github.com/apache/hudi/pull/9276#discussion_r1284800578 ## hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/Iterators.scala: ## @@ -83,8 +94,8 @@ class LogFileIterator(logFiles: List[HoodieLogFile], } .getOrElse(new TypedProperties()) - protected override val avroSchema: Schema = new Schema.Parser().parse(requiredSchema.avroSchemaStr) - protected override val structTypeSchema: StructType = requiredSchema.structTypeSchema + protected override val avroSchema: Schema = requiredAvroSchema Review Comment: Is this for performance improvement? Should `requiredSchema: HoodieTableSchema` still be kept in the constructor for supporting schema evolution in the future? ## hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/Iterators.scala: ## @@ -181,17 +194,30 @@ class RecordMergingFileIterator(logFiles: List[HoodieLogFile], baseFileIterator: Iterator[InternalRow], readerSchema: StructType, dataSchema: HoodieTableSchema, -requiredSchema: HoodieTableSchema, +requiredStructTypeSchema: StructType, +requiredAvroSchema: Schema, tableState: HoodieTableState, config: Configuration) - extends LogFileIterator(logFiles, partitionPath, dataSchema, requiredSchema, tableState, config) { + extends LogFileIterator(logFiles, partitionPath, dataSchema, requiredStructTypeSchema, requiredAvroSchema, tableState, config) { + def this(logFiles: List[HoodieLogFile], + partitionPath: Path, + baseFileIterator: Iterator[InternalRow], + readerSchema: StructType, + dataSchema: HoodieTableSchema, + requiredSchema: HoodieTableSchema, + tableState: HoodieTableState, + config: Configuration) { +this(logFiles, partitionPath, baseFileIterator, readerSchema, dataSchema, requiredSchema.structTypeSchema, + new Schema.Parser().parse(requiredSchema.avroSchemaStr), tableState, config) + } def this(split: HoodieMergeOnReadFileSplit, baseFileReader: BaseFileReader, dataSchema: HoodieTableSchema, requiredSchema: HoodieTableSchema, tableState: HoodieTableState, config: Configuration) { this(split.logFiles, getPartitionPath(split), baseFileReader(split.dataFile.get), baseFileReader.schema, dataSchema, requiredSchema, tableState, config) } + Review Comment: nit: remove empty line ## hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/SparkHoodieTableFileIndex.scala: ## @@ -99,7 +96,7 @@ class SparkHoodieTableFileIndex(spark: SparkSession, AvroConversionUtils.convertAvroSchemaToStructType(schemaUtil.getTableAvroSchema) }) - protected lazy val shouldFastBootstrap = configProperties.getBoolean(DATA_QUERIES_ONLY.key, false) + protected lazy val shouldFastBootstrap: Boolean = configProperties.getBoolean(DATA_QUERIES_ONLY.key, false) Review Comment: nit: let's avoid cosmetic changes in this class. ## hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/NewHoodieParquetFileFormat.scala: ## @@ -0,0 +1,353 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql.execution.datasources.parquet + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.hudi.DataSourceReadOptions.{REALTIME_PAYLOAD_COMBINE_OPT_VAL, REALTIME_SKIP_MERGE_OPT_VAL} +import org.apache.hudi.MergeOnReadSnapshotRelation.createPartitionedFile +import org.apache.hudi.common.fs.FSUtils +import org.apache.hudi.common.model.{BaseFile, FileSlice, HoodieLogFile, HoodieRecord} +import org.apache.hudi.common.util.ValidationUtils.checkState +import org.apache.hudi.{HoodieBaseRelation, HoodieSparkUtils, HoodieTableSchema, HoodieTableState, LogFileIterator, MergeOnReadSnapshotRelation, PartitionFileSliceMapping, Re