[ https://issues.apache.org/jira/browse/HUDI-7812?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Ethan Guo updated HUDI-7812: ---------------------------- Fix Version/s: 0.15.0 > Async Clustering w/ row writer fails due to timetravel query validation > ------------------------------------------------------------------------ > > Key: HUDI-7812 > URL: https://issues.apache.org/jira/browse/HUDI-7812 > Project: Apache Hudi > Issue Type: Bug > Components: clustering > Reporter: sivabalan narayanan > Assignee: sivabalan narayanan > Priority: Major > Labels: pull-request-available > Fix For: 0.15.0 > > > In the clustering flow with the row writer enabled, we trigger a time travel query to > read input records. But the query side fails if there are any pending commits > (due to new ingestion) whose timestamp < clustering instant time. We need to > relax this constraint. > > {code:java} > Failed to execute CLUSTERING service > java.util.concurrent.CompletionException: > org.apache.hudi.exception.HoodieTimeTravelException: Time travel's timestamp > '20240406123837295' must be earlier than the first incomplete commit > timestamp '20240406123834233'. > at > java.util.concurrent.CompletableFuture.encodeThrowable(CompletableFuture.java:273) > ~[?:1.8.0_392-internal] > at > java.util.concurrent.CompletableFuture.completeThrowable(CompletableFuture.java:280) > ~[?:1.8.0_392-internal] > at > java.util.concurrent.CompletableFuture$AsyncSupply.run(CompletableFuture.java:1606) > ~[?:1.8.0_392-internal] > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) > ~[?:1.8.0_392-internal] > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) > ~[?:1.8.0_392-internal] > at java.lang.Thread.run(Thread.java:750) ~[?:1.8.0_392-internal] > Caused by: org.apache.hudi.exception.HoodieTimeTravelException: Time > travel's timestamp '20240406123837295' must be earlier than the first > incomplete commit timestamp '20240406123834233'. 
> at > org.apache.hudi.common.table.timeline.TimelineUtils.validateTimestampAsOf(TimelineUtils.java:369) > ~[hudi-utilities-bundle_2.12-1.8.1-INTERNAL.jar:1.8.1-INTERNAL] > at > org.apache.hudi.HoodieBaseRelation.$anonfun$listLatestFileSlices$1(HoodieBaseRelation.scala:416) > ~[hudi-utilities-bundle_2.12-1.8.1-INTERNAL.jar:1.8.1-INTERNAL] > at > org.apache.hudi.HoodieBaseRelation.$anonfun$listLatestFileSlices$1$adapted(HoodieBaseRelation.scala:416) > ~[hudi-utilities-bundle_2.12-1.8.1-INTERNAL.jar:1.8.1-INTERNAL] > at scala.Option.foreach(Option.scala:407) > ~[scala-library-2.12.17.jar:?] > at > org.apache.hudi.HoodieBaseRelation.listLatestFileSlices(HoodieBaseRelation.scala:416) > ~[hudi-utilities-bundle_2.12-1.8.1-INTERNAL.jar:1.8.1-INTERNAL] > at > org.apache.hudi.BaseMergeOnReadSnapshotRelation.collectFileSplits(MergeOnReadSnapshotRelation.scala:225) > ~[hudi-utilities-bundle_2.12-1.8.1-INTERNAL.jar:1.8.1-INTERNAL] > at > org.apache.hudi.BaseMergeOnReadSnapshotRelation.collectFileSplits(MergeOnReadSnapshotRelation.scala:68) > ~[hudi-utilities-bundle_2.12-1.8.1-INTERNAL.jar:1.8.1-INTERNAL] > at > org.apache.hudi.HoodieBaseRelation.buildScan(HoodieBaseRelation.scala:369) > ~[hudi-utilities-bundle_2.12-1.8.1-INTERNAL.jar:1.8.1-INTERNAL] > at > org.apache.spark.sql.execution.datasources.DataSourceStrategy$.$anonfun$apply$4(DataSourceStrategy.scala:323) > ~[spark-sql_2.12-3.2.3.jar:1.8.1-INTERNAL] > at > org.apache.spark.sql.execution.datasources.DataSourceStrategy$.$anonfun$pruneFilterProject$1(DataSourceStrategy.scala:357) > ~[spark-sql_2.12-3.2.3.jar:1.8.1-INTERNAL] > at > org.apache.spark.sql.execution.datasources.DataSourceStrategy$.pruneFilterProjectRaw(DataSourceStrategy.scala:413) > ~[spark-sql_2.12-3.2.3.jar:1.8.1-INTERNAL] > at > org.apache.spark.sql.execution.datasources.DataSourceStrategy$.pruneFilterProject(DataSourceStrategy.scala:356) > ~[spark-sql_2.12-3.2.3.jar:1.8.1-INTERNAL] > at > 
org.apache.spark.sql.execution.datasources.DataSourceStrategy$.apply(DataSourceStrategy.scala:323) > ~[spark-sql_2.12-3.2.3.jar:1.8.1-INTERNAL] > at > org.apache.spark.sql.catalyst.planning.QueryPlanner.$anonfun$plan$1(QueryPlanner.scala:63) > ~[spark-catalyst_2.12-3.2.3.jar:3.2.3] > at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486) > ~[scala-library-2.12.17.jar:?] > at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492) > ~[scala-library-2.12.17.jar:?] > at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491) > ~[scala-library-2.12.17.jar:?] > at > org.apache.spark.sql.catalyst.planning.QueryPlanner.plan(QueryPlanner.scala:93) > ~[spark-catalyst_2.12-3.2.3.jar:3.2.3] > at > org.apache.spark.sql.execution.SparkStrategies.plan(SparkStrategies.scala:67) > ~[spark-sql_2.12-3.2.3.jar:1.8.1-INTERNAL] > at > org.apache.spark.sql.catalyst.planning.QueryPlanner.$anonfun$plan$3(QueryPlanner.scala:78) > ~[spark-catalyst_2.12-3.2.3.jar:3.2.3] > at > scala.collection.TraversableOnce$folder$1.apply(TraversableOnce.scala:196) > ~[scala-library-2.12.17.jar:?] > at > scala.collection.TraversableOnce$folder$1.apply(TraversableOnce.scala:194) > ~[scala-library-2.12.17.jar:?] > at scala.collection.Iterator.foreach(Iterator.scala:943) > ~[scala-library-2.12.17.jar:?] > at scala.collection.Iterator.foreach$(Iterator.scala:943) > ~[scala-library-2.12.17.jar:?] > at scala.collection.AbstractIterator.foreach(Iterator.scala:1431) > ~[scala-library-2.12.17.jar:?] > at > scala.collection.TraversableOnce.foldLeft(TraversableOnce.scala:199) > ~[scala-library-2.12.17.jar:?] > at > scala.collection.TraversableOnce.foldLeft$(TraversableOnce.scala:192) > ~[scala-library-2.12.17.jar:?] > at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1431) > ~[scala-library-2.12.17.jar:?] 
> at > org.apache.spark.sql.catalyst.planning.QueryPlanner.$anonfun$plan$2(QueryPlanner.scala:75) > ~[spark-catalyst_2.12-3.2.3.jar:3.2.3] > at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486) > ~[scala-library-2.12.17.jar:?] > at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492) > ~[scala-library-2.12.17.jar:?] > at > org.apache.spark.sql.catalyst.planning.QueryPlanner.plan(QueryPlanner.scala:93) > ~[spark-catalyst_2.12-3.2.3.jar:3.2.3] > at > org.apache.spark.sql.execution.SparkStrategies.plan(SparkStrategies.scala:67) > ~[spark-sql_2.12-3.2.3.jar:1.8.1-INTERNAL] > at > org.apache.spark.sql.catalyst.planning.QueryPlanner.$anonfun$plan$3(QueryPlanner.scala:78) > ~[spark-catalyst_2.12-3.2.3.jar:3.2.3] > at > scala.collection.TraversableOnce$folder$1.apply(TraversableOnce.scala:196) > ~[scala-library-2.12.17.jar:?] > at > scala.collection.TraversableOnce$folder$1.apply(TraversableOnce.scala:194) > ~[scala-library-2.12.17.jar:?] > at scala.collection.Iterator.foreach(Iterator.scala:943) > ~[scala-library-2.12.17.jar:?] > at scala.collection.Iterator.foreach$(Iterator.scala:943) > ~[scala-library-2.12.17.jar:?] > at scala.collection.AbstractIterator.foreach(Iterator.scala:1431) > ~[scala-library-2.12.17.jar:?] > at > scala.collection.TraversableOnce.foldLeft(TraversableOnce.scala:199) > ~[scala-library-2.12.17.jar:?] > at > scala.collection.TraversableOnce.foldLeft$(TraversableOnce.scala:192) > ~[scala-library-2.12.17.jar:?] > at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1431) > ~[scala-library-2.12.17.jar:?] > at > org.apache.spark.sql.catalyst.planning.QueryPlanner.$anonfun$plan$2(QueryPlanner.scala:75) > ~[spark-catalyst_2.12-3.2.3.jar:3.2.3] > at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486) > ~[scala-library-2.12.17.jar:?] > at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492) > ~[scala-library-2.12.17.jar:?] 
> at > org.apache.spark.sql.catalyst.planning.QueryPlanner.plan(QueryPlanner.scala:93) > ~[spark-catalyst_2.12-3.2.3.jar:3.2.3] > at > org.apache.spark.sql.execution.SparkStrategies.plan(SparkStrategies.scala:67) > ~[spark-sql_2.12-3.2.3.jar:1.8.1-INTERNAL] > at > org.apache.spark.sql.execution.QueryExecution$.createSparkPlan(QueryExecution.scala:453) > ~[spark-sql_2.12-3.2.3.jar:1.8.1-INTERNAL] > at > org.apache.spark.sql.execution.QueryExecution.$anonfun$sparkPlan$1(QueryExecution.scala:144) > ~[spark-sql_2.12-3.2.3.jar:1.8.1-INTERNAL] > at > org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:111) > ~[spark-catalyst_2.12-3.2.3.jar:3.2.3] > at > org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$1(QueryExecution.scala:183) > ~[spark-sql_2.12-3.2.3.jar:1.8.1-INTERNAL] > at > org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775) > ~[spark-sql_2.12-3.2.3.jar:3.2.3] > at > org.apache.spark.sql.execution.QueryExecution.executePhase(QueryExecution.scala:183) > ~[spark-sql_2.12-3.2.3.jar:1.8.1-INTERNAL] > at > org.apache.spark.sql.execution.QueryExecution.sparkPlan$lzycompute(QueryExecution.scala:144) > ~[spark-sql_2.12-3.2.3.jar:1.8.1-INTERNAL] > at > org.apache.spark.sql.execution.QueryExecution.sparkPlan(QueryExecution.scala:137) > ~[spark-sql_2.12-3.2.3.jar:1.8.1-INTERNAL] > at > org.apache.spark.sql.execution.QueryExecution.$anonfun$executedPlan$1(QueryExecution.scala:157) > ~[spark-sql_2.12-3.2.3.jar:1.8.1-INTERNAL] > at > org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:111) > ~[spark-catalyst_2.12-3.2.3.jar:3.2.3] > at > org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$1(QueryExecution.scala:183) > ~[spark-sql_2.12-3.2.3.jar:1.8.1-INTERNAL] > at > org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775) > ~[spark-sql_2.12-3.2.3.jar:3.2.3] > at > 
org.apache.spark.sql.execution.QueryExecution.executePhase(QueryExecution.scala:183) > ~[spark-sql_2.12-3.2.3.jar:1.8.1-INTERNAL] > at > org.apache.spark.sql.execution.QueryExecution.executedPlan$lzycompute(QueryExecution.scala:157) > ~[spark-sql_2.12-3.2.3.jar:1.8.1-INTERNAL] > at > org.apache.spark.sql.execution.QueryExecution.executedPlan(QueryExecution.scala:150) > ~[spark-sql_2.12-3.2.3.jar:1.8.1-INTERNAL] > at > org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:172) > ~[spark-sql_2.12-3.2.3.jar:1.8.1-INTERNAL] > at > org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:171) > ~[spark-sql_2.12-3.2.3.jar:1.8.1-INTERNAL] > at > org.apache.hudi.HoodieDatasetBulkInsertHelper$.bulkInsert(HoodieDatasetBulkInsertHelper.scala:150) > ~[hudi-utilities-bundle_2.12-1.8.1-INTERNAL.jar:1.8.1-INTERNAL] > at > org.apache.hudi.HoodieDatasetBulkInsertHelper.bulkInsert(HoodieDatasetBulkInsertHelper.scala) > ~[hudi-utilities-bundle_2.12-1.8.1-INTERNAL.jar:1.8.1-INTERNAL] > at > org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy.performClusteringWithRecordsAsRow(SparkSortAndSizeExecutionStrategy.java:76) > ~[hudi-utilities-bundle_2.12-1.8.1-INTERNAL.jar:1.8.1-INTERNAL] > at > org.apache.hudi.client.clustering.run.strategy.MultipleSparkJobExecutionStrategy.lambda$runClusteringForGroupAsyncAsRow$7(MultipleSparkJobExecutionStrategy.java:263) > ~[hudi-utilities-bundle_2.12-1.8.1-INTERNAL.jar:1.8.1-INTERNAL] > at > java.util.concurrent.CompletableFuture$AsyncSupply.run(CompletableFuture.java:1604) > ~[?:1.8.0_392-internal] > ... 3 more > {code} > -- This message was sent by Atlassian Jira (v8.20.10#820010)