[ 
https://issues.apache.org/jira/browse/HUDI-8551?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Y Ethan Guo updated HUDI-8551:
------------------------------
    Status: Patch Available  (was: In Progress)

> Allow no precombine field in MOR table
> --------------------------------------
>
>                 Key: HUDI-8551
>                 URL: https://issues.apache.org/jira/browse/HUDI-8551
>             Project: Apache Hudi
>          Issue Type: Sub-task
>            Reporter: Y Ethan Guo
>            Assignee: Y Ethan Guo
>            Priority: Blocker
>              Labels: pull-request-available
>             Fix For: 1.0.1
>
>   Original Estimate: 12h
>  Remaining Estimate: 12h
>
> A MOR table without a precombine field specified in SQL fails to be read.  We 
> should still allow no precombine field in a MOR table; such a table should be 
> treated as natural or commit-time ordering with the default EVENT_TIME_ORDERING 
> mode (i.e., setting the ordering/precombine value to 0).
> {code:java}
> CREATE DATABASE testing_partial_updates;
> CREATE TABLE testing_partial_updates.table1 (
>     ts BIGINT,
>     uuid STRING,
>     rider STRING,
>     driver STRING,
>     fare DOUBLE,
>     city STRING
> ) USING HUDI
> LOCATION 
> 'file:///Users/ethan/Work/tmp/hudi-1.0.0-testing/partial-update/table1'
> TBLPROPERTIES (
>   type = 'mor',
>   primaryKey = 'uuid'
> )
> PARTITIONED BY (city);
> INSERT INTO testing_partial_updates.table1
> VALUES
> (1695159649087,'334e26e9-8355-45cc-97c6-c31daf0df330','rider-A','driver-K',19.10,'san_francisco'),
> (1695091554788,'e96c4396-3fad-413a-a942-4cb36106d721','rider-C','driver-M',27.70
>  ,'san_francisco'),
> (1695046462179,'9909a8b1-2d15-4d3d-8ec9-efc48c536a00','rider-D','driver-L',33.90
>  ,'san_francisco'),
> (1695332066204,'1dced545-862b-4ceb-8b43-d2a568f6616b','rider-E','driver-O',93.50,'san_francisco'),
> (1695516137016,'e3cf430c-889d-4015-bc98-59bdce1e530c','rider-F','driver-P',34.15,'sao_paulo'
>     ),
> (1695376420876,'7a84095f-737f-40bc-b62f-6b69664712d2','rider-G','driver-Q',43.40
>  ,'sao_paulo'    ),
> (1695173887231,'3eeb61f7-c2b0-4636-99bd-5d7a5a1d2c04','rider-I','driver-S',41.06
>  ,'chennai'      ),
> (1695115999911,'c8abbe79-8d89-47ea-b4ce-4d224bae5bfa','rider-J','driver-T',17.85,'chennai');CREATE
>  TABLE merge_source1 (ts bigint, uuid String, fare DOUBLE, city STRING) using 
> parquet;
> INSERT INTO merge_source1 values (1695159649090, 
> '334e26e9-8355-45cc-97c6-c31daf0df330', 25.20, 'san_francisco'), 
> (1695173887240, '3eeb61f7-c2b0-4636-99bd-5d7a5a1d2c04', 50.00, 'chennai');SET 
> hoodie.merge.small.file.group.candidates.limit = 0;MERGE INTO 
> testing_partial_updates.table1 as target
> using (
>   select * from merge_source1
> ) source
> on target.uuid = source.uuid
> when matched then
>  update set ts = source.ts, fare = source.fare
> ;
> spark-sql (default)> select * from testing_partial_updates.table1;
> 24/11/15 16:40:31 ERROR Executor: Exception in task 2.0 in stage 52.0 (TID 98)
> org.apache.spark.SparkException: Encountered error while reading file 
> file:///Users/ethan/Work/tmp/hudi-1.0.0-testing/partial-update/table1/city=chennai/56770d17-8431-49b3-852d-07cb693db466-0_2-11-33_20241115163103124.parquet.
>  Details:
>     at 
> org.apache.spark.sql.errors.QueryExecutionErrors$.cannotReadFilesError(QueryExecutionErrors.scala:864)
>     at 
> org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:293)
>     at 
> org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:129)
>     at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
>     at 
> org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:388)
>     at 
> org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:893)
>     at 
> org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:893)
>     at 
> org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
>     at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
>     at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
>     at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
>     at 
> org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
>     at org.apache.spark.scheduler.Task.run(Task.scala:141)
>     at 
> org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
>     at 
> org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
>     at 
> org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
>     at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
>     at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
>     at 
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
>     at 
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
>     at java.lang.Thread.run(Thread.java:748)
> Caused by: java.lang.NullPointerException
>     at scala.collection.immutable.StringLike.split(StringLike.scala:266)
>     at scala.collection.immutable.StringLike.split$(StringLike.scala:265)
>     at scala.collection.immutable.StringOps.split(StringOps.scala:33)
>     at 
> org.apache.spark.sql.HoodieUnsafeRowUtils$.composeNestedFieldPath(HoodieUnsafeRowUtils.scala:97)
>     at 
> org.apache.spark.sql.HoodieInternalRowUtils$$anon$3.apply(HoodieInternalRowUtils.scala:105)
>     at 
> org.apache.spark.sql.HoodieInternalRowUtils$$anon$3.apply(HoodieInternalRowUtils.scala:103)
>     at 
> java.util.concurrent.ConcurrentHashMap.computeIfAbsent(ConcurrentHashMap.java:1660)
>     at 
> org.apache.spark.sql.HoodieInternalRowUtils$.getCachedPosList(HoodieInternalRowUtils.scala:103)
>     at 
> org.apache.spark.sql.HoodieInternalRowUtils.getCachedPosList(HoodieInternalRowUtils.scala)
>     at 
> org.apache.hudi.common.model.HoodieSparkRecord.getOrderingValue(HoodieSparkRecord.java:319)
>     at 
> org.apache.hudi.DefaultSparkRecordMerger.partialMerge(DefaultSparkRecordMerger.java:113)
>     at 
> org.apache.hudi.common.table.read.HoodieBaseFileGroupRecordBuffer.merge(HoodieBaseFileGroupRecordBuffer.java:388)
>     at 
> org.apache.hudi.common.table.read.HoodiePositionBasedFileGroupRecordBuffer.hasNextBaseRecord(HoodiePositionBasedFileGroupRecordBuffer.java:227)
>     at 
> org.apache.hudi.common.table.read.HoodieKeyBasedFileGroupRecordBuffer.doHasNext(HoodieKeyBasedFileGroupRecordBuffer.java:135)
>     at 
> org.apache.hudi.common.table.read.HoodieBaseFileGroupRecordBuffer.hasNext(HoodieBaseFileGroupRecordBuffer.java:149)
>     at 
> org.apache.hudi.common.table.read.HoodieFileGroupReader.hasNext(HoodieFileGroupReader.java:235)
>     at 
> org.apache.hudi.common.table.read.HoodieFileGroupReader$HoodieFileGroupReaderIterator.hasNext(HoodieFileGroupReader.java:289)
>     at 
> org.apache.spark.sql.execution.datasources.parquet.HoodieFileGroupReaderBasedParquetFileFormat$$anon$1.hasNext(HoodieFileGroupReaderBasedParquetFileFormat.scala:273)
>     at 
> org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:129)
>     at 
> org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:283)
>     ... 19 more {code}



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

Reply via email to