hudi-bot opened a new issue, #15090: URL: https://github.com/apache/hudi/issues/15090
## JIRA info - Link: https://issues.apache.org/jira/browse/HUDI-3749 - Type: Task --- ## Comments 01/Apr/22 23:08;shivnarayan;build hudi locally w/ latest master for -Dspark3.1 profile. And tried to run it against EMR spark. 1. writes went through fine w/ metadata enabled 2. Read from hudi failed w/ NoClassDefFoundError {code:java} hudiDf.count java.lang.NoClassDefFoundError: org/apache/spark/sql/catalyst/expressions/AliasHelper at org.apache.hudi.BaseFileOnlyRelation.$anonfun$collectFileSplits$2(BaseFileOnlyRelation.scala:89) at scala.collection.TraversableLike.$anonfun$flatMap$1(TraversableLike.scala:245) at scala.collection.immutable.List.foreach(List.scala:392) at scala.collection.TraversableLike.flatMap(TraversableLike.scala:245) at scala.collection.TraversableLike.flatMap$(TraversableLike.scala:242) at scala.collection.immutable.List.flatMap(List.scala:355) at org.apache.hudi.BaseFileOnlyRelation.$anonfun$collectFileSplits$1(BaseFileOnlyRelation.scala:86) at scala.collection.immutable.Stream.flatMap(Stream.scala:489) at org.apache.hudi.BaseFileOnlyRelation.collectFileSplits(BaseFileOnlyRelation.scala:85) at org.apache.hudi.HoodieBaseRelation.buildScan(HoodieBaseRelation.scala:198) at org.apache.spark.sql.execution.datasources.DataSourceStrategy.$anonfun$apply$4(DataSourceStrategy.scala:298) at org.apache.spark.sql.execution.datasources.DataSourceStrategy.$anonfun$pruneFilterProject$1(DataSourceStrategy.scala:331) at org.apache.spark.sql.execution.datasources.DataSourceStrategy.pruneFilterProjectRaw(DataSourceStrategy.scala:386) at org.apache.spark.sql.execution.datasources.DataSourceStrategy.pruneFilterProject(DataSourceStrategy.scala:330) at org.apache.spark.sql.execution.datasources.DataSourceStrategy.apply(DataSourceStrategy.scala:298) at org.apache.spark.sql.catalyst.planning.QueryPlanner.$anonfun$plan$1(QueryPlanner.scala:63) at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:484) at 
scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:490) at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:489) at org.apache.spark.sql.catalyst.planning.QueryPlanner.plan(QueryPlanner.scala:93) at org.apache.spark.sql.execution.SparkStrategies.plan(SparkStrategies.scala:69) at org.apache.spark.sql.catalyst.planning.QueryPlanner.$anonfun$plan$3(QueryPlanner.scala:78) at scala.collection.TraversableOnce.$anonfun$foldLeft$1(TraversableOnce.scala:162) at scala.collection.TraversableOnce.$anonfun$foldLeft$1$adapted(TraversableOnce.scala:162) at scala.collection.Iterator.foreach(Iterator.scala:941) at scala.collection.Iterator.foreach$(Iterator.scala:941) at scala.collection.AbstractIterator.foreach(Iterator.scala:1429) at scala.collection.TraversableOnce.foldLeft(TraversableOnce.scala:162) at scala.collection.TraversableOnce.foldLeft$(TraversableOnce.scala:160) at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1429) at org.apache.spark.sql.catalyst.planning.QueryPlanner.$anonfun$plan$2(QueryPlanner.scala:75) at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:484) at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:490) at org.apache.spark.sql.catalyst.planning.QueryPlanner.plan(QueryPlanner.scala:93) at org.apache.spark.sql.execution.SparkStrategies.plan(SparkStrategies.scala:69) at org.apache.spark.sql.catalyst.planning.QueryPlanner.$anonfun$plan$3(QueryPlanner.scala:78) at scala.collection.TraversableOnce.$anonfun$foldLeft$1(TraversableOnce.scala:162) at scala.collection.TraversableOnce.$anonfun$foldLeft$1$adapted(TraversableOnce.scala:162) at scala.collection.Iterator.foreach(Iterator.scala:941) at scala.collection.Iterator.foreach$(Iterator.scala:941) at scala.collection.AbstractIterator.foreach(Iterator.scala:1429) at scala.collection.TraversableOnce.foldLeft(TraversableOnce.scala:162) at scala.collection.TraversableOnce.foldLeft$(TraversableOnce.scala:160) at 
scala.collection.AbstractIterator.foldLeft(Iterator.scala:1429) at org.apache.spark.sql.catalyst.planning.QueryPlanner.$anonfun$plan$2(QueryPlanner.scala:75) at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:484) at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:490) at org.apache.spark.sql.catalyst.planning.QueryPlanner.plan(QueryPlanner.scala:93) at org.apache.spark.sql.execution.SparkStrategies.plan(SparkStrategies.scala:69) at org.apache.spark.sql.execution.QueryExecution$.createSparkPlan(QueryExecution.scala:365) at org.apache.spark.sql.execution.QueryExecution.$anonfun$sparkPlan$1(QueryExecution.scala:94) at org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:149) at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$1(QueryExecution.scala:153) at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:764) at org.apache.spark.sql.execution.QueryExecution.executePhase(QueryExecution.scala:153) at org.apache.spark.sql.execution.QueryExecution.sparkPlan$lzycompute(QueryExecution.scala:94) at org.apache.spark.sql.execution.QueryExecution.sparkPlan(QueryExecution.scala:87) at org.apache.spark.sql.execution.QueryExecution.$anonfun$executedPlan$1(QueryExecution.scala:107) at org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:149) at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$1(QueryExecution.scala:153) at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:764) at org.apache.spark.sql.execution.QueryExecution.executePhase(QueryExecution.scala:153) at org.apache.spark.sql.execution.QueryExecution.executedPlan$lzycompute(QueryExecution.scala:104) at org.apache.spark.sql.execution.QueryExecution.executedPlan(QueryExecution.scala:100) at org.apache.spark.sql.execution.QueryExecution.$anonfun$writePlans$5(QueryExecution.scala:219) at org.apache.spark.sql.catalyst.plans.QueryPlan$.append(QueryPlan.scala:381) at 
org.apache.spark.sql.execution.QueryExecution.org$apache$spark$sql$execution$QueryExecution$$writePlans(QueryExecution.scala:219) at org.apache.spark.sql.execution.QueryExecution.toString(QueryExecution.scala:227) at org.apache.spark.sql.execution.SQLExecution$.executeQuery$1(SQLExecution.scala:99) at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:132) at org.apache.spark.sql.catalyst.QueryPlanningTracker$.withTracker(QueryPlanningTracker.scala:104) at org.apache.spark.sql.execution.SQLExecution$.withTracker(SQLExecution.scala:227) at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:132) at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:248) at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:131) at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:764) at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:68) at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3665) at org.apache.spark.sql.Dataset.count(Dataset.scala:3020) ... 57 elided Caused by: java.lang.ClassNotFoundException: org.apache.spark.sql.catalyst.expressions.AliasHelper at java.net.URLClassLoader.findClass(URLClassLoader.java:387) at java.lang.ClassLoader.loadClass(ClassLoader.java:418) at java.lang.ClassLoader.loadClass(ClassLoader.java:351) ... 136 more {code} Code snippet used to test above {code:java} import scala.collection.JavaConversions._ import org.apache.spark.sql.SaveMode._ import org.apache.hudi.DataSourceReadOptions._ import org.apache.hudi.DataSourceWriteOptions._ import org.apache.hudi.config.HoodieWriteConfig._ sys.props.update("spark.ui.proxyBase", "") val df = spark.read.parquet("s3_input_data") val basePath = "/tmp/hudi_tbl7" df.write.format("hudi"). option(PRECOMBINE_FIELD_OPT_KEY, "created_at"). option(RECORDKEY_FIELD_OPT_KEY, "id"). 
option(PARTITIONPATH_FIELD_OPT_KEY, "type"). option(TABLE_NAME, "hudi_tbl1"). option("hoodie.embed.timeline.server","false"). mode(Overwrite). save(basePath) df.write.format("hudi"). option(PRECOMBINE_FIELD_OPT_KEY, "created_at"). option(RECORDKEY_FIELD_OPT_KEY, "id"). option(PARTITIONPATH_FIELD_OPT_KEY, "type"). option(TABLE_NAME, "hudi_tbl1"). option("hoodie.embed.timeline.server","false"). mode(Append). save(basePath) val hudiDf = spark.read.format("hudi").load(basePath) hudiDf.count {code} ;;; --- 05/Apr/22 02:44;shivnarayan;CC [~uditme] ;;; --- 12/Apr/22 05:55;xushiyan;[~shivnarayan] what's the done criteria for this ticket? can you please put down in the description?;;; --- 15/Apr/22 18:08;shivnarayan;regular hive sync worked out of the box. {code:java} df.write.format("hudi"). option(PRECOMBINE_FIELD_OPT_KEY, "tpep_dropoff_datetime"). option(RECORDKEY_FIELD_OPT_KEY, "tpep_pickup_datetime"). option(PARTITIONPATH_FIELD_OPT_KEY, "date_col"). option(TABLE_NAME, "hudi_tbl1"). option("hoodie.embed.timeline.server","false"). option("hoodie.datasource.hive_sync.enable","true"). option("hoodie.datasource.hive_sync.database","default"). option("hoodie.datasource.hive_sync.table","test_tbl3"). option("hoodie.datasource.hive_sync.mode","hms"). option("hoodie.datasource.hive_sync.partition_fields","_hoodie_partition_path"). mode(Overwrite). 
save(basePath) {code} via beeline: {code:java} select * from test_tbl3 limit 5;{code} {code:java} +--------------------------------+---------------------------------+-------------------------------+----------------------------------------------------+---------------------+---------------------------------+----------------------------------+----------------------------+--------------------------+-----------------------+-------------------------------+-------------------------+-------------------------+-------------------------+------------------------+------------------+--------------------+-----------------------+-------------------------+----------------------------------+-------------------------+---------------------------------+---------------------+-----------------------------------+ | test_tbl3._hoodie_commit_time | test_tbl3._hoodie_commit_seqno | test_tbl3._hoodie_record_key | test_tbl3._hoodie_file_name | test_tbl3.vendorid | test_tbl3.tpep_pickup_datetime | test_tbl3.tpep_dropoff_datetime | test_tbl3.passenger_count | test_tbl3.trip_distance | test_tbl3.ratecodeid | test_tbl3.store_and_fwd_flag | test_tbl3.pulocationid | test_tbl3.dolocationid | test_tbl3.payment_type | test_tbl3.fare_amount | test_tbl3.extra | test_tbl3.mta_tax | test_tbl3.tip_amount | test_tbl3.tolls_amount | test_tbl3.improvement_surcharge | test_tbl3.total_amount | test_tbl3.congestion_surcharge | test_tbl3.date_col | test_tbl3._hoodie_partition_path | 
+--------------------------------+---------------------------------+-------------------------------+----------------------------------------------------+---------------------+---------------------------------+----------------------------------+----------------------------+--------------------------+-----------------------+-------------------------------+-------------------------+-------------------------+-------------------------+------------------------+------------------+--------------------+-----------------------+-------------------------+----------------------------------+-------------------------+---------------------------------+---------------------+-----------------------------------+ | 20220415180627021 | 20220415180627021_7_1085992 | 2008-12-31 23:02:59 | e78169d4-03a8-40e0-ad11-9ae43a52b565-0_7-155-6608_20220415180627021.parquet | 2 | 2008-12-31 23:02:59 | 2009-01-01 18:22:41 | 1 | 0.99 | 1 | N | 249 | 90 | 2 | 7.0 | 1.0 | 0.5 | 0.0 | 0.0 | 0.3 | 11.3 | 2.5 | 2008-12-31 | 2008-12-31 | | 20220415180627021 | 20220415180627021_7_1085996 | 2008-12-31 23:07:03 | e78169d4-03a8-40e0-ad11-9ae43a52b565-0_7-155-6608_20220415180627021.parquet | 2 | 2008-12-31 23:07:03 | 2008-12-31 23:19:26 | 1 | 1.39 | 1 | N | 107 | 162 | 2 | 8.5 | 0.0 | 0.5 | 0.0 | 0.0 | 0.3 | 11.8 | 2.5 | 2008-12-31 | 2008-12-31 | | 20220415180627021 | 20220415180627021_7_1085998 | 2008-12-31 23:43:51 | e78169d4-03a8-40e0-ad11-9ae43a52b565-0_7-155-6608_20220415180627021.parquet | 2 | 2008-12-31 23:43:51 | 2009-01-01 10:32:34 | 1 | 0.79 | 1 | N | 170 | 264 | 1 | 9.5 | 2.5 | 0.5 | 2.56 | 0.0 | 0.3 | 15.36 | 0.0 | 2008-12-31 | 2008-12-31 | | 20220415180627021 | 20220415180627021_7_1086000 | 2008-12-31 23:03:52 | e78169d4-03a8-40e0-ad11-9ae43a52b565-0_7-155-6608_20220415180627021.parquet | 2 | 2008-12-31 23:03:52 | 2008-12-31 23:22:12 | 1 | 2.42 | 1 | N | 107 | 237 | 2 | 14.0 | 0.0 | 0.5 | 0.0 | 0.0 | 0.3 | 17.3 | 2.5 | 2008-12-31 | 2008-12-31 | | 20220415180627021 | 20220415180627021_0_856333 | 
2009-01-01 00:02:19 | f4cb2f09-25c9-48ef-8c81-7b58156f397a-0_0-155-6601_20220415180627021.parquet | 2 | 2009-01-01 00:02:19 | 2009-01-01 09:43:46 | 1 | 1.86 | 1 | N | 45 | 107 | 2 | 10.0 | 0.0 | 0.5 | 0.0 | 0.0 | 0.3 | 13.3 | 2.5 | 2009-01-01 | 2009-01-01 | +--------------------------------+---------------------------------+-------------------------------+----------------------------------------------------+---------------------+---------------------------------+----------------------------------+----------------------------+--------------------------+-----------------------+-------------------------------+-------------------------+-------------------------+-------------------------+------------------------+------------------+--------------------+-----------------------+-------------------------+----------------------------------+-------------------------+---------------------------------+---------------------+-----------------------------------+ {code} ;;; --- 15/Apr/22 18:09;shivnarayan;Handing it off to [~uditme] to take it from here. [~xushiyan] : I will let Udit drive this since the AWS folks need to upstream the changes they have internally to OSS anyway. ;;; -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
