[ https://issues.apache.org/jira/browse/HUDI-3549?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Alexey Kudinkin updated HUDI-3549:
----------------------------------
    Status: In Progress  (was: Open)

> Investigate spark3 read issues w/ hudi spark bundle 3.2 with S3 dataset
> ------------------------------------------------------------------------
>
>                 Key: HUDI-3549
>                 URL: https://issues.apache.org/jira/browse/HUDI-3549
>             Project: Apache Hudi
>          Issue Type: Bug
>          Components: spark
>            Reporter: sivabalan narayanan
>            Assignee: Alexey Kudinkin
>            Priority: Blocker
>              Labels: pull-request-available
>             Fix For: 0.11.0
>
>
> {code:java}
> scala> df.write.format("hudi").
>      |   options(getQuickstartWriteConfigs).
>      |   option(PRECOMBINE_FIELD_OPT_KEY, "ts").
>      |   option(RECORDKEY_FIELD_OPT_KEY, "uuid").
>      |   option(PARTITIONPATH_FIELD_OPT_KEY, "partitionpath").
>      |   option(TABLE_NAME, tableName).
>      |   mode(Overwrite).
>      |   save(basePath)
> warning: one deprecation; for details, enable `:setting -deprecation' or `:replay -deprecation'
> 2022-03-02 14:57:00,922 WARN config.DFSPropertiesConfiguration: Cannot find HUDI_CONF_DIR, please set it as the dir of hudi-defaults.conf
> 2022-03-02 14:57:00,930 WARN config.DFSPropertiesConfiguration: Properties file file:/etc/hudi/conf/hudi-defaults.conf not found. Ignoring to load props file
> 2022-03-02 14:57:00,947 WARN hudi.HoodieSparkSqlWriter$: hoodie table at /tmp/hudi_trips_cow already exists. Deleting existing data & overwriting with new data.
> 2022-03-02 14:57:01,523 WARN metadata.HoodieBackedTableMetadata: Metadata table was not found at path /tmp/hudi_trips_cow/.hoodie/metadata
> 2022-03-02 14:57:10,929 WARN scheduler.TaskSetManager: Lost task 1.0 in stage 15.0 (TID 15) (ip-172-31-47-53.us-east-2.compute.internal executor 2): java.lang.NoSuchMethodError: org.apache.spark.sql.execution.datasources.DataSourceUtils$.createDateRebaseFuncInWrite(Lscala/Enumeration$Value;Ljava/lang/String;)Lscala/Function1;
>     at org.apache.hudi.spark.org.apache.spark.sql.avro.AvroSerializer.<init>(AvroSerializer.scala:64)
>     at org.apache.hudi.spark.org.apache.spark.sql.avro.AvroSerializer.<init>(AvroSerializer.scala:56)
>     at org.apache.hudi.spark.org.apache.spark.sql.avro.HoodieAvroSerializer.<init>(HoodieAvroSerializer.scala:26)
>     at org.apache.spark.sql.adapter.Spark3Adapter.createAvroSerializer(Spark3Adapter.scala:47)
>     at org.apache.hudi.AvroConversionUtils$.$anonfun$createInternalRowToAvroConverter$1(AvroConversionUtils.scala:79)
>     at org.apache.hudi.HoodieSparkUtils$.$anonfun$createRdd$5(HoodieSparkUtils.scala:166)
>     at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
>     at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
>     at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
>     at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:199)
>     at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:63)
>     at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
>     at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
>     at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
>     at org.apache.spark.scheduler.Task.run(Task.scala:131)
>     at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
>     at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
>     at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
>     at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
>     at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
>     at java.lang.Thread.run(Thread.java:750)
> 2022-03-02 14:57:12,923 ERROR scheduler.TaskSetManager: Task 1 in stage 15.0 failed 4 times; aborting job
> org.apache.hudi.exception.HoodieUpsertException: Failed to upsert for commit time 20220302145700945
>     at org.apache.hudi.table.action.commit.BaseWriteHelper.write(BaseWriteHelper.java:64)
>     at org.apache.hudi.table.action.commit.SparkUpsertCommitActionExecutor.execute(SparkUpsertCommitActionExecutor.java:46)
>     at org.apache.hudi.table.HoodieSparkCopyOnWriteTable.upsert(HoodieSparkCopyOnWriteTable.java:121)
>     at org.apache.hudi.table.HoodieSparkCopyOnWriteTable.upsert(HoodieSparkCopyOnWriteTable.java:105)
>     at org.apache.hudi.client.SparkRDDWriteClient.upsert(SparkRDDWriteClient.java:159)
>     at org.apache.hudi.DataSourceUtils.doWriteOperation(DataSourceUtils.java:218)
>     at org.apache.hudi.HoodieSparkSqlWriter$.write(HoodieSparkSqlWriter.scala:289)
>     at org.apache.hudi.DefaultSource.createRelation(DefaultSource.scala:162)
>     at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:45)
>     at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:75)
>     at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:73)
>     at org.apache.spark.sql.execution.command.ExecutedCommandExec.executeCollect(commands.scala:84)
>     at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:110)
>     at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
>     at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
>     at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
>     at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
>     at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
>     at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:110)
>     at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:106)
>     at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:481)
>     at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:82)
>     at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:481)
>     at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:30)
>     at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
>     at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
>     at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
>     at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
>     at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:457)
>     at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:106)
>     at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:93)
>     at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:91)
>     at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:128)
>     at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:848)
>     at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:382)
>     at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:303)
>     at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:239)
>     ... 66 elided
> {code}
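
The {{NoSuchMethodError}} above points at a binary incompatibility: the shaded copy of {{AvroSerializer}} inside the Hudi Spark bundle was compiled against a {{DataSourceUtils$.createDateRebaseFuncInWrite}} overload taking {{(scala.Enumeration$Value, java.lang.String)}} and returning {{scala.Function1}}, which the Spark runtime on this cluster does not expose. That usually means the bundle (here the Spark 3.2 bundle) and the deployed Spark 3.x minor version do not match. A minimal diagnostic sketch that can be pasted into the same failing spark-shell to check this hypothesis; the reflection code below is illustrative only, not part of any fix:

{code:scala}
// Illustrative diagnostic only: list the createDateRebaseFuncInWrite
// overloads the *runtime* Spark actually provides, and compare them with
// the descriptor the bundle was compiled against:
//   (Lscala/Enumeration$Value;Ljava/lang/String;)Lscala/Function1;
println(s"Spark runtime version: ${org.apache.spark.SPARK_VERSION}")

// Scala objects compile to a module class named with a trailing '$'.
val moduleClass =
  Class.forName("org.apache.spark.sql.execution.datasources.DataSourceUtils$")

moduleClass.getDeclaredMethods
  .filter(_.getName == "createDateRebaseFuncInWrite")
  .foreach { m =>
    val params = m.getParameterTypes.map(_.getName).mkString("(", ", ", ")")
    println(s"$params => ${m.getReturnType.getName}")
  }
{code}

If no printed overload matches {{(scala.Enumeration$Value, java.lang.String) => scala.Function1}}, the write fails exactly as in the stack trace, and the likely remedy is aligning the bundle with the runtime (e.g. using the Spark 3.2 bundle only on a Spark 3.2.x cluster) rather than changing the table or the write options.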