[ https://issues.apache.org/jira/browse/HUDI-2109?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17380562#comment-17380562 ]
ASF GitHub Bot commented on HUDI-2109:
--------------------------------------

jkdll commented on pull request #3195:
URL: https://github.com/apache/hudi/pull/3195#issuecomment-879860213

After some analysis, most of these issues turn out to be resolved by upgrading Spark to Spark 3. Updating this code does not seem to be a high priority, since the problem is already fixed in the newer version. Closing the PR.

> AvroConversionHelper does not handle Nulls
> ------------------------------------------
>
>                 Key: HUDI-2109
>                 URL: https://issues.apache.org/jira/browse/HUDI-2109
>             Project: Apache Hudi
>          Issue Type: Bug
>            Reporter: Jake Dalli
>            Priority: Trivial
>              Labels: pull-request-available
>
> Given an Avro schema containing a null-typed field:
> ```
> {
>   "name": "messageKey",
>   "type": "null"
> }
> ```
> When using `org.apache.hudi.utilities.transform.SqlQueryBasedTransformer` with the DeltaStreamer and `AvroKafkaSource`, I get the following error:
> ```
> Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 1.0 failed 4 times, most recent failure: Lost task 0.3 in stage 1.0 (TID 4, ip-10-102-8-124.eu-central-1.compute.internal, executor 1): org.apache.spark.sql.avro.IncompatibleSchemaException: Cannot convert Avro schema to catalyst type because schema at path messageKey is not compatible (avroType = NullType, sqlType = NULL).
> Source Avro Schema: ...
> Target Catalyst type: ...
>     at org.apache.hudi.AvroConversionHelper$.createConverter$1(AvroConversionHelper.scala:265)
>     at org.apache.hudi.AvroConversionHelper$.createConverter$1(AvroConversionHelper.scala:146)
>     at org.apache.hudi.AvroConversionHelper$.createConverterToRow(AvroConversionHelper.scala:273)
>     at org.apache.hudi.AvroConversionUtils$.$anonfun$createDataFrame$1(AvroConversionUtils.scala:42)
>     at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:837)
>     at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:837)
>     at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
>     at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)
>     at org.apache.spark.rdd.RDD.iterator(RDD.scala:313)
>     at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
>     at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)
>     at org.apache.spark.rdd.RDD.iterator(RDD.scala:313)
>     at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
>     at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)
>     at org.apache.spark.rdd.RDD.iterator(RDD.scala:313)
>     at org.apache.spark.sql.execution.SQLExecutionRDD.$anonfun$compute$1(SQLExecutionRDD.scala:52)
>     at org.apache.spark.sql.internal.SQLConf$.withExistingConf(SQLConf.scala:100)
>     at org.apache.spark.sql.execution.SQLExecutionRDD.compute(SQLExecutionRDD.scala:52)
>     at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)
>     at org.apache.spark.rdd.RDD.iterator(RDD.scala:313)
>     at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
>     at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)
>     at org.apache.spark.rdd.RDD.iterator(RDD.scala:313)
>     at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
>     at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)
>     at org.apache.spark.rdd.RDD.iterator(RDD.scala:313)
>     at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
>     at org.apache.spark.scheduler.Task.run(Task.scala:127)
>     at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:444)
>     at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
>     at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:447)
>     at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
>     at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
>     at java.lang.Thread.run(Thread.java:748)
> Driver stacktrace:
>     at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2175)
>     at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2124)
>     at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2123)
>     at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
>     at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
>     at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
>     at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2123)
>     at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:990)
>     at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:990)
>     at scala.Option.foreach(Option.scala:407)
>     at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:990)
>     at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2355)
>     at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2304)
>     at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2293)
>     at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
>     at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:792)
>     at org.apache.spark.SparkContext.runJob(SparkContext.scala:2093)
>     at org.apache.spark.SparkContext.runJob(SparkContext.scala:2114)
>     at org.apache.spark.SparkContext.runJob(SparkContext.scala:2133)
>     at org.apache.spark.rdd.RDD.$anonfun$take$1(RDD.scala:1423)
>     at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
>     at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
>     at org.apache.spark.rdd.RDD.withScope(RDD.scala:388)
>     at org.apache.spark.rdd.RDD.take(RDD.scala:1396)
>     at org.apache.spark.rdd.RDD.$anonfun$isEmpty$1(RDD.scala:1531)
>     at scala.runtime.java8.JFunction0$mcZ$sp.apply(JFunction0$mcZ$sp.java:23)
>     at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
>     at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
>     at org.apache.spark.rdd.RDD.withScope(RDD.scala:388)
>     at org.apache.spark.rdd.RDD.isEmpty(RDD.scala:1531)
>     at org.apache.spark.api.java.JavaRDDLike.isEmpty(JavaRDDLike.scala:544)
>     at org.apache.spark.api.java.JavaRDDLike.isEmpty$(JavaRDDLike.scala:544)
>     at org.apache.spark.api.java.AbstractJavaRDDLike.isEmpty(JavaRDDLike.scala:45)
>     at org.apache.hudi.utilities.deltastreamer.DeltaSync.readFromSource(DeltaSync.java:380)
>     at org.apache.hudi.utilities.deltastreamer.DeltaSync.syncOnce(DeltaSync.java:255)
>     at org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer$DeltaSyncService.lambda$startService$0(HoodieDeltaStreamer.java:587)
>     ... 4 more
> Caused by: org.apache.spark.sql.avro.IncompatibleSchemaException: Cannot convert Avro schema to catalyst type because schema at path routingKey is not compatible (avroType = NullType, sqlType = NULL).
>     at org.apache.hudi.AvroConversionHelper$.createConverter$1(AvroConversionHelper.scala:265)
>     at org.apache.hudi.AvroConversionHelper$.createConverter$1(AvroConversionHelper.scala:146)
>     at org.apache.hudi.AvroConversionHelper$.createConverterToRow(AvroConversionHelper.scala:273)
>     at org.apache.hudi.AvroConversionUtils$.$anonfun$createDataFrame$1(AvroConversionUtils.scala:42)
>     at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:837)
>     at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:837)
>     at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
>     at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)
>     at org.apache.spark.rdd.RDD.iterator(RDD.scala:313)
>     at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
>     at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)
>     at org.apache.spark.rdd.RDD.iterator(RDD.scala:313)
>     at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
>     at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)
>     at org.apache.spark.rdd.RDD.iterator(RDD.scala:313)
>     at org.apache.spark.sql.execution.SQLExecutionRDD.$anonfun$compute$1(SQLExecutionRDD.scala:52)
>     at org.apache.spark.sql.internal.SQLConf$.withExistingConf(SQLConf.scala:100)
>     at org.apache.spark.sql.execution.SQLExecutionRDD.compute(SQLExecutionRDD.scala:52)
>     at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)
>     at org.apache.spark.rdd.RDD.iterator(RDD.scala:313)
>     at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
>     at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)
>     at org.apache.spark.rdd.RDD.iterator(RDD.scala:313)
>     at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
>     at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)
>     at org.apache.spark.rdd.RDD.iterator(RDD.scala:313)
>     at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
>     at org.apache.spark.scheduler.Task.run(Task.scala:127)
>     at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:444)
>     at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
>     at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:447)
>     ... 3 more
> ```
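As a rough illustration of why the Spark 3 upgrade can make the converter patch unnecessary: Spark 3's bundled spark-avro module maps a null-typed Avro field to Catalyst `NullType` instead of rejecting it. The sketch below is not from the original report; the record wrapper, the `payload` field, and the object name are invented for the example, and it assumes `org.apache.spark:spark-avro_2.12:3.x` is on the classpath.

```
import org.apache.avro.Schema
import org.apache.spark.sql.avro.SchemaConverters

object NullFieldSchemaSketch extends App {
  // A record schema containing a null-typed field, like the `messageKey`
  // field from the report. The surrounding record is hypothetical.
  val avroSchema = new Schema.Parser().parse(
    """{
      |  "type": "record",
      |  "name": "Message",
      |  "fields": [
      |    {"name": "messageKey", "type": "null"},
      |    {"name": "payload", "type": "string"}
      |  ]
      |}""".stripMargin)

  // Spark 3's built-in converter maps Avro NULL to Catalyst NullType,
  // where Hudi's AvroConversionHelper.createConverter threw the
  // IncompatibleSchemaException shown in the trace above.
  val sqlType = SchemaConverters.toSqlType(avroSchema).dataType
  println(sqlType)
  // Prints roughly:
  // StructType(StructField(messageKey,NullType,true),
  //            StructField(payload,StringType,false))
}
```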