hudi-bot opened a new issue, #15566:
URL: https://github.com/apache/hudi/issues/15566
When schema evolution is enabled, writing data to the table fails with an ArrayIndexOutOfBoundsException.
## Environment Description
- Hudi version : master
- Spark version : 3.2.2
- Storage (HDFS/S3/GCS..) : localhost
Steps to reproduce:
{code:sql}
CREATE TABLE default.hudi_mor_pt_002 (
  `key` BIGINT,
  `A1` STRING,
  `A2` STRING,
  `A3` STRING)
USING hudi
TBLPROPERTIES (
  'preCombineField' = 'key',
  'primaryKey' = 'key',
  'type' = 'mor');

set hoodie.schema.on.read.enable=true;

insert into default.hudi_mor_pt_002 select 1 as key, 'A' as A1, 'A' as A2, 'A' as A3;
{code}
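For reference, the same statements can also be driven from a Scala SparkSession instead of the SQL shell. This is only a minimal sketch: the local master, the Kryo serializer setting, and the assumption that a matching hudi-spark3.2 bundle is already on the classpath are choices of this example, not part of the report.

{code:scala}
// Minimal sketch of the reproduction above from a Scala session.
// Assumes the Hudi Spark 3.2 bundle is on the classpath.
import org.apache.spark.sql.SparkSession

object ReproSchemaOnReadInsert {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("hudi-schema-on-read-repro")
      .master("local[2]")
      // Standard Hudi Spark SQL setup: session extension plus Hudi catalog.
      .config("spark.sql.extensions", "org.apache.spark.sql.hudi.HoodieSparkSessionExtension")
      .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.hudi.catalog.HoodieCatalog")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .getOrCreate()

    spark.sql(
      """CREATE TABLE default.hudi_mor_pt_002 (
        |  `key` BIGINT, `A1` STRING, `A2` STRING, `A3` STRING)
        |USING hudi
        |TBLPROPERTIES ('preCombineField' = 'key', 'primaryKey' = 'key', 'type' = 'mor')""".stripMargin)

    // Enabling schema-on-read is the setting this report is about.
    spark.sql("set hoodie.schema.on.read.enable=true")
    spark.sql("insert into default.hudi_mor_pt_002 select 1 as key, 'A' as A1, 'A' as A2, 'A' as A3")

    spark.stop()
  }
}
{code}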
Stack trace:
{code:java}
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 7.0 failed 1 times, most recent failure: Lost task 0.0 in stage 7.0 (TID 5) (172.20.10.4 executor driver): java.lang.RuntimeException: Error while encoding: java.lang.ArrayIndexOutOfBoundsException: 4
if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, validateexternaltype(getexternalrowfield(assertnotnull(input[0, org.apache.spark.sql.Row, true]), 0, _hoodie_commit_time), StringType), true, false, true) AS _hoodie_commit_time#103
if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, validateexternaltype(getexternalrowfield(assertnotnull(input[0, org.apache.spark.sql.Row, true]), 1, _hoodie_commit_seqno), StringType), true, false, true) AS _hoodie_commit_seqno#104
if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, validateexternaltype(getexternalrowfield(assertnotnull(input[0, org.apache.spark.sql.Row, true]), 2, _hoodie_record_key), StringType), true, false, true) AS _hoodie_record_key#105
if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, validateexternaltype(getexternalrowfield(assertnotnull(input[0, org.apache.spark.sql.Row, true]), 3, _hoodie_partition_path), StringType), true, false, true) AS _hoodie_partition_path#106
if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, validateexternaltype(getexternalrowfield(assertnotnull(input[0, org.apache.spark.sql.Row, true]), 4, _hoodie_file_name), StringType), true, false, true) AS _hoodie_file_name#107
if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else validateexternaltype(getexternalrowfield(assertnotnull(input[0, org.apache.spark.sql.Row, true]), 5, key), LongType) AS key#108L
if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, validateexternaltype(getexternalrowfield(assertnotnull(input[0, org.apache.spark.sql.Row, true]), 6, A1), StringType), true, false, true) AS A1#109
if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, validateexternaltype(getexternalrowfield(assertnotnull(input[0, org.apache.spark.sql.Row, true]), 7, A2), StringType), true, false, true) AS A2#110
if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, validateexternaltype(getexternalrowfield(assertnotnull(input[0, org.apache.spark.sql.Row, true]), 8, A3), StringType), true, false, true) AS A3#111
    at org.apache.spark.sql.errors.QueryExecutionErrors$.expressionEncodingError(QueryExecutionErrors.scala:1052)
    at org.apache.spark.sql.catalyst.encoders.ExpressionEncoder$Serializer.apply(ExpressionEncoder.scala:210)
    at org.apache.spark.sql.catalyst.encoders.ExpressionEncoder$Serializer.apply(ExpressionEncoder.scala:193)
    at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
    at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
    at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
    at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:759)
    at scala.collection.Iterator.isEmpty(Iterator.scala:387)
    at scala.collection.Iterator.isEmpty$(Iterator.scala:387)
    at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.isEmpty(WholeStageCodegenExec.scala:757)
    at org.apache.hudi.HoodieSparkUtils$.$anonfun$createRdd$2(HoodieSparkUtils.scala:103)
    at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:863)
    at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:863)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
    at org.apache.spark.sql.execution.SQLConfInjectingRDD.compute(SQLConfInjectingRDD.scala:58)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
    at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
    at org.apache.spark.scheduler.Task.run(Task.scala:131)
    at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1491)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
    at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ArrayIndexOutOfBoundsException: 4
    at org.apache.spark.sql.catalyst.expressions.GenericRow.get(rows.scala:174)
    at org.apache.spark.sql.Row.isNullAt(Row.scala:214)
    at org.apache.spark.sql.Row.isNullAt$(Row.scala:214)
    at org.apache.spark.sql.catalyst.expressions.GenericRow.isNullAt(rows.scala:166)
    at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.writeFields_0_2$(Unknown Source)
    at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)
    at org.apache.spark.sql.catalyst.encoders.ExpressionEncoder$Serializer.apply(ExpressionEncoder.scala:207)
    ... 33 more

Driver stacktrace:
    at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2454)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2403)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2402)
    at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
    at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
    at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2402)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1160)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1160)
    at scala.Option.foreach(Option.scala:407)
    at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1160)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2642)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2584)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2573)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
    at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:938)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2214)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2235)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2254)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2279)
    at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1030)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
    at org.apache.spark.rdd.RDD.collect(RDD.scala:1029)
    at org.apache.spark.api.java.JavaRDDLike.collect(JavaRDDLike.scala:362)
    at org.apache.spark.api.java.JavaRDDLike.collect$(JavaRDDLike.scala:361)
    at org.apache.spark.api.java.AbstractJavaRDDLike.collect(JavaRDDLike.scala:45)
    at org.apache.hudi.data.HoodieJavaRDD.collectAsList(HoodieJavaRDD.java:155)
    at org.apache.hudi.index.simple.HoodieSimpleIndex.fetchRecordLocationsForAffectedPartitions(HoodieSimpleIndex.java:142)
    at org.apache.hudi.index.simple.HoodieSimpleIndex.tagLocationInternal(HoodieSimpleIndex.java:113)
    at org.apache.hudi.index.simple.HoodieSimpleIndex.tagLocation(HoodieSimpleIndex.java:91)
    at org.apache.hudi.table.action.commit.HoodieWriteHelper.tag(HoodieWriteHelper.java:51)
    at org.apache.hudi.table.action.commit.HoodieWriteHelper.tag(HoodieWriteHelper.java:34)
    at org.apache.hudi.table.action.commit.BaseWriteHelper.write(BaseWriteHelper.java:53)
    ... 99 more
Caused by: java.lang.RuntimeException: Error while encoding: java.lang.ArrayIndexOutOfBoundsException: 4
{code}
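Mechanically, the encoder in the trace above was generated for the full write schema (the five `_hoodie_*` metadata columns plus `key`, `A1`, `A2`, `A3`), so it probes field indexes beyond 3; `ArrayIndexOutOfBoundsException: 4` therefore indicates that the `GenericRow` handed to it carries fewer values than the schema expects. The sketch below reproduces that failure mode in isolation, with an illustrative five-column schema and a four-element row; it only demonstrates the mechanics of the exception and is not a claim about the actual root cause of HUDI-5204.

{code:scala}
// Minimal sketch of the failure mode seen above: an encoder built for a wider
// schema is applied to a Row with fewer values. The schema and values here are
// illustrative only.
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.types.{StringType, StructField, StructType}

object EncoderWidthMismatchSketch {
  def main(args: Array[String]): Unit = {
    // The encoder expects five string fields (indexes 0..4).
    val wideSchema = StructType(Seq(
      StructField("_hoodie_commit_time", StringType),
      StructField("_hoodie_commit_seqno", StringType),
      StructField("_hoodie_record_key", StringType),
      StructField("_hoodie_partition_path", StringType),
      StructField("_hoodie_file_name", StringType)))
    val serializer = RowEncoder(wideSchema).createSerializer()

    // The row only carries four values, so the generated projection's
    // isNullAt(4) check reaches GenericRow.get(4) and throws
    // java.lang.ArrayIndexOutOfBoundsException: 4, as in the trace above.
    val shortRow = Row("20221110120000", "0_0_1", "1", "")
    serializer(shortRow)
  }
}
{code}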
## JIRA info
- Link: https://issues.apache.org/jira/browse/HUDI-5204
- Type: Bug