[ https://issues.apache.org/jira/browse/HIVE-16823?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16137991#comment-16137991 ]
liyunzhang_intel commented on HIVE-16823: ----------------------------------------- Updated the changes to the q*.out files in HIVE-16823.1.patch. Most of the changes look like the following. This is because we removed ConstantPropagate in SparkCompiler#runDynamicPartitionPruning. {code} Map Operator Tree: TableScan alias: srcpart_date - filterExpr: ((date = '2008-04-08') and ds is not null) (type: boolean) + filterExpr: ((date = '2008-04-08') and ds is not null and true) (type: boolean) Statistics: Num rows: 2 Data size: 42 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: ((date = '2008-04-08') and ds is not null) (type: boolean) + predicate: ((date = '2008-04-08') and ds is not null and true) (type: boolean) Statistics: Num rows: 1 Data size: 21 Basic stats: COMPLETE Column stats: NONE {code} There are big changes in spark_vectorized_dynamic_partition_pruning.q.out, as this file has not been updated for a long time. > "ArrayIndexOutOfBoundsException" in > spark_vectorized_dynamic_partition_pruning.q > -------------------------------------------------------------------------------- > > Key: HIVE-16823 > URL: https://issues.apache.org/jira/browse/HIVE-16823 > Project: Hive > Issue Type: Bug > Reporter: Jianguo Tian > Assignee: liyunzhang_intel > Attachments: explain.spark, explain.tez, HIVE-16823.patch > > > spark_vectorized_dynamic_partition_pruning.q > {code} > set hive.optimize.ppd=true; > set hive.ppd.remove.duplicatefilters=true; > set hive.spark.dynamic.partition.pruning=true; > set hive.optimize.metadataonly=false; > set hive.optimize.index.filter=true; > set hive.vectorized.execution.enabled=true; > set hive.strict.checks.cartesian.product=false; > -- parent is reduce tasks > select count(*) from srcpart join (select ds as ds, ds as `date` from srcpart > group by ds) s on (srcpart.ds = s.ds) where s.`date` = '2008-04-08'; > {code} > The exceptions are as follows: > {code} > 2017-06-05T09:20:31,468 ERROR [Executor task launch worker-0] > spark.SparkReduceRecordHandler: Fatal error: 
> org.apache.hadoop.hive.ql.metadata.HiveException: Error while processing > vector batch (tag=0) Column vector types: 0:BYTES, 1:BYTES > ["2008-04-08", "2008-04-08"] > org.apache.hadoop.hive.ql.metadata.HiveException: Error while processing > vector batch (tag=0) Column vector types: 0:BYTES, 1:BYTES > ["2008-04-08", "2008-04-08"] > at > org.apache.hadoop.hive.ql.exec.spark.SparkReduceRecordHandler.processVectors(SparkReduceRecordHandler.java:413) > ~[hive-exec-3.0.0-SNAPSHOT.jar:3.0.0-SNAPSHOT] > at > org.apache.hadoop.hive.ql.exec.spark.SparkReduceRecordHandler.processRow(SparkReduceRecordHandler.java:301) > ~[hive-exec-3.0.0-SNAPSHOT.jar:3.0.0-SNAPSHOT] > at > org.apache.hadoop.hive.ql.exec.spark.HiveReduceFunctionResultList.processNextRecord(HiveReduceFunctionResultList.java:54) > ~[hive-exec-3.0.0-SNAPSHOT.jar:3.0.0-SNAPSHOT] > at > org.apache.hadoop.hive.ql.exec.spark.HiveReduceFunctionResultList.processNextRecord(HiveReduceFunctionResultList.java:28) > ~[hive-exec-3.0.0-SNAPSHOT.jar:3.0.0-SNAPSHOT] > at > org.apache.hadoop.hive.ql.exec.spark.HiveBaseFunctionResultList.hasNext(HiveBaseFunctionResultList.java:85) > ~[hive-exec-3.0.0-SNAPSHOT.jar:3.0.0-SNAPSHOT] > at > scala.collection.convert.Wrappers$JIteratorWrapper.hasNext(Wrappers.scala:42) > ~[scala-library-2.11.8.jar:?] > at scala.collection.Iterator$class.foreach(Iterator.scala:893) > ~[scala-library-2.11.8.jar:?] > at scala.collection.AbstractIterator.foreach(Iterator.scala:1336) > ~[scala-library-2.11.8.jar:?] 
> at > org.apache.spark.rdd.AsyncRDDActions$$anonfun$foreachAsync$1$$anonfun$apply$12.apply(AsyncRDDActions.scala:127) > ~[spark-core_2.11-2.0.0.jar:2.0.0] > at > org.apache.spark.rdd.AsyncRDDActions$$anonfun$foreachAsync$1$$anonfun$apply$12.apply(AsyncRDDActions.scala:127) > ~[spark-core_2.11-2.0.0.jar:2.0.0] > at > org.apache.spark.SparkContext$$anonfun$33.apply(SparkContext.scala:1974) > ~[spark-core_2.11-2.0.0.jar:2.0.0] > at > org.apache.spark.SparkContext$$anonfun$33.apply(SparkContext.scala:1974) > ~[spark-core_2.11-2.0.0.jar:2.0.0] > at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70) > ~[spark-core_2.11-2.0.0.jar:2.0.0] > at org.apache.spark.scheduler.Task.run(Task.scala:85) > ~[spark-core_2.11-2.0.0.jar:2.0.0] > at > org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274) > ~[spark-core_2.11-2.0.0.jar:2.0.0] > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) > [?:1.8.0_112] > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) > [?:1.8.0_112] > at java.lang.Thread.run(Thread.java:745) [?:1.8.0_112] > Caused by: java.lang.ArrayIndexOutOfBoundsException: 1 > at > org.apache.hadoop.hive.ql.exec.vector.VectorGroupKeyHelper.copyGroupKey(VectorGroupKeyHelper.java:107) > ~[hive-exec-3.0.0-SNAPSHOT.jar:3.0.0-SNAPSHOT] > at > org.apache.hadoop.hive.ql.exec.vector.VectorGroupByOperator$ProcessingModeReduceMergePartial.doProcessBatch(VectorGroupByOperator.java:832) > ~[hive-exec-3.0.0-SNAPSHOT.jar:3.0.0-SNAPSHOT] > at > org.apache.hadoop.hive.ql.exec.vector.VectorGroupByOperator$ProcessingModeBase.processBatch(VectorGroupByOperator.java:179) > ~[hive-exec-3.0.0-SNAPSHOT.jar:3.0.0-SNAPSHOT] > at > org.apache.hadoop.hive.ql.exec.vector.VectorGroupByOperator.process(VectorGroupByOperator.java:1035) > ~[hive-exec-3.0.0-SNAPSHOT.jar:3.0.0-SNAPSHOT] > at > org.apache.hadoop.hive.ql.exec.spark.SparkReduceRecordHandler.processVectors(SparkReduceRecordHandler.java:400) > 
~[hive-exec-3.0.0-SNAPSHOT.jar:3.0.0-SNAPSHOT] > ... 17 more > 2017-06-05T09:20:31,472 ERROR [Executor task launch worker-0] > executor.Executor: Exception in task 2.0 in stage 1.0 (TID 8) > java.lang.RuntimeException: org.apache.hadoop.hive.ql.metadata.HiveException: > Error while processing vector batch (tag=0) Column vector types: 0:BYTES, > 1:BYTES > ["2008-04-08", "2008-04-08"] > at > org.apache.hadoop.hive.ql.exec.spark.SparkReduceRecordHandler.processRow(SparkReduceRecordHandler.java:315) > ~[hive-exec-3.0.0-SNAPSHOT.jar:3.0.0-SNAPSHOT] > at > org.apache.hadoop.hive.ql.exec.spark.HiveReduceFunctionResultList.processNextRecord(HiveReduceFunctionResultList.java:54) > ~[hive-exec-3.0.0-SNAPSHOT.jar:3.0.0-SNAPSHOT] > at > org.apache.hadoop.hive.ql.exec.spark.HiveReduceFunctionResultList.processNextRecord(HiveReduceFunctionResultList.java:28) > ~[hive-exec-3.0.0-SNAPSHOT.jar:3.0.0-SNAPSHOT] > at > org.apache.hadoop.hive.ql.exec.spark.HiveBaseFunctionResultList.hasNext(HiveBaseFunctionResultList.java:85) > ~[hive-exec-3.0.0-SNAPSHOT.jar:3.0.0-SNAPSHOT] > at > scala.collection.convert.Wrappers$JIteratorWrapper.hasNext(Wrappers.scala:42) > ~[scala-library-2.11.8.jar:?] > at scala.collection.Iterator$class.foreach(Iterator.scala:893) > ~[scala-library-2.11.8.jar:?] > at scala.collection.AbstractIterator.foreach(Iterator.scala:1336) > ~[scala-library-2.11.8.jar:?] 
> at > org.apache.spark.rdd.AsyncRDDActions$$anonfun$foreachAsync$1$$anonfun$apply$12.apply(AsyncRDDActions.scala:127) > ~[spark-core_2.11-2.0.0.jar:2.0.0] > at > org.apache.spark.rdd.AsyncRDDActions$$anonfun$foreachAsync$1$$anonfun$apply$12.apply(AsyncRDDActions.scala:127) > ~[spark-core_2.11-2.0.0.jar:2.0.0] > at > org.apache.spark.SparkContext$$anonfun$33.apply(SparkContext.scala:1974) > ~[spark-core_2.11-2.0.0.jar:2.0.0] > at > org.apache.spark.SparkContext$$anonfun$33.apply(SparkContext.scala:1974) > ~[spark-core_2.11-2.0.0.jar:2.0.0] > at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70) > ~[spark-core_2.11-2.0.0.jar:2.0.0] > at org.apache.spark.scheduler.Task.run(Task.scala:85) > ~[spark-core_2.11-2.0.0.jar:2.0.0] > at > org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274) > ~[spark-core_2.11-2.0.0.jar:2.0.0] > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) > [?:1.8.0_112] > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) > [?:1.8.0_112] > at java.lang.Thread.run(Thread.java:745) [?:1.8.0_112] > Caused by: org.apache.hadoop.hive.ql.metadata.HiveException: Error while > processing vector batch (tag=0) Column vector types: 0:BYTES, 1:BYTES > ["2008-04-08", "2008-04-08"] > at > org.apache.hadoop.hive.ql.exec.spark.SparkReduceRecordHandler.processVectors(SparkReduceRecordHandler.java:413) > ~[hive-exec-3.0.0-SNAPSHOT.jar:3.0.0-SNAPSHOT] > at > org.apache.hadoop.hive.ql.exec.spark.SparkReduceRecordHandler.processRow(SparkReduceRecordHandler.java:301) > ~[hive-exec-3.0.0-SNAPSHOT.jar:3.0.0-SNAPSHOT] > ... 
16 more > Caused by: java.lang.ArrayIndexOutOfBoundsException: 1 > at > org.apache.hadoop.hive.ql.exec.vector.VectorGroupKeyHelper.copyGroupKey(VectorGroupKeyHelper.java:107) > ~[hive-exec-3.0.0-SNAPSHOT.jar:3.0.0-SNAPSHOT] > at > org.apache.hadoop.hive.ql.exec.vector.VectorGroupByOperator$ProcessingModeReduceMergePartial.doProcessBatch(VectorGroupByOperator.java:832) > ~[hive-exec-3.0.0-SNAPSHOT.jar:3.0.0-SNAPSHOT] > at > org.apache.hadoop.hive.ql.exec.vector.VectorGroupByOperator$ProcessingModeBase.processBatch(VectorGroupByOperator.java:179) > ~[hive-exec-3.0.0-SNAPSHOT.jar:3.0.0-SNAPSHOT] > at > org.apache.hadoop.hive.ql.exec.vector.VectorGroupByOperator.process(VectorGroupByOperator.java:1035) > ~[hive-exec-3.0.0-SNAPSHOT.jar:3.0.0-SNAPSHOT] > at > org.apache.hadoop.hive.ql.exec.spark.SparkReduceRecordHandler.processVectors(SparkReduceRecordHandler.java:400) > ~[hive-exec-3.0.0-SNAPSHOT.jar:3.0.0-SNAPSHOT] > at > org.apache.hadoop.hive.ql.exec.spark.SparkReduceRecordHandler.processRow(SparkReduceRecordHandler.java:301) > ~[hive-exec-3.0.0-SNAPSHOT.jar:3.0.0-SNAPSHOT] > ... 
16 more > 2017-06-05T09:20:31,488 DEBUG [dispatcher-event-loop-2] > scheduler.TaskSchedulerImpl: parentName: , name: TaskSet_1, runningTasks: 0 > 2017-06-05T09:20:31,493 WARN [task-result-getter-0] > scheduler.TaskSetManager: Lost task 2.0 in stage 1.0 (TID 8, localhost): > java.lang.RuntimeException: org.apache.hadoop.hive.ql.metadata.HiveException: > Error while processing vector batch (tag=0) Column vector types: 0:BYTES, > 1:BYTES > ["2008-04-08", "2008-04-08"] > at > org.apache.hadoop.hive.ql.exec.spark.SparkReduceRecordHandler.processRow(SparkReduceRecordHandler.java:315) > at > org.apache.hadoop.hive.ql.exec.spark.HiveReduceFunctionResultList.processNextRecord(HiveReduceFunctionResultList.java:54) > at > org.apache.hadoop.hive.ql.exec.spark.HiveReduceFunctionResultList.processNextRecord(HiveReduceFunctionResultList.java:28) > at > org.apache.hadoop.hive.ql.exec.spark.HiveBaseFunctionResultList.hasNext(HiveBaseFunctionResultList.java:85) > at > scala.collection.convert.Wrappers$JIteratorWrapper.hasNext(Wrappers.scala:42) > at scala.collection.Iterator$class.foreach(Iterator.scala:893) > at scala.collection.AbstractIterator.foreach(Iterator.scala:1336) > at > org.apache.spark.rdd.AsyncRDDActions$$anonfun$foreachAsync$1$$anonfun$apply$12.apply(AsyncRDDActions.scala:127) > at > org.apache.spark.rdd.AsyncRDDActions$$anonfun$foreachAsync$1$$anonfun$apply$12.apply(AsyncRDDActions.scala:127) > at > org.apache.spark.SparkContext$$anonfun$33.apply(SparkContext.scala:1974) > at > org.apache.spark.SparkContext$$anonfun$33.apply(SparkContext.scala:1974) > at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70) > at org.apache.spark.scheduler.Task.run(Task.scala:85) > at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) > at java.lang.Thread.run(Thread.java:745) > 
Caused by: org.apache.hadoop.hive.ql.metadata.HiveException: Error while > processing vector batch (tag=0) Column vector types: 0:BYTES, 1:BYTES > ["2008-04-08", "2008-04-08"] > at > org.apache.hadoop.hive.ql.exec.spark.SparkReduceRecordHandler.processVectors(SparkReduceRecordHandler.java:413) > at > org.apache.hadoop.hive.ql.exec.spark.SparkReduceRecordHandler.processRow(SparkReduceRecordHandler.java:301) > ... 16 more > Caused by: java.lang.ArrayIndexOutOfBoundsException: 1 > at > org.apache.hadoop.hive.ql.exec.vector.VectorGroupKeyHelper.copyGroupKey(VectorGroupKeyHelper.java:107) > at > org.apache.hadoop.hive.ql.exec.vector.VectorGroupByOperator$ProcessingModeReduceMergePartial.doProcessBatch(VectorGroupByOperator.java:832) > at > org.apache.hadoop.hive.ql.exec.vector.VectorGroupByOperator$ProcessingModeBase.processBatch(VectorGroupByOperator.java:179) > at > org.apache.hadoop.hive.ql.exec.vector.VectorGroupByOperator.process(VectorGroupByOperator.java:1035) > at > org.apache.hadoop.hive.ql.exec.spark.SparkReduceRecordHandler.processVectors(SparkReduceRecordHandler.java:400) > ... 17 more > 2017-06-05T09:20:31,495 ERROR [task-result-getter-0] > scheduler.TaskSetManager: Task 2 in stage 1.0 failed 1 times; aborting job > {code} > This exception happens in this line of VectorGroupKeyHelper.java: > {code} > BytesColumnVector outputColumnVector = (BytesColumnVector) > outputBatch.cols[columnIndex]; > {code} -- This message was sent by Atlassian JIRA (v6.4.14#64029)