[jira] [Updated] (SPARK-10159) Hive 1.3.x GenericUDFDate NPE issue
[ https://issues.apache.org/jira/browse/SPARK-10159?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Sean Owen updated SPARK-10159: -- Assignee: Michael Armbrust > Hive 1.3.x GenericUDFDate NPE issue > --- > > Key: SPARK-10159 > URL: https://issues.apache.org/jira/browse/SPARK-10159 > Project: Spark > Issue Type: Bug > Components: SQL >Affects Versions: 1.4.0 >Reporter: Alex Liu >Assignee: Michael Armbrust > Fix For: 1.5.0 > > > When running a SQL query with HiveContext, Hive 1.3.x GenericUDFDate throws a NullPointerException. > The query and the resulting log are shown below. > {code} > SELECT a.stationid AS stationid, > a.month AS month, > a.year AS year, > AVG(a.mean) AS mean, > MIN(a.min) AS min, > MAX(a.max) AS max > FROM > (SELECT *, > YEAR(date) AS year, > MONTH(date) AS month, > FROM_UNIXTIME(UNIX_TIMESTAMP(TO_DATE(date), '-MM-dd'), 'E') AS > weekday >FROM weathercql.daily) a > WHERE ((a.weekday = 'Mon')) > AND (a.metric = 'temperature') > GROUP BY a.stationid, a.month, a.year > ORDER BY stationid, year, month > LIMIT 100 > {code} > log {code} > Filter > ((HiveSimpleUdf#org.apache.hadoop.hive.ql.udf.UDFFromUnixTime(HiveGenericUdf#org.apache.hadoop.hive.ql.udf.generic.GenericUDFUnixTimeStamp(HiveGenericUdf#org.apache.hadoop.hive.ql.udf.generic.GenericUDFDate(date#81),-MM-dd),E) > = Mon) && (metric#80 = temperature)) > ERROR 2015-08-20 15:39:06 > org.apache.spark.sql.hive.thriftserver.SparkExecuteStatementOperation: Error > executing query: > org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in > stage 2.0 failed 4 times, most recent failure: Lost task 1.3 in stage 2.0 > (TID 208, 127.0.0.1): java.lang.NullPointerException > at > org.apache.hadoop.hive.ql.udf.generic.GenericUDFDate.evaluate(GenericUDFDate.java:119) > at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188) > at > org.apache.spark.sql.hive.HiveGenericUdf$$anonfun$eval$2.apply(hiveUdfs.scala:184) > at > org.apache.spark.sql.hive.DeferredObjectAdapter.get(hiveUdfs.scala:138) > at 
> org.apache.hadoop.hive.ql.udf.generic.GenericUDFToUnixTimeStamp.evaluate(GenericUDFToUnixTimeStamp.java:121) > at > org.apache.hadoop.hive.ql.udf.generic.GenericUDFUnixTimeStamp.evaluate(GenericUDFUnixTimeStamp.java:52) > at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188) > at > org.apache.spark.sql.hive.HiveSimpleUdf$$anonfun$eval$1.apply(hiveUdfs.scala:121) > at > org.apache.spark.sql.hive.HiveSimpleUdf$$anonfun$eval$1.apply(hiveUdfs.scala:121) > at > scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244) > at > scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244) > at scala.collection.immutable.List.foreach(List.scala:318) > at scala.collection.TraversableLike$class.map(TraversableLike.scala:244) > at scala.collection.AbstractTraversable.map(Traversable.scala:105) > at org.apache.spark.sql.hive.HiveSimpleUdf.eval(hiveUdfs.scala:121) > at > org.apache.spark.sql.catalyst.expressions.EqualTo.eval(predicates.scala:191) > at > org.apache.spark.sql.catalyst.expressions.And.eval(predicates.scala:130) > at > org.apache.spark.sql.catalyst.expressions.InterpretedPredicate$$anonfun$create$1.apply(predicates.scala:30) > at > org.apache.spark.sql.catalyst.expressions.InterpretedPredicate$$anonfun$create$1.apply(predicates.scala:30) > at scala.collection.Iterator$$anon$14.hasNext(Iterator.scala:390) > at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327) > at > org.apache.spark.sql.execution.Aggregate$$anonfun$doExecute$1$$anonfun$7.apply(Aggregate.scala:154) > at > org.apache.spark.sql.execution.Aggregate$$anonfun$doExecute$1$$anonfun$7.apply(Aggregate.scala:149) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$17.apply(RDD.scala:686) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$17.apply(RDD.scala:686) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35) > at 
org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:244) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:244) > at > org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:70) > at > org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41) > at org.apache.spark.scheduler.Task.run(Task.scala:70) > at org.apache.spark.executor.Executor$TaskRunner.run(Executor.sc
[jira] [Updated] (SPARK-10159) Hive 1.3.x GenericUDFDate NPE issue
[ https://issues.apache.org/jira/browse/SPARK-10159?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Alex Liu updated SPARK-10159: - Description: When running a SQL query with HiveContext, Hive 1.3.x GenericUDFDate throws a NullPointerException. The query and the resulting log are shown below. {code} SELECT a.stationid AS stationid, a.month AS month, a.year AS year, AVG(a.mean) AS mean, MIN(a.min) AS min, MAX(a.max) AS max FROM (SELECT *, YEAR(date) AS year, MONTH(date) AS month, FROM_UNIXTIME(UNIX_TIMESTAMP(TO_DATE(date), '-MM-dd'), 'E') AS weekday FROM weathercql.daily) a WHERE ((a.weekday = 'Mon')) AND (a.metric = 'temperature') GROUP BY a.stationid, a.month, a.year ORDER BY stationid, year, month LIMIT 100 {code} log {code} Filter ((HiveSimpleUdf#org.apache.hadoop.hive.ql.udf.UDFFromUnixTime(HiveGenericUdf#org.apache.hadoop.hive.ql.udf.generic.GenericUDFUnixTimeStamp(HiveGenericUdf#org.apache.hadoop.hive.ql.udf.generic.GenericUDFDate(date#81),-MM-dd),E) = Mon) && (metric#80 = temperature)) ERROR 2015-08-20 15:39:06 org.apache.spark.sql.hive.thriftserver.SparkExecuteStatementOperation: Error executing query: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 2.0 failed 4 times, most recent failure: Lost task 1.3 in stage 2.0 (TID 208, 127.0.0.1): java.lang.NullPointerException at org.apache.hadoop.hive.ql.udf.generic.GenericUDFDate.evaluate(GenericUDFDate.java:119) at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188) at org.apache.spark.sql.hive.HiveGenericUdf$$anonfun$eval$2.apply(hiveUdfs.scala:184) at org.apache.spark.sql.hive.DeferredObjectAdapter.get(hiveUdfs.scala:138) at org.apache.hadoop.hive.ql.udf.generic.GenericUDFToUnixTimeStamp.evaluate(GenericUDFToUnixTimeStamp.java:121) at org.apache.hadoop.hive.ql.udf.generic.GenericUDFUnixTimeStamp.evaluate(GenericUDFUnixTimeStamp.java:52) at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188) at 
org.apache.spark.sql.hive.HiveSimpleUdf$$anonfun$eval$1.apply(hiveUdfs.scala:121) at org.apache.spark.sql.hive.HiveSimpleUdf$$anonfun$eval$1.apply(hiveUdfs.scala:121) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244) at scala.collection.immutable.List.foreach(List.scala:318) at scala.collection.TraversableLike$class.map(TraversableLike.scala:244) at scala.collection.AbstractTraversable.map(Traversable.scala:105) at org.apache.spark.sql.hive.HiveSimpleUdf.eval(hiveUdfs.scala:121) at org.apache.spark.sql.catalyst.expressions.EqualTo.eval(predicates.scala:191) at org.apache.spark.sql.catalyst.expressions.And.eval(predicates.scala:130) at org.apache.spark.sql.catalyst.expressions.InterpretedPredicate$$anonfun$create$1.apply(predicates.scala:30) at org.apache.spark.sql.catalyst.expressions.InterpretedPredicate$$anonfun$create$1.apply(predicates.scala:30) at scala.collection.Iterator$$anon$14.hasNext(Iterator.scala:390) at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327) at org.apache.spark.sql.execution.Aggregate$$anonfun$doExecute$1$$anonfun$7.apply(Aggregate.scala:154) at org.apache.spark.sql.execution.Aggregate$$anonfun$doExecute$1$$anonfun$7.apply(Aggregate.scala:149) at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$17.apply(RDD.scala:686) at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$17.apply(RDD.scala:686) at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35) at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277) at org.apache.spark.rdd.RDD.iterator(RDD.scala:244) at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35) at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277) at org.apache.spark.rdd.RDD.iterator(RDD.scala:244) at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:70) at 
org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41) at org.apache.spark.scheduler.Task.run(Task.scala:70) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) at java.lang.Thread.run(Thread.java:745) Driver stacktrace: at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1273) ~[spark-core_2.10-1.4.1.1.jar:1.4.1.1] at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.a