[ https://issues.apache.org/jira/browse/SPARK-5707?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14538926#comment-14538926 ]
Nathan McCarthy commented on SPARK-5707: ---------------------------------------- Also throws and error when switching to Java Serialisation Error: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 9.0 failed 4 times, most recent failure: Lost task 0.3 in stage 9.0 (TID 515, qtausc-pphd0152.quantium.com.au.local): java.io.IOException: java.lang.ClassNotFoundException: __wrapper$3$5dd511946bf343e58042f4a7fe748b41.__wrapper$3$5dd511946bf343e58042f4a7fe748b41$SpecificRow$1 at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1156) at org.apache.spark.broadcast.TorrentBroadcast.readBroadcastBlock(TorrentBroadcast.scala:164) at org.apache.spark.broadcast.TorrentBroadcast._value$lzycompute(TorrentBroadcast.scala:64) at org.apache.spark.broadcast.TorrentBroadcast._value(TorrentBroadcast.scala:64) at org.apache.spark.broadcast.TorrentBroadcast.getValue(TorrentBroadcast.scala:87) at org.apache.spark.broadcast.Broadcast.value(Broadcast.scala:70) at org.apache.spark.sql.execution.joins.BroadcastHashJoin$$anonfun$3.apply(BroadcastHashJoin.scala:73) at org.apache.spark.sql.execution.joins.BroadcastHashJoin$$anonfun$3.apply(BroadcastHashJoin.scala:72) at org.apache.spark.rdd.RDD$$anonfun$14.apply(RDD.scala:634) at org.apache.spark.rdd.RDD$$anonfun$14.apply(RDD.scala:634) at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35) at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277) at org.apache.spark.rdd.RDD.iterator(RDD.scala:244) at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35) at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277) at org.apache.spark.rdd.RDD.iterator(RDD.scala:244) at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35) at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277) at org.apache.spark.rdd.RDD.iterator(RDD.scala:244) at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:61) at org.apache.spark.scheduler.Task.run(Task.scala:64) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:203) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) at java.lang.Thread.run(Thread.java:745) Caused by: java.lang.ClassNotFoundException: __wrapper$3$5dd511946bf343e58042f4a7fe748b41.__wrapper$3$5dd511946bf343e58042f4a7fe748b41$SpecificRow$1 at java.net.URLClassLoader$1.run(URLClassLoader.java:366) at java.net.URLClassLoader$1.run(URLClassLoader.java:355) at java.security.AccessController.doPrivileged(Native Method) at java.net.URLClassLoader.findClass(URLClassLoader.java:354) at java.lang.ClassLoader.loadClass(ClassLoader.java:425) at java.lang.ClassLoader.loadClass(ClassLoader.java:358) at java.lang.Class.forName0(Native Method) at java.lang.Class.forName(Class.java:274) at org.apache.spark.serializer.JavaDeserializationStream$$anon$1.resolveClass(JavaSerializer.scala:65) at java.io.ObjectInputStream.readNonProxyDesc(ObjectInputStream.java:1612) at java.io.ObjectInputStream.readClassDesc(ObjectInputStream.java:1517) at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:1771) at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1350) at java.io.ObjectInputStream.readObject(ObjectInputStream.java:370) at java.util.HashMap.readObject(HashMap.java:1179) at sun.reflect.GeneratedMethodAccessor26.invoke(Unknown Source) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:606) at java.io.ObjectStreamClass.invokeReadObject(ObjectStreamClass.java:1017) at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:1893) at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:1798) at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1350) at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:1990) at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:1915) at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:1798) at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1350) at java.io.ObjectInputStream.readObject(ObjectInputStream.java:370) at org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:68) at org.apache.spark.broadcast.TorrentBroadcast$.unBlockifyObject(TorrentBroadcast.scala:216) at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$readBroadcastBlock$1.apply(TorrentBroadcast.scala:177) at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1153) ... 24 more Driver stacktrace: SQLState: null ErrorCode: 0 > Enabling spark.sql.codegen throws ClassNotFound exception > --------------------------------------------------------- > > Key: SPARK-5707 > URL: https://issues.apache.org/jira/browse/SPARK-5707 > Project: Spark > Issue Type: Bug > Components: SQL > Affects Versions: 1.2.0, 1.3.1 > Environment: yarn-client mode, spark.sql.codegen=true > Reporter: Yi Yao > Priority: Blocker > > Exception thrown: > {noformat} > org.apache.spark.SparkException: Job aborted due to stage failure: Task 13 in > stage 133.0 failed 4 times, most recent failure: Lost task 13.3 in stage > 133.0 (TID 3066, cdh52-node2): java.io.IOException: > com.esotericsoftware.kryo.KryoException: Unable to find class: > __wrapper$1$81257352e1c844aebf09cb84fe9e7459.__wrapper$1$81257352e1c844aebf09cb84fe9e7459$SpecificRow$1 > Serialization trace: > hashTable (org.apache.spark.sql.execution.joins.UniqueKeyHashedRelation) > at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1011) > at > org.apache.spark.broadcast.TorrentBroadcast.readBroadcastBlock(TorrentBroadcast.scala:164) > at > org.apache.spark.broadcast.TorrentBroadcast._value$lzycompute(TorrentBroadcast.scala:64) > at > org.apache.spark.broadcast.TorrentBroadcast._value(TorrentBroadcast.scala:64) > at > org.apache.spark.broadcast.TorrentBroadcast.getValue(TorrentBroadcast.scala:87) > at org.apache.spark.broadcast.Broadcast.value(Broadcast.scala:70) > at > org.apache.spark.sql.execution.joins.BroadcastHashJoin$$anonfun$3.apply(BroadcastHashJoin.scala:62) > at > org.apache.spark.sql.execution.joins.BroadcastHashJoin$$anonfun$3.apply(BroadcastHashJoin.scala:61) > at org.apache.spark.rdd.RDD$$anonfun$13.apply(RDD.scala:601) > at org.apache.spark.rdd.RDD$$anonfun$13.apply(RDD.scala:601) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:263) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:230) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:263) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:230) > at org.apache.spark.rdd.MappedRDD.compute(MappedRDD.scala:31) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:263) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:230) > at org.apache.spark.rdd.CartesianRDD.compute(CartesianRDD.scala:75) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:263) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:230) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:263) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:230) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:263) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:230) > at org.apache.spark.rdd.MappedRDD.compute(MappedRDD.scala:31) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:263) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:230) > at org.apache.spark.rdd.CartesianRDD.compute(CartesianRDD.scala:75) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:263) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:230) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:263) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:230) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:263) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:230) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:263) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:230) > at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:61) > at org.apache.spark.scheduler.Task.run(Task.scala:56) > at > org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:196) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) > at java.lang.Thread.run(Thread.java:745) > {noformat} > SQL: > {code:sql} > INSERT INTO TABLE ${hiveconf:TEMP_TABLE} > SELECT > s_store_name, > pr_review_date, > pr_review_content > FROM ( > --select store_name for stores with flat or declining sales in 3 > consecutive months. > SELECT s_store_name > FROM store s > JOIN ( > -- linear regression part > SELECT > temp.cat AS cat, > --SUM(temp.x)as sumX, > --SUM(temp.y)as sumY, > --SUM(temp.xy)as sumXY, > --SUM(temp.xx)as sumXSquared, > --count(temp.x) as N, > --N * sumXY - sumX * sumY AS numerator, > --N * sumXSquared - sumX*sumX AS denom > --numerator / denom as slope, > --(sumY - slope * sumX) / N as intercept > --(count(temp.x) * SUM(temp.xy) - SUM(temp.x) * SUM(temp.y)) AS > numerator, > --(count(temp.x) * SUM(temp.xx) - SUM(temp.x) * SUM(temp.x)) AS denom > --numerator / denom as slope, > --(sumY - slope * sumX) / N as intercept > ((count(temp.x) * SUM(temp.xy) - SUM(temp.x) * SUM(temp.y)) / > (count(temp.x) * SUM(temp.xx) - SUM(temp.x) * SUM(temp.x)) ) as slope, > (SUM(temp.y) - ((count(temp.x) * SUM(temp.xy) - SUM(temp.x) * > SUM(temp.y)) / (count(temp.x) * SUM(temp.xx) - SUM(temp.x) * SUM(temp.x)) ) * > SUM(temp.x)) / count(temp.x) as intercept > FROM ( > SELECT > s.ss_store_sk AS cat, > s.ss_sold_date_sk AS x, > SUM(s.ss_net_paid) AS y, > s.ss_sold_date_sk * SUM(s.ss_net_paid) AS xy, > s.ss_sold_date_sk*s.ss_sold_date_sk AS xx > FROM store_sales s > --select date range > LEFT SEMI JOIN ( > SELECT d_date_sk > FROM date_dim d > WHERE d.d_date >= '${hiveconf:q18_startDate}' > AND d.d_date <= '${hiveconf:q18_endDate}' > ) dd ON ( s.ss_sold_date_sk=dd.d_date_sk ) > WHERE s.ss_store_sk <= 18 > GROUP BY s.ss_store_sk, s.ss_sold_date_sk > ) temp > GROUP BY temp.cat > ) c on s.s_store_sk = c.cat > WHERE c.slope < 0 > ) tmp > JOIN product_reviews pr on (true) > WHERE instr(pr.pr_review_content, tmp.s_store_name) > 0 > {code} -- This message was sent by Atlassian JIRA (v6.3.4#6332) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org