[ https://issues.apache.org/jira/browse/SPARK-15822?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15331375#comment-15331375 ]
Pete Robbins commented on SPARK-15822: -------------------------------------- and the plan: {noformat} == Parsed Logical Plan == 'Project [unresolvedalias('Origin, None), unresolvedalias('UniqueCarrier, None), 'round((('count * 100) / 'total), 2) AS rank#173] +- Project [Origin#16, UniqueCarrier#8, count#134L, total#97L] +- Join Inner, ((Origin#16 = Origin#155) && (UniqueCarrier#8 = UniqueCarrier#147)) :- Aggregate [Origin#16, UniqueCarrier#8], [Origin#16, UniqueCarrier#8, count(1) AS count#134L] : +- Filter (NOT (Cancelled#21 = 0) && (CancellationCode#22 = A)) : +- Filter (Dest#17 = ORD) : +- Relation[Year#0,Month#1,DayofMonth#2,DayOfWeek#3,DepTime#4,CRSDepTime#5,ArrTime#6,CRSArrTime#7,UniqueCarrier#8,FlightNum#9,TailNum#10,ActualElapsedTime#11,CRSElapsedTime#12,AirTime#13,ArrDelay#14,DepDelay#15,Origin#16,Dest#17,Distance#18,TaxiIn#19,TaxiOut#20,Cancelled#21,CancellationCode#22,Diverted#23,... 5 more fields] csv +- Project [Origin#155, UniqueCarrier#147, count#92L AS total#97L] +- Aggregate [Origin#155, UniqueCarrier#147], [Origin#155, UniqueCarrier#147, count(1) AS count#92L] +- Filter (Dest#156 = ORD) +- Relation[Year#139,Month#140,DayofMonth#141,DayOfWeek#142,DepTime#143,CRSDepTime#144,ArrTime#145,CRSArrTime#146,UniqueCarrier#147,FlightNum#148,TailNum#149,ActualElapsedTime#150,CRSElapsedTime#151,AirTime#152,ArrDelay#153,DepDelay#154,Origin#155,Dest#156,Distance#157,TaxiIn#158,TaxiOut#159,Cancelled#160,CancellationCode#161,Diverted#162,... 
5 more fields] csv == Analyzed Logical Plan == Origin: string, UniqueCarrier: string, rank: double Project [Origin#16, UniqueCarrier#8, round((cast((count#134L * cast(100 as bigint)) as double) / cast(total#97L as double)), 2) AS rank#173] +- Project [Origin#16, UniqueCarrier#8, count#134L, total#97L] +- Join Inner, ((Origin#16 = Origin#155) && (UniqueCarrier#8 = UniqueCarrier#147)) :- Aggregate [Origin#16, UniqueCarrier#8], [Origin#16, UniqueCarrier#8, count(1) AS count#134L] : +- Filter (NOT (Cancelled#21 = 0) && (CancellationCode#22 = A)) : +- Filter (Dest#17 = ORD) : +- Relation[Year#0,Month#1,DayofMonth#2,DayOfWeek#3,DepTime#4,CRSDepTime#5,ArrTime#6,CRSArrTime#7,UniqueCarrier#8,FlightNum#9,TailNum#10,ActualElapsedTime#11,CRSElapsedTime#12,AirTime#13,ArrDelay#14,DepDelay#15,Origin#16,Dest#17,Distance#18,TaxiIn#19,TaxiOut#20,Cancelled#21,CancellationCode#22,Diverted#23,... 5 more fields] csv +- Project [Origin#155, UniqueCarrier#147, count#92L AS total#97L] +- Aggregate [Origin#155, UniqueCarrier#147], [Origin#155, UniqueCarrier#147, count(1) AS count#92L] +- Filter (Dest#156 = ORD) +- Relation[Year#139,Month#140,DayofMonth#141,DayOfWeek#142,DepTime#143,CRSDepTime#144,ArrTime#145,CRSArrTime#146,UniqueCarrier#147,FlightNum#148,TailNum#149,ActualElapsedTime#150,CRSElapsedTime#151,AirTime#152,ArrDelay#153,DepDelay#154,Origin#155,Dest#156,Distance#157,TaxiIn#158,TaxiOut#159,Cancelled#160,CancellationCode#161,Diverted#162,... 
5 more fields] csv == Optimized Logical Plan == Project [Origin#16, UniqueCarrier#8, round((cast((count#134L * 100) as double) / cast(total#97L as double)), 2) AS rank#173] +- Join Inner, ((Origin#16 = Origin#155) && (UniqueCarrier#8 = UniqueCarrier#147)) :- Aggregate [Origin#16, UniqueCarrier#8], [Origin#16, UniqueCarrier#8, count(1) AS count#134L] : +- Project [UniqueCarrier#8, Origin#16] : +- Filter (((((((isnotnull(Origin#16) && isnotnull(UniqueCarrier#8)) && isnotnull(Cancelled#21)) && isnotnull(CancellationCode#22)) && NOT (Cancelled#21 = 0)) && (CancellationCode#22 = A)) && isnotnull(Dest#17)) && (Dest#17 = ORD)) : +- Relation[Year#0,Month#1,DayofMonth#2,DayOfWeek#3,DepTime#4,CRSDepTime#5,ArrTime#6,CRSArrTime#7,UniqueCarrier#8,FlightNum#9,TailNum#10,ActualElapsedTime#11,CRSElapsedTime#12,AirTime#13,ArrDelay#14,DepDelay#15,Origin#16,Dest#17,Distance#18,TaxiIn#19,TaxiOut#20,Cancelled#21,CancellationCode#22,Diverted#23,... 5 more fields] csv +- Aggregate [Origin#155, UniqueCarrier#147], [Origin#155, UniqueCarrier#147, count(1) AS total#97L] +- Project [UniqueCarrier#147, Origin#155] +- Filter (((isnotnull(UniqueCarrier#147) && isnotnull(Origin#155)) && isnotnull(Dest#156)) && (Dest#156 = ORD)) +- Relation[Year#139,Month#140,DayofMonth#141,DayOfWeek#142,DepTime#143,CRSDepTime#144,ArrTime#145,CRSArrTime#146,UniqueCarrier#147,FlightNum#148,TailNum#149,ActualElapsedTime#150,CRSElapsedTime#151,AirTime#152,ArrDelay#153,DepDelay#154,Origin#155,Dest#156,Distance#157,TaxiIn#158,TaxiOut#159,Cancelled#160,CancellationCode#161,Diverted#162,... 
5 more fields] csv == Physical Plan == *Project [Origin#16, UniqueCarrier#8, round((cast((count#134L * 100) as double) / cast(total#97L as double)), 2) AS rank#173] +- *SortMergeJoin [Origin#16, UniqueCarrier#8], [Origin#155, UniqueCarrier#147], Inner :- *Sort [Origin#16 ASC, UniqueCarrier#8 ASC], false, 0 : +- *HashAggregate(key=[Origin#16,UniqueCarrier#8], functions=[count(1)], output=[Origin#16,UniqueCarrier#8,count#134L]) : +- Exchange hashpartitioning(Origin#16, UniqueCarrier#8, 200) : +- *HashAggregate(key=[Origin#16,UniqueCarrier#8], functions=[partial_count(1)], output=[Origin#16,UniqueCarrier#8,count#296L]) : +- *Project [UniqueCarrier#8, Origin#16] : +- *Filter (((((((isnotnull(Origin#16) && isnotnull(UniqueCarrier#8)) && isnotnull(Cancelled#21)) && isnotnull(CancellationCode#22)) && NOT (Cancelled#21 = 0)) && (CancellationCode#22 = A)) && isnotnull(Dest#17)) && (Dest#17 = ORD)) : +- *Scan csv [UniqueCarrier#8,Origin#16,Dest#17,Cancelled#21,CancellationCode#22] Format: CSV, InputPaths: file:/home/robbins/brandberry/2008.csv, PushedFilters: [IsNotNull(Origin), IsNotNull(UniqueCarrier), IsNotNull(Cancelled), IsNotNull(CancellationCode), ..., ReadSchema: struct<UniqueCarrier:string,Origin:string,Dest:string,Cancelled:int,CancellationCode:string> +- *Sort [Origin#155 ASC, UniqueCarrier#147 ASC], false, 0 +- *HashAggregate(key=[Origin#155,UniqueCarrier#147], functions=[count(1)], output=[Origin#155,UniqueCarrier#147,total#97L]) +- Exchange hashpartitioning(Origin#155, UniqueCarrier#147, 200) +- *HashAggregate(key=[Origin#155,UniqueCarrier#147], functions=[partial_count(1)], output=[Origin#155,UniqueCarrier#147,count#303L]) +- *Project [UniqueCarrier#147, Origin#155] +- *Filter (((isnotnull(UniqueCarrier#147) && isnotnull(Origin#155)) && isnotnull(Dest#156)) && (Dest#156 = ORD)) +- *Scan csv [UniqueCarrier#147,Origin#155,Dest#156] Format: CSV, InputPaths: file:/home/robbins/brandberry/2008.csv, PushedFilters: [IsNotNull(UniqueCarrier), IsNotNull(Origin), 
IsNotNull(Dest), EqualTo(Dest,ORD)], ReadSchema: struct<UniqueCarrier:string,Origin:string,Dest:string> {noformat} > segmentation violation in o.a.s.unsafe.types.UTF8String > -------------------------------------------------------- > > Key: SPARK-15822 > URL: https://issues.apache.org/jira/browse/SPARK-15822 > Project: Spark > Issue Type: Bug > Affects Versions: 2.0.0 > Environment: linux amd64 > openjdk version "1.8.0_91" > OpenJDK Runtime Environment (build 1.8.0_91-b14) > OpenJDK 64-Bit Server VM (build 25.91-b14, mixed mode) > Reporter: Pete Robbins > Assignee: Herman van Hovell > Priority: Blocker > > Executors fail with segmentation violation while running application with > spark.memory.offHeap.enabled true > spark.memory.offHeap.size 512m > Also now reproduced with > spark.memory.offHeap.enabled false > {noformat} > # > # A fatal error has been detected by the Java Runtime Environment: > # > # SIGSEGV (0xb) at pc=0x00007f4559b4d4bd, pid=14182, tid=139935319750400 > # > # JRE version: OpenJDK Runtime Environment (8.0_91-b14) (build 1.8.0_91-b14) > # Java VM: OpenJDK 64-Bit Server VM (25.91-b14 mixed mode linux-amd64 > compressed oops) > # Problematic frame: > # J 4816 C2 > org.apache.spark.unsafe.types.UTF8String.compareTo(Lorg/apache/spark/unsafe/types/UTF8String;)I > (64 bytes) @ 0x00007f4559b4d4bd [0x00007f4559b4d460+0x5d] > {noformat} > We initially saw this on IBM java on PowerPC box but is recreatable on linux > with OpenJDK. 
On linux with IBM Java 8 we see a null pointer exception at the > same code point: > {noformat} > 16/06/08 11:14:58 ERROR Executor: Exception in task 1.0 in stage 5.0 (TID 48) > java.lang.NullPointerException > at > org.apache.spark.unsafe.types.UTF8String.compareTo(UTF8String.java:831) > at org.apache.spark.unsafe.types.UTF8String.compare(UTF8String.java:844) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.findNextInnerJoinRows$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$doExecute$2$$anon$2.hasNext(WholeStageCodegenExec.scala:377) > at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408) > at > scala.collection.convert.Wrappers$IteratorWrapper.hasNext(Wrappers.scala:30) > at org.spark_project.guava.collect.Ordering.leastOf(Ordering.java:664) > at org.apache.spark.util.collection.Utils$.takeOrdered(Utils.scala:37) > at > org.apache.spark.rdd.RDD$$anonfun$takeOrdered$1$$anonfun$30.apply(RDD.scala:1365) > at > org.apache.spark.rdd.RDD$$anonfun$takeOrdered$1$$anonfun$30.apply(RDD.scala:1362) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:757) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:757) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:318) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:282) > at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70) > at org.apache.spark.scheduler.Task.run(Task.scala:85) > at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1153) > at > 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628) > at java.lang.Thread.run(Thread.java:785) > {noformat} -- This message was sent by Atlassian JIRA (v6.3.4#6332) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org For additional commands, e-mail: issues-help@spark.apache.org