Definitely looks like a bug. Ted - are you looking at this?
On Mon, May 2, 2016 at 7:15 AM, Koert Kuipers <ko...@tresata.com> wrote:

> Created issue:
> https://issues.apache.org/jira/browse/SPARK-15062
>
> On Mon, May 2, 2016 at 6:48 AM, Ted Yu <yuzhih...@gmail.com> wrote:
>
>> I tried the same statement using Spark 1.6.1.
>> There was no error with the default memory setting.
>>
>> Suggest logging a bug.
>>
>> On May 1, 2016, at 9:22 PM, Koert Kuipers <ko...@tresata.com> wrote:
>>
>> Yeah, I got that too; then I increased the heap for tests to 8G to get
>> the error I showed earlier.
>>
>> On May 2, 2016 12:09 AM, "Ted Yu" <yuzhih...@gmail.com> wrote:
>>
>>> Using commit hash 90787de864b58a1079c23e6581381ca8ffe7685f and
>>> Java 1.7.0_67, I got:
>>>
>>> scala> val dfComplicated = sc.parallelize(List((Map("1" -> "a"), List("b", "c")), (Map("2" -> "b"), List("d", "e")))).toDF
>>> ...
>>> dfComplicated: org.apache.spark.sql.DataFrame = [_1: map<string,string>, _2: array<string>]
>>>
>>> scala> dfComplicated.show
>>> java.lang.OutOfMemoryError: Java heap space
>>>   at org.apache.spark.unsafe.types.UTF8String.getBytes(UTF8String.java:229)
>>>   at org.apache.spark.unsafe.types.UTF8String.toString(UTF8String.java:821)
>>>   at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificSafeProjection.apply(Unknown Source)
>>>   at org.apache.spark.sql.catalyst.encoders.ExpressionEncoder.fromRow(ExpressionEncoder.scala:241)
>>>   at org.apache.spark.sql.Dataset$$anonfun$org$apache$spark$sql$Dataset$$execute$1$1$$anonfun$apply$13.apply(Dataset.scala:2121)
>>>   at org.apache.spark.sql.Dataset$$anonfun$org$apache$spark$sql$Dataset$$execute$1$1$$anonfun$apply$13.apply(Dataset.scala:2121)
>>>   at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
>>>   at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
>>>   at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
>>>   at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
>>>   at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
>>>   at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186)
>>>   at org.apache.spark.sql.Dataset$$anonfun$org$apache$spark$sql$Dataset$$execute$1$1.apply(Dataset.scala:2121)
>>>   at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:54)
>>>   at org.apache.spark.sql.Dataset.withNewExecutionId(Dataset.scala:2408)
>>>   at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$execute$1(Dataset.scala:2120)
>>>   at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collect(Dataset.scala:2127)
>>>   at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:1861)
>>>   at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:1860)
>>>   at org.apache.spark.sql.Dataset.withTypedCallback(Dataset.scala:2438)
>>>   at org.apache.spark.sql.Dataset.head(Dataset.scala:1860)
>>>   at org.apache.spark.sql.Dataset.take(Dataset.scala:2077)
>>>   at org.apache.spark.sql.Dataset.showString(Dataset.scala:238)
>>>   at org.apache.spark.sql.Dataset.show(Dataset.scala:529)
>>>   at org.apache.spark.sql.Dataset.show(Dataset.scala:489)
>>>   at org.apache.spark.sql.Dataset.show(Dataset.scala:498)
>>>   ... 6 elided
>>>
>>> scala>
>>>
>>> On Sun, May 1, 2016 at 8:34 PM, Koert Kuipers <ko...@tresata.com> wrote:
>>>
>>>> By removing dependencies it turns into a different error; see below.
>>>> The test simply writes a DataFrame out to file and reads it back in.
>>>> I see the error for all data sources (json, parquet, etc.).
>>>>
>>>> This is the data that I write out and read back in:
>>>>
>>>> val dfComplicated = sc.parallelize(List((Map("1" -> "a"), List("b", "c")), (Map("2" -> "b"), List("d", "e")))).toDF
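>>>>
>>>> In outline the failing round trip looks like this (a sketch only: it
>>>> assumes a spark-shell-style session with sc and spark in scope, the
>>>> output path is a placeholder, and the actual test harness is not
>>>> shown here):
>>>>
>>>> // hypothetical path; json, parquet, etc. all fail the same way
>>>> dfComplicated.write.format("parquet").save("/tmp/dfComplicated")
>>>> val readBack = spark.read.format("parquet").load("/tmp/dfComplicated")
>>>> readBack.collect() // decoding the rows read back fails as below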
>>>>
>>>> [info] java.lang.RuntimeException: Error while decoding: java.lang.NegativeArraySizeException
>>>> [info] createexternalrow(if (isnull(input[0, map<string,string>])) null else staticinvoke(class org.apache.spark.sql.catalyst.util.ArrayBasedMapData$, ObjectType(interface scala.collection.Map), toScalaMap, staticinvoke(class scala.collection.mutable.WrappedArray$, ObjectType(interface scala.collection.Seq), make, mapobjects(lambdavariable(MapObjects_loopValue16, MapObjects_loopIsNull17, StringType), lambdavariable(MapObjects_loopValue16, MapObjects_loopIsNull17, StringType).toString, input[0, map<string,string>].keyArray).array, true), staticinvoke(class scala.collection.mutable.WrappedArray$, ObjectType(interface scala.collection.Seq), make, mapobjects(lambdavariable(MapObjects_loopValue18, MapObjects_loopIsNull19, StringType), lambdavariable(MapObjects_loopValue18, MapObjects_loopIsNull19, StringType).toString, input[0, map<string,string>].valueArray).array, true), true), if (isnull(input[1, array<string>])) null else staticinvoke(class scala.collection.mutable.WrappedArray$, ObjectType(interface scala.collection.Seq), make, mapobjects(lambdavariable(MapObjects_loopValue20, MapObjects_loopIsNull21, StringType), lambdavariable(MapObjects_loopValue20, MapObjects_loopIsNull21, StringType).toString, input[1, array<string>]).array, true), StructField(_1,MapType(StringType,StringType,true),true), StructField(_2,ArrayType(StringType,true),true))
>>>> [info] :- if (isnull(input[0, map<string,string>])) null else staticinvoke(class org.apache.spark.sql.catalyst.util.ArrayBasedMapData$, ObjectType(interface scala.collection.Map), toScalaMap, staticinvoke(class scala.collection.mutable.WrappedArray$, ObjectType(interface scala.collection.Seq), make, mapobjects(lambdavariable(MapObjects_loopValue16, MapObjects_loopIsNull17, StringType), lambdavariable(MapObjects_loopValue16, MapObjects_loopIsNull17, StringType).toString, input[0, map<string,string>].keyArray).array, true), staticinvoke(class scala.collection.mutable.WrappedArray$, ObjectType(interface scala.collection.Seq), make, mapobjects(lambdavariable(MapObjects_loopValue18, MapObjects_loopIsNull19, StringType), lambdavariable(MapObjects_loopValue18, MapObjects_loopIsNull19, StringType).toString, input[0, map<string,string>].valueArray).array, true), true)
>>>> [info] :  :- isnull(input[0, map<string,string>])
>>>> [info] :  :  +- input[0, map<string,string>]
>>>> [info] :  :- null
>>>> [info] :  +- staticinvoke(class org.apache.spark.sql.catalyst.util.ArrayBasedMapData$, ObjectType(interface scala.collection.Map), toScalaMap, staticinvoke(class scala.collection.mutable.WrappedArray$, ObjectType(interface scala.collection.Seq), make, mapobjects(lambdavariable(MapObjects_loopValue16, MapObjects_loopIsNull17, StringType), lambdavariable(MapObjects_loopValue16, MapObjects_loopIsNull17, StringType).toString, input[0, map<string,string>].keyArray).array, true), staticinvoke(class scala.collection.mutable.WrappedArray$, ObjectType(interface scala.collection.Seq), make, mapobjects(lambdavariable(MapObjects_loopValue18, MapObjects_loopIsNull19, StringType), lambdavariable(MapObjects_loopValue18, MapObjects_loopIsNull19, StringType).toString, input[0, map<string,string>].valueArray).array, true), true)
>>>> [info] :     :- staticinvoke(class scala.collection.mutable.WrappedArray$, ObjectType(interface scala.collection.Seq), make, mapobjects(lambdavariable(MapObjects_loopValue16, MapObjects_loopIsNull17, StringType), lambdavariable(MapObjects_loopValue16, MapObjects_loopIsNull17, StringType).toString, input[0, map<string,string>].keyArray).array, true)
>>>> [info] :     :  +- mapobjects(lambdavariable(MapObjects_loopValue16, MapObjects_loopIsNull17, StringType), lambdavariable(MapObjects_loopValue16, MapObjects_loopIsNull17, StringType).toString, input[0, map<string,string>].keyArray).array
>>>> [info] :     :     +- mapobjects(lambdavariable(MapObjects_loopValue16, MapObjects_loopIsNull17, StringType), lambdavariable(MapObjects_loopValue16, MapObjects_loopIsNull17, StringType).toString, input[0, map<string,string>].keyArray)
>>>> [info] :     :        :- lambdavariable(MapObjects_loopValue16, MapObjects_loopIsNull17, StringType).toString
>>>> [info] :     :        :  +- lambdavariable(MapObjects_loopValue16, MapObjects_loopIsNull17, StringType)
>>>> [info] :     :        +- input[0, map<string,string>].keyArray
>>>> [info] :     :           +- input[0, map<string,string>]
>>>> [info] :     +- staticinvoke(class scala.collection.mutable.WrappedArray$, ObjectType(interface scala.collection.Seq), make, mapobjects(lambdavariable(MapObjects_loopValue18, MapObjects_loopIsNull19, StringType), lambdavariable(MapObjects_loopValue18, MapObjects_loopIsNull19, StringType).toString, input[0, map<string,string>].valueArray).array, true)
>>>> [info] :        +- mapobjects(lambdavariable(MapObjects_loopValue18, MapObjects_loopIsNull19, StringType), lambdavariable(MapObjects_loopValue18, MapObjects_loopIsNull19, StringType).toString, input[0, map<string,string>].valueArray).array
>>>> [info] :           +- mapobjects(lambdavariable(MapObjects_loopValue18, MapObjects_loopIsNull19, StringType), lambdavariable(MapObjects_loopValue18, MapObjects_loopIsNull19, StringType).toString, input[0, map<string,string>].valueArray)
>>>> [info] :              :- lambdavariable(MapObjects_loopValue18, MapObjects_loopIsNull19, StringType).toString
>>>> [info] :              :  +- lambdavariable(MapObjects_loopValue18, MapObjects_loopIsNull19, StringType)
>>>> [info] :              +- input[0, map<string,string>].valueArray
>>>> [info] :                 +- input[0, map<string,string>]
>>>> [info] +- if (isnull(input[1, array<string>])) null else staticinvoke(class scala.collection.mutable.WrappedArray$, ObjectType(interface scala.collection.Seq), make, mapobjects(lambdavariable(MapObjects_loopValue20, MapObjects_loopIsNull21, StringType), lambdavariable(MapObjects_loopValue20, MapObjects_loopIsNull21, StringType).toString, input[1, array<string>]).array, true)
>>>> [info]    :- isnull(input[1, array<string>])
>>>> [info]    :  +- input[1, array<string>]
>>>> [info]    :- null
>>>> [info]    +- staticinvoke(class scala.collection.mutable.WrappedArray$, ObjectType(interface scala.collection.Seq), make, mapobjects(lambdavariable(MapObjects_loopValue20, MapObjects_loopIsNull21, StringType), lambdavariable(MapObjects_loopValue20, MapObjects_loopIsNull21, StringType).toString, input[1, array<string>]).array, true)
>>>> [info]       +- mapobjects(lambdavariable(MapObjects_loopValue20, MapObjects_loopIsNull21, StringType), lambdavariable(MapObjects_loopValue20, MapObjects_loopIsNull21, StringType).toString, input[1, array<string>]).array
>>>> [info]          +- mapobjects(lambdavariable(MapObjects_loopValue20, MapObjects_loopIsNull21, StringType), lambdavariable(MapObjects_loopValue20, MapObjects_loopIsNull21, StringType).toString, input[1, array<string>])
>>>> [info]             :- lambdavariable(MapObjects_loopValue20, MapObjects_loopIsNull21, StringType).toString
>>>> [info]             :  +- lambdavariable(MapObjects_loopValue20, MapObjects_loopIsNull21, StringType)
>>>> [info]             +- input[1, array<string>]
>>>> [info]   at org.apache.spark.sql.catalyst.encoders.ExpressionEncoder.fromRow(ExpressionEncoder.scala:244)
>>>> [info]   at org.apache.spark.sql.Dataset$$anonfun$org$apache$spark$sql$Dataset$$execute$1$1$$anonfun$apply$13.apply(Dataset.scala:2121)
>>>> [info]   at org.apache.spark.sql.Dataset$$anonfun$org$apache$spark$sql$Dataset$$execute$1$1$$anonfun$apply$13.apply(Dataset.scala:2121)
>>>> [info]   at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:245)
>>>> [info]   at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:245)
>>>> [info]   at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
>>>> [info]   at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
>>>> [info]   at scala.collection.TraversableLike$class.map(TraversableLike.scala:245)
>>>> [info]   at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186)
>>>> [info]   at org.apache.spark.sql.Dataset$$anonfun$org$apache$spark$sql$Dataset$$execute$1$1.apply(Dataset.scala:2121)
>>>> [info]   at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:54)
>>>> [info]   at org.apache.spark.sql.Dataset.withNewExecutionId(Dataset.scala:2408)
>>>> [info]   at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$execute$1(Dataset.scala:2120)
>>>> [info]   at org.apache.spark.sql.Dataset$$anonfun$org$apache$spark$sql$Dataset$$collect$1.apply(Dataset.scala:2125)
>>>> [info]   at org.apache.spark.sql.Dataset$$anonfun$org$apache$spark$sql$Dataset$$collect$1.apply(Dataset.scala:2125)
>>>> [info]   at org.apache.spark.sql.Dataset.withCallback(Dataset.scala:2421)
>>>> [info]   at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collect(Dataset.scala:2125)
>>>> [info]   at org.apache.spark.sql.Dataset.collect(Dataset.scala:2101)
>>>>
>>>> On Thu, Apr 28, 2016 at 2:41 PM, Ted Yu <yuzhih...@gmail.com> wrote:
>>>>
>>>>> Are you able to pastebin a unit test which can reproduce the following?
>>>>>
>>>>> Thanks
>>>>>
>>>>> On Apr 28, 2016, at 11:35 AM, Koert Kuipers <ko...@tresata.com> wrote:
>>>>>
>>>>> I tried for the first time to run our own in-house unit tests on
>>>>> Spark 2, and I get the error below. Has anyone seen this?
>>>>>
>>>>> It is reproducible. I tried the latest Java 7 and it is still there.
>>>>>
>>>>> # A fatal error has been detected by the Java Runtime Environment:
>>>>> #
>>>>> # SIGSEGV (0xb) at pc=0x00007f7c3a4b1f54, pid=21939, tid=140171011417856
>>>>> #
>>>>> # JRE version: Java(TM) SE Runtime Environment (7.0_75-b13) (build 1.7.0_75-b13)
>>>>> # Java VM: Java HotSpot(TM) 64-Bit Server VM (24.75-b04 mixed mode linux-amd64 compressed oops)
>>>>> # Problematic frame:
>>>>> # V  [libjvm.so+0x747f54]  _Copy_arrayof_conjoint_jlongs+0x44
>>>>>
>>>>> More info:
>>>>>
>>>>> Stack: [0x00007f7c1b47e000,0x00007f7c1b57f000], sp=0x00007f7c1b57a9a8, free space=1010k
>>>>> Native frames: (J=compiled Java code, j=interpreted, Vv=VM code, C=native code)
>>>>> V  [libjvm.so+0x747f54]  _Copy_arrayof_conjoint_jlongs+0x44
>>>>> j  sun.misc.Unsafe.copyMemory(Ljava/lang/Object;JLjava/lang/Object;JJ)V+0
>>>>> j  org.apache.spark.unsafe.Platform.copyMemory(Ljava/lang/Object;JLjava/lang/Object;JJ)V+34
>>>>> j  org.apache.spark.unsafe.types.UTF8String.getBytes()[B+76
>>>>> j  org.apache.spark.unsafe.types.UTF8String.toString()Ljava/lang/String;+5
>>>>> j  org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificSafeProjection.apply(Ljava/lang/Object;)Ljava/lang/Object;+876
>>>>> j  org.apache.spark.sql.catalyst.encoders.ExpressionEncoder.fromRow(Lorg/apache/spark/sql/catalyst/InternalRow;)Ljava/lang/Object;+5
>>>>> j  org.apache.spark.sql.Dataset$$anonfun$org$apache$spark$sql$Dataset$$execute$1$1$$anonfun$apply$13.apply(Lorg/apache/spark/sql/catalyst/InternalRow;)Ljava/lang/Object;+11
>>>>> j  org.apache.spark.sql.Dataset$$anonfun$org$apache$spark$sql$Dataset$$execute$1$1$$anonfun$apply$13.apply(Ljava/lang/Object;)Ljava/lang/Object;+5
>>>>> J 13277 C2 scala.collection.mutable.ArrayOps$ofRef.map(Lscala/Function1;Lscala/collection/generic/CanBuildFrom;)Ljava/lang/Object; (7 bytes) @ 0x00007f7c25eeae08 [0x00007f7c25eead40+0xc8]
>>>>> j  org.apache.spark.sql.Dataset$$anonfun$org$apache$spark$sql$Dataset$$execute$1$1.apply()Ljava/lang/Object;+43
>>>>> j  org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(Lorg/apache/spark/sql/SparkSession;Lorg/apache/spark/sql/execution/QueryExecution;Lscala/Function0;)Ljava/lang/Object;+106
>>>>> j  org.apache.spark.sql.Dataset.withNewExecutionId(Lscala/Function0;)Ljava/lang/Object;+12
>>>>> j  org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$execute$1()Ljava/lang/Object;+9
>>>>> j  org.apache.spark.sql.Dataset$$anonfun$org$apache$spark$sql$Dataset$$collect$1.apply(Lorg/apache/spark/sql/Dataset;)Ljava/lang/Object;+4
>>>>> j  org.apache.spark.sql.Dataset$$anonfun$org$apache$spark$sql$Dataset$$collect$1.apply(Ljava/lang/Object;)Ljava/lang/Object;+5
>>>>> j  org.apache.spark.sql.Dataset.withCallback(Ljava/lang/String;Lorg/apache/spark/sql/Dataset;Lscala/Function1;)Ljava/lang/Object;+25
>>>>> j  org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collect(Z)Ljava/lang/Object;+20
>>>>> j  org.apache.spark.sql.Dataset.collect()Ljava/lang/Object;+2
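(For reference, the "heap for tests to 8G" mentioned above can be set along
these lines in an sbt build. This is a sketch that assumes forked test JVMs;
the actual build definition is not part of this thread:

  // build.sbt
  fork in Test := true            // javaOptions only apply to forked test JVMs
  javaOptions in Test += "-Xmx8g" // raise the test heap to 8G

The equivalent for a Maven build is passing -Xmx8g via the surefire plugin's
argLine parameter.)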