This is https://issues.apache.org/jira/browse/SPARK-10422, which has been fixed in Spark 1.5.1.
On Wed, Oct 21, 2015 at 4:40 PM, Sourav Mazumder < sourav.mazumde...@gmail.com> wrote: > In 1.5.0 if I use randomSplit on a data frame I get this error. > > Here is the code snippet - > > val splitData = merged.randomSplit(Array(70,30)) > val trainData = splitData(0).persist() > val testData = splitData(1) > > trainData.registerTempTable("trn") > > %sql select * from trn > > The exception goes like this - > > java.util.NoSuchElementException: key not found: 1910 at > scala.collection.MapLike$class.default(MapLike.scala:228) at > scala.collection.AbstractMap.default(Map.scala:58) at > scala.collection.mutable.HashMap.apply(HashMap.scala:64) at > org.apache.spark.sql.columnar.compression.DictionaryEncoding$Encoder.compress(compressionSchemes.scala:258) > at > org.apache.spark.sql.columnar.compression.CompressibleColumnBuilder$class.build(CompressibleColumnBuilder.scala:110) > at > org.apache.spark.sql.columnar.NativeColumnBuilder.build(ColumnBuilder.scala:87) > at > org.apache.spark.sql.columnar.InMemoryRelation$$anonfun$3$$anon$1$$anonfun$next$2.apply(InMemoryColumnarTableScan.scala:152) > at > org.apache.spark.sql.columnar.InMemoryRelation$$anonfun$3$$anon$1$$anonfun$next$2.apply(InMemoryColumnarTableScan.scala:152) > at > scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244) > at > scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244) > at > scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33) > at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:108) at > scala.collection.TraversableLike$class.map(TraversableLike.scala:244) at > scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:108) at > org.apache.spark.sql.columnar.InMemoryRelation$$anonfun$3$$anon$1.next(InMemoryColumnarTableScan.scala:152) > at > org.apache.spark.sql.columnar.InMemoryRelation$$anonfun$3$$anon$1.next(InMemoryColumnarTableScan.scala:120) > at 
org.apache.spark.storage.MemoryStore.unrollSafely(MemoryStore.scala:278) > at org.apache.spark.CacheManager.putInBlockManager(CacheManager.scala:171) > at org.apache.spark.CacheManager.getOrCompute(CacheManager.scala:78) at > org.apache.spark.rdd.RDD.iterator(RDD.scala:262) at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) at > org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:297) at > org.apache.spark.rdd.RDD.iterator(RDD.scala:264) at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) at > org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:297) at > org.apache.spark.rdd.RDD.iterator(RDD.scala:264) at > org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66) at > org.apache.spark.scheduler.Task.run(Task.scala:88) at > org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214) at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) > at java.lang.Thread.run(Thread.java:745) > > > Any idea ? > > regards, > Sourav >