Hmmm, this looks like a bug. Can you file a JIRA? On Thu, Oct 30, 2014 at 4:04 PM, Jean-Pascal Billaud <j...@tellapart.com> wrote:
> Hi, > > While testing SparkSQL on top of our Hive metastore, I am getting > a java.lang.ArrayIndexOutOfBoundsException while reusing a cached RDD > table. > > Basically, I have a table "mtable" partitioned by a "date" field in > Hive, and below is the Scala code I am running in spark-shell: > > val sqlContext = new org.apache.spark.sql.hive.HiveContext(sc); > val rdd_mtable = sqlContext.sql("select * from mtable where > date=20141028"); > rdd_mtable.registerTempTable("rdd_mtable"); > sqlContext.cacheTable("rdd_mtable"); > sqlContext.sql("select count(*) from rdd_mtable").collect(); <-- OK > sqlContext.sql("select count(*) from rdd_mtable").collect(); <-- Exception > > So the first collect() works just fine; however, running the second > collect(), which I expect to use the cached RDD, throws a > java.lang.ArrayIndexOutOfBoundsException — see the backtrace at the end of > this email. It seems the columnar traversal is crashing for some reason. > FYI, I am using Spark ToT (top of trunk, commit 234de9232bcfa212317a8073c4a82c3863b36b14). 
> > java.lang.ArrayIndexOutOfBoundsException: 14 > at > org.apache.spark.sql.catalyst.expressions.GenericRow.apply(Row.scala:142) > at > org.apache.spark.sql.catalyst.expressions.BoundReference.eval(BoundAttribute.scala:37) > at > org.apache.spark.sql.catalyst.expressions.Expression.n2(Expression.scala:108) > at org.apache.spark.sql.catalyst.expressions.Add.eval(arithmetic.scala:89) > at > org.apache.spark.sql.columnar.InMemoryRelation$$anonfun$computeSizeInBytes$1.apply(InMemoryColumnarTableScan.scala:66) > at > org.apache.spark.sql.columnar.InMemoryRelation$$anonfun$computeSizeInBytes$1.apply(InMemoryColumnarTableScan.scala:66) > at > scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244) > at > scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244) > at > scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) > at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47) > at scala.collection.TraversableLike$class.map(TraversableLike.scala:244) > at scala.collection.AbstractTraversable.map(Traversable.scala:105) > at > org.apache.spark.sql.columnar.InMemoryRelation.computeSizeInBytes(InMemoryColumnarTableScan.scala:66) > at > org.apache.spark.sql.columnar.InMemoryRelation.statistics(InMemoryColumnarTableScan.scala:87) > at > org.apache.spark.sql.columnar.InMemoryRelation.statisticsToBePropagated(InMemoryColumnarTableScan.scala:73) > at > org.apache.spark.sql.columnar.InMemoryRelation.withOutput(InMemoryColumnarTableScan.scala:147) > at > org.apache.spark.sql.CacheManager$$anonfun$useCachedData$1$$anonfun$applyOrElse$1.apply(CacheManager.scala:122) > at > org.apache.spark.sql.CacheManager$$anonfun$useCachedData$1$$anonfun$applyOrElse$1.apply(CacheManager.scala:122) > at scala.Option.map(Option.scala:145) > at > org.apache.spark.sql.CacheManager$$anonfun$useCachedData$1.applyOrElse(CacheManager.scala:122) > at > 
org.apache.spark.sql.CacheManager$$anonfun$useCachedData$1.applyOrElse(CacheManager.scala:119) > at > org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:144) > at > org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:162) > at scala.collection.Iterator$$anon$11.next(Iterator.scala:328) > at scala.collection.Iterator$class.foreach(Iterator.scala:727) > at scala.collection.AbstractIterator.foreach(Iterator.scala:1157) > at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48) > at > scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103) > at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47) > at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273) > at scala.collection.AbstractIterator.to(Iterator.scala:1157) > at > scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265) > at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157) > at > scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252) > at scala.collection.AbstractIterator.toArray(Iterator.scala:1157) > at > org.apache.spark.sql.catalyst.trees.TreeNode.transformChildrenDown(TreeNode.scala:191) > at > org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:147) > at > org.apache.spark.sql.CacheManager$class.useCachedData(CacheManager.scala:119) > at org.apache.spark.sql.SQLContext.useCachedData(SQLContext.scala:49) > at > org.apache.spark.sql.SQLContext$QueryExecution.withCachedData$lzycompute(SQLContext.scala:376) > at > org.apache.spark.sql.SQLContext$QueryExecution.withCachedData(SQLContext.scala:376) > at > org.apache.spark.sql.SQLContext$QueryExecution.optimizedPlan$lzycompute(SQLContext.scala:377) > at > org.apache.spark.sql.SQLContext$QueryExecution.optimizedPlan(SQLContext.scala:377) > at > org.apache.spark.sql.SQLContext$QueryExecution.sparkPlan$lzycompute(SQLContext.scala:382) > at > 
org.apache.spark.sql.SQLContext$QueryExecution.sparkPlan(SQLContext.scala:380) > at > org.apache.spark.sql.SQLContext$QueryExecution.executedPlan$lzycompute(SQLContext.scala:386) > at > org.apache.spark.sql.SQLContext$QueryExecution.executedPlan(SQLContext.scala:386) > > Thanks, >