Hi All, I am currently having issues reading in a JSON file using Spark SQL's API. Here is what the JSON file looks like: { "namespace": "spacey", "name": "namer", "type": "record", "fields": [ {"name":"f1","type":["null","string"]}, {"name":"f2","type":["null","string"]}, {"name":"f3","type":["null","string"]}, {"name":"f4","type":["null","string"]}, {"name":"f5","type":["null","string"]}, {"name":"f6","type":["null","string"]}, {"name":"f7","type":["null","string"]}, {"name":"f8","type":["null","string"]}, {"name":"f9","type":["null","string"]}, {"name":"f10","type":["null","string"]}, {"name":"f11","type":["null","string"]}, {"name":"f12","type":["null","string"]}, {"name":"f13","type":["null","string"]}, {"name":"f14","type":["null","string"]}, {"name":"f15","type":["null","string"]} ] }
This is what I am doing to read in the json file(using spark sql in the spark shell on CDH5.3): val sqlsc = new org.apache.spark.sql.SQLContext(sc) val j = sqlsc.jsonFile("/tmp/try.avsc") This is what I am getting as an error: 15/03/02 11:23:45 WARN TaskSetManager: Lost task 0.0 in stage 3.0 (TID 12, 10.0.2.15): scala.MatchError: namespace (of class java.lang.String) at org.apache.spark.sql.json.JsonRDD$$anonfun$parseJson$1$$anonfun$apply$2.apply(JsonRDD.scala:305) at org.apache.spark.sql.json.JsonRDD$$anonfun$parseJson$1$$anonfun$apply$2.apply(JsonRDD.scala:303) at scala.collection.Iterator$$anon$13.hasNext(Iterator.scala:371) at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327) at scala.collection.Iterator$class.foreach(Iterator.scala:727) at scala.collection.AbstractIterator.foreach(Iterator.scala:1157) at scala.collection.TraversableOnce$class.reduceLeft(TraversableOnce.scala:172) at scala.collection.AbstractIterator.reduceLeft(Iterator.scala:1157) at org.apache.spark.rdd.RDD$$anonfun$18.apply(RDD.scala:853) at org.apache.spark.rdd.RDD$$anonfun$18.apply(RDD.scala:851) at org.apache.spark.SparkContext$$anonfun$29.apply(SparkContext.scala:1350) at org.apache.spark.SparkContext$$anonfun$29.apply(SparkContext.scala:1350) at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:61) at org.apache.spark.scheduler.Task.run(Task.scala:56) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:196) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) at java.lang.Thread.run(Thread.java:745) 15/03/02 11:23:45 INFO TaskSetManager: Starting task 0.1 in stage 3.0 (TID 14, 10.0.2.15, ANY, 1308 bytes) 15/03/02 11:23:45 INFO TaskSetManager: Finished task 1.0 in stage 3.0 (TID 13) in 128 ms on 10.0.2.15 (1/2) 15/03/02 11:23:45 INFO TaskSetManager: Lost task 0.1 in stage 3.0 (TID 14) on executor 10.0.2.15: scala.MatchError (namespace 
(of class java.lang.String)) [duplicate 1] 15/03/02 11:23:45 INFO TaskSetManager: Starting task 0.2 in stage 3.0 (TID 15, 10.0.2.15, ANY, 1308 bytes) 15/03/02 11:23:45 INFO TaskSetManager: Lost task 0.2 in stage 3.0 (TID 15) on executor 10.0.2.15: scala.MatchError (namespace (of class java.lang.String)) [duplicate 2] 15/03/02 11:23:45 INFO TaskSetManager: Starting task 0.3 in stage 3.0 (TID 16, 10.0.2.15, ANY, 1308 bytes) 15/03/02 11:23:45 INFO TaskSetManager: Lost task 0.3 in stage 3.0 (TID 16) on executor 10.0.2.15: scala.MatchError (namespace (of class java.lang.String)) [duplicate 3] 15/03/02 11:23:45 ERROR TaskSetManager: Task 0 in stage 3.0 failed 4 times; aborting job 15/03/02 11:23:45 INFO TaskSchedulerImpl: Removed TaskSet 3.0, whose tasks have all completed, from pool 15/03/02 11:23:45 INFO TaskSchedulerImpl: Cancelling stage 3 15/03/02 11:23:45 INFO DAGScheduler: Job 3 failed: reduce at JsonRDD.scala:57, took 0.210707 s org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 3.0 failed 4 times, most recent failure: Lost task 0.3 in stage 3.0 (TID 16, 10.0.2.15): scala.MatchError: namespace (of class java.lang.String) at org.apache.spark.sql.json.JsonRDD$$anonfun$parseJson$1$$anonfun$apply$2.apply(JsonRDD.scala:305) at org.apache.spark.sql.json.JsonRDD$$anonfun$parseJson$1$$anonfun$apply$2.apply(JsonRDD.scala:303) at scala.collection.Iterator$$anon$13.hasNext(Iterator.scala:371) at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327) at scala.collection.Iterator$class.foreach(Iterator.scala:727) at scala.collection.AbstractIterator.foreach(Iterator.scala:1157) at scala.collection.TraversableOnce$class.reduceLeft(TraversableOnce.scala:172) at scala.collection.AbstractIterator.reduceLeft(Iterator.scala:1157) at org.apache.spark.rdd.RDD$$anonfun$18.apply(RDD.scala:853) at org.apache.spark.rdd.RDD$$anonfun$18.apply(RDD.scala:851) at org.apache.spark.SparkContext$$anonfun$29.apply(SparkContext.scala:1350) at 
org.apache.spark.SparkContext$$anonfun$29.apply(SparkContext.scala:1350) at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:61) at org.apache.spark.scheduler.Task.run(Task.scala:56) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:196) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) at java.lang.Thread.run(Thread.java:745) Driver stacktrace: at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1214) at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1203) at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1202) at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47) at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1202) at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:696) at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:696) at scala.Option.foreach(Option.scala:236) at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:696) at org.apache.spark.scheduler.DAGSchedulerEventProcessActor$$anonfun$receive$2.applyOrElse(DAGScheduler.scala:1420) at akka.actor.ActorCell.receiveMessage(ActorCell.scala:498) at akka.actor.ActorCell.invoke(ActorCell.scala:456) at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:237) at akka.dispatch.Mailbox.run(Mailbox.scala:219) at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:386) at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260) at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339) at 
scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979) at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107) -- View this message in context: http://apache-spark-user-list.1001560.n3.nabble.com/Issues-reading-in-Json-file-with-spark-sql-tp21879.html Sent from the Apache Spark User List mailing list archive at Nabble.com. --------------------------------------------------------------------- To unsubscribe, e-mail: user-unsubscr...@spark.apache.org For additional commands, e-mail: user-h...@spark.apache.org