Hi , I am reading parquet file and creating temp table. when i am trying to execute query outside of foreach function it is working fine. throws nullpointerexception within data frame.foreach function.
code snippet: String CITATION_QUERY = "select c.citation_num, c.title, c.publisher from test c"; Dataset<Row> citation_query = spark.sql(CITATION_QUERY); System.out.println("mistery:"+citation_query.count()); // Iterator<Row> iterofresulDF = resultDF.toLocalIterator(); resultDF.foreach(new ForeachFunction<Row>() { private static final long serialVersionUID = 1L; public void call(Row line) { Dataset<Row> row = spark.sql(CITATION_QUERY); System.out.println("mistery row count:"+row.count()); } }); Error trace: 16/10/12 15:59:53 INFO CodecPool: Got brand-new decompressor [.snappy] 16/10/12 15:59:53 ERROR Executor: Exception in task 0.0 in stage 5.0 (TID 5) java.lang.NullPointerException at org.apache.spark.sql.SparkSession.sessionState$lzycompute(SparkSession.scala:112) at org.apache.spark.sql.SparkSession.sessionState(SparkSession.scala:110) at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:582) at com.elsevier.datasearch.ExecuteSQL.executeQuery(ExecuteSQL.java:11) at com.elsevier.datasearch.ProcessPetDB$1.call(ProcessPetDB.java:53) at com.elsevier.datasearch.ProcessPetDB$1.call(ProcessPetDB.java:1) at org.apache.spark.sql.Dataset$$anonfun$foreach$2.apply(Dataset.scala:2118) at org.apache.spark.sql.Dataset$$anonfun$foreach$2.apply(Dataset.scala:2118) at scala.collection.Iterator$class.foreach(Iterator.scala:893) at scala.collection.AbstractIterator.foreach(Iterator.scala:1336) at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$27.apply(RDD.scala:894) at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$27.apply(RDD.scala:894) at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1916) at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1916) at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70) at org.apache.spark.scheduler.Task.run(Task.scala:86) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) at java.lang.Thread.run(Thread.java:745) Driver stacktrace: at org.apache.spark.scheduler.DAGScheduler.org $apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1454) at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1442) at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1441) at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48) at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1441) at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811) at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811) at scala.Option.foreach(Option.scala:257) at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:811) at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1667) at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1622) at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1611) at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48) at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:632) at org.apache.spark.SparkContext.runJob(SparkContext.scala:1890) at org.apache.spark.SparkContext.runJob(SparkContext.scala:1903) at org.apache.spark.SparkContext.runJob(SparkContext.scala:1916) at org.apache.spark.SparkContext.runJob(SparkContext.scala:1930) at org.apache.spark.rdd.RDD$$anonfun$foreach$1.apply(RDD.scala:894) at org.apache.spark.rdd.RDD$$anonfun$foreach$1.apply(RDD.scala:892) at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112) at org.apache.spark.rdd.RDD.withScope(RDD.scala:358) at org.apache.spark.rdd.RDD.foreach(RDD.scala:892) at org.apache.spark.sql.Dataset$$anonfun$foreach$1.apply$mcV$sp(Dataset.scala:2108) at org.apache.spark.sql.Dataset$$anonfun$foreach$1.apply(Dataset.scala:2108) at org.apache.spark.sql.Dataset$$anonfun$foreach$1.apply(Dataset.scala:2108) at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:57) at org.apache.spark.sql.Dataset.withNewExecutionId(Dataset.scala:2546) at org.apache.spark.sql.Dataset.foreach(Dataset.scala:2107) at org.apache.spark.sql.Dataset.foreach(Dataset.scala:2118) at com.elsevier.datasearch.ProcessPetDB.main(ProcessPetDB.java:46) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:497) at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:736) at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:185) at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:210) at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:124) at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala) Caused by: java.lang.NullPointerException at org.apache.spark.sql.SparkSession.sessionState$lzycompute(SparkSession.scala:112) at org.apache.spark.sql.SparkSession.sessionState(SparkSession.scala:110) at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:582) at com.elsevier.datasearch.ExecuteSQL.executeQuery(ExecuteSQL.java:11) at com.elsevier.datasearch.ProcessPetDB$1.call(ProcessPetDB.java:53) at com.elsevier.datasearch.ProcessPetDB$1.call(ProcessPetDB.java:1) at org.apache.spark.sql.Dataset$$anonfun$foreach$2.apply(Dataset.scala:2118) at org.apache.spark.sql.Dataset$$anonfun$foreach$2.apply(Dataset.scala:2118) at scala.collection.Iterator$class.foreach(Iterator.scala:893) at scala.collection.AbstractIterator.foreach(Iterator.scala:1336) at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$27.apply(RDD.scala:894) at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$27.apply(RDD.scala:894) at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1916) at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1916) at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70) at org.apache.spark.scheduler.Task.run(Task.scala:86) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) at java.lang.Thread.run(Thread.java:745) 16/10/12 15:59:53 INFO SparkContext: Invoking stop() from shutdown hook Please let me know if i am missing anything. Thank you for the help. -- Selvam Raman "லஞ்சம் தவிர்த்து நெஞ்சம் நிமிர்த்து"