[ https://issues.apache.org/jira/browse/SPARK-47104?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Dongjoon Hyun updated SPARK-47104:
----------------------------------
    Affects Version/s: 3.0.3

> Spark SQL query fails with NullPointerException
> -----------------------------------------------
>
>                 Key: SPARK-47104
>                 URL: https://issues.apache.org/jira/browse/SPARK-47104
>             Project: Spark
>          Issue Type: Bug
>          Components: SQL
>    Affects Versions: 3.0.3, 3.1.3, 3.2.1, 3.4.2, 3.5.0
>            Reporter: Chhavi Bansal
>            Priority: Major
>              Labels: pull-request-available
>
> I am trying to run a very simple SQL query involving a join and an ORDER BY
> clause, with the uuid() function in the outermost SELECT statement. The query
> fails:
> {code:java}
> val df = spark.read.format("csv").option("header", "true").load("src/main/resources/titanic.csv")
> df.createOrReplaceTempView("titanic")
> val query = spark.sql("select name, uuid() as _iid from (select s.name from titanic s join titanic t on s.name = t.name order by name);")
> query.show() // FAILS
> {code}
> The dataset is a plain CSV file with the following columns:
> {code:java}
> PassengerId,Survived,Pclass,Name,Gender,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
> {code}
> Below is the error:
> {code:java}
> Exception in thread "main" java.lang.NullPointerException
>   at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)
>   at org.apache.spark.sql.execution.TakeOrderedAndProjectExec.$anonfun$executeCollect$2(limit.scala:207)
>   at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:237)
>   at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
>   at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
>   at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:198)
>   at scala.collection.TraversableLike.map(TraversableLike.scala:237)
>   at scala.collection.TraversableLike.map$(TraversableLike.scala:230)
>   at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:198)
>   at org.apache.spark.sql.execution.TakeOrderedAndProjectExec.executeCollect(limit.scala:207)
>   at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$executeCollect$1(AdaptiveSparkPlanExec.scala:338)
>   at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.withFinalPlanUpdate(AdaptiveSparkPlanExec.scala:366)
>   at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.executeCollect(AdaptiveSparkPlanExec.scala:338)
>   at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3715)
>   at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2728)
>   at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3706)
>   at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
>   at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
>   at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
>   at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
>   at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
>   at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3704)
>   at org.apache.spark.sql.Dataset.head(Dataset.scala:2728)
>   at org.apache.spark.sql.Dataset.take(Dataset.scala:2935)
>   at org.apache.spark.sql.Dataset.getRows(Dataset.scala:287)
>   at org.apache.spark.sql.Dataset.showString(Dataset.scala:326)
>   at org.apache.spark.sql.Dataset.show(Dataset.scala:808)
>   at org.apache.spark.sql.Dataset.show(Dataset.scala:785)
>   at hyperspace2.sparkPlan$.delayedEndpoint$hyperspace2$sparkPlan$1(sparkPlan.scala:14)
>   at hyperspace2.sparkPlan$delayedInit$body.apply(sparkPlan.scala:6)
>   at scala.Function0.apply$mcV$sp(Function0.scala:39)
>   at scala.Function0.apply$mcV$sp$(Function0.scala:39)
>   at scala.runtime.AbstractFunction0.apply$mcV$sp(AbstractFunction0.scala:17)
>   at scala.App.$anonfun$main$1$adapted(App.scala:80)
>   at scala.collection.immutable.List.foreach(List.scala:392)
>   at scala.App.main(App.scala:80)
>   at scala.App.main$(App.scala:78)
>   at hyperspace2.sparkPlan$.main(sparkPlan.scala:6)
>   at hyperspace2.sparkPlan.main(sparkPlan.scala)
> {code}
> Note:
> # If I remove the ORDER BY clause, the query produces the correct output.
> # This happens when I read the dataset from a CSV file; it works fine if I build the DataFrame with Seq().toDF.
> # The query fails when I use spark.sql("query").show(), but succeeds when I simply write the result to a CSV file.
> [https://stackoverflow.com/questions/78020267/spark-sql-query-fails-with-nullpointerexception]
> Could someone please look into why this happens only when using `show()`? This is failing queries in production for me.
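> A minimal workaround sketch, based on the notes above and on the failing
> frame (TakeOrderedAndProjectExec.executeCollect): assuming the NPE comes from
> evaluating the nondeterministic uuid() expression in the projection applied
> after the sort, generating the UUID inside the subquery, before the ORDER BY,
> leaves only plain column references in the outer SELECT. This is an untested
> sketch, not a confirmed fix:
> {code:java}
> // Hypothetical workaround: move uuid() into the subquery so the projection
> // evaluated after the ordering only forwards existing columns instead of
> // evaluating a nondeterministic expression.
> val df = spark.read.format("csv").option("header", "true").load("src/main/resources/titanic.csv")
> df.createOrReplaceTempView("titanic")
> val workaround = spark.sql("""
>   select name, _iid from (
>     select s.name, uuid() as _iid
>     from titanic s join titanic t on s.name = t.name
>     order by name
>   )
> """)
> workaround.show()
> {code}
> If the final projection is indeed the trigger, this would also be consistent
> with note 3: writing the result out instead of calling show() avoids the
> executeCollect path of TakeOrderedAndProjectExec.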