This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new ad60c6d [SPARK-27439][SQL] Use analyzed plan when explaining Dataset ad60c6d is described below commit ad60c6d9be3234a0296d1620129d5ca108f0876b Author: Liang-Chi Hsieh <vii...@gmail.com> AuthorDate: Sun Apr 21 10:25:56 2019 -0700 [SPARK-27439][SQL] Use analyzed plan when explaining Dataset ## What changes were proposed in this pull request? Because a view is resolved during analysis when we create a dataset, the content of the view is determined when the dataset is created, not when it is evaluated. Now the explain result of a dataset is not correctly consistent with the collected result of it, because we use the pre-analyzed logical plan of the dataset in explain command. The explain command will analyze the logical plan passed in. So if a view is changed after the dataset was created, the plans shown by explain command [...] ```scala scala> spark.range(10).createOrReplaceTempView("test") scala> spark.range(5).createOrReplaceTempView("test2") scala> spark.sql("select * from test").createOrReplaceTempView("tmp001") scala> val df = spark.sql("select * from tmp001") scala> spark.sql("select * from test2").createOrReplaceTempView("tmp001") scala> df.show +---+ | id| +---+ | 0| | 1| | 2| | 3| | 4| | 5| | 6| | 7| | 8| | 9| +---+ scala> df.explain ``` Before: ```scala == Physical Plan == *(1) Range (0, 5, step=1, splits=12) ``` After: ```scala == Physical Plan == *(1) Range (0, 10, step=1, splits=12) ``` ## How was this patch tested? Manually test and unit test. Closes #24415 from viirya/SPARK-27439. 
Authored-by: Liang-Chi Hsieh <vii...@gmail.com> Signed-off-by: Dongjoon Hyun <dh...@apple.com> --- .../src/main/scala/org/apache/spark/sql/Dataset.scala | 5 ++++- .../scala/org/apache/spark/sql/DataFrameSuite.scala | 19 ++++++++++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 793714f..e974912 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -498,7 +498,10 @@ class Dataset[T] private[sql]( * @since 1.6.0 */ def explain(extended: Boolean): Unit = { - val explain = ExplainCommand(queryExecution.logical, extended = extended) + // Because views are possibly resolved in the analyzed plan of this dataset, we use the analyzed + // plan in `ExplainCommand` for consistency. Otherwise, the plans shown by explain command + // might be inconsistent with the evaluated data of this dataset. 
+ val explain = ExplainCommand(queryExecution.analyzed, extended = extended) sparkSession.sessionState.executePlan(explain).executedPlan.executeCollect().foreach { // scalastyle:off println r => println(r.getString(0)) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index 8a9c526..62fcca4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql -import java.io.File +import java.io.{ByteArrayOutputStream, File} import java.nio.charset.StandardCharsets import java.sql.{Date, Timestamp} import java.util.UUID @@ -2133,4 +2133,21 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { checkAnswer(res, Row("1-1", 6, 6)) } } + + test("SPARK-27439: Explain result should match collected result after view change") { + withTempView("test", "test2", "tmp") { + spark.range(10).createOrReplaceTempView("test") + spark.range(5).createOrReplaceTempView("test2") + spark.sql("select * from test").createOrReplaceTempView("tmp") + val df = spark.sql("select * from tmp") + spark.sql("select * from test2").createOrReplaceTempView("tmp") + + val captured = new ByteArrayOutputStream() + Console.withOut(captured) { + df.explain() + } + checkAnswer(df, spark.range(10).toDF) + assert(captured.toString().contains("Range (0, 10, step=1, splits=2)")) + } + } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org