This is an automated email from the ASF dual-hosted git repository.

gengliang pushed a commit to branch branch-3.4
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.4 by this push:
     new 83f8ddd1ac2 [SPARK-42416][SQL] Dataset operations should not resolve the analyzed logical plan again
83f8ddd1ac2 is described below

commit 83f8ddd1ac285550fdd429e96a37f101ef23c79c
Author: Gengliang Wang <gengli...@apache.org>
AuthorDate: Mon Feb 13 10:57:47 2023 -0800

    [SPARK-42416][SQL] Dataset operations should not resolve the analyzed logical plan again
    
    ### What changes were proposed in this pull request?
    
    For the following query
    
    ```
            sql(
              """
                |CREATE TABLE app (
                |  uid STRING,
                |  st TIMESTAMP,
                |  ds INT
                |) USING parquet PARTITIONED BY (ds);
                |""".stripMargin)
    
            sql(
              """
                |create or replace temporary view view1 as WITH new_app AS (
                |  SELECT a.* FROM app a)
                |SELECT
                |    uid,
                |    20230208 AS ds
                |  FROM
                |    new_app
                |  GROUP BY
                |    1,
                |    2
                |""".stripMargin)
    
            val df = sql("select uid from view1")
            df.show()
    ```
    Spark will throw the following error
    ```
    [GROUP_BY_POS_OUT_OF_RANGE] GROUP BY position 20230208 is not in select list (valid range is [1, 2]).; line 9 pos 4
    ```
    This is because the logical plan in `df` is not marked as analyzed after the change in https://github.com/apache/spark/commit/6adda258e5155761a861a96af4f5410b8a7f304d#diff-583171e935b2dc349378063a5841c5b98b30a2d57ac3743a9eccfe7bffcb8f2aR126, which marks the result of `inlineCTE(plan)` as analyzed instead.
    Resolving ordinals is not idempotent. In `df`, `GROUP BY 1, 2` is resolved to `GROUP BY uid, 20230208`. `Dataset.show()` resolves the logical plan in `df` again and treats the 20230208 in `GROUP BY uid, 20230208` as an ordinal again, which causes the error.
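
    Below is a minimal, self-contained sketch of that non-idempotency, using hypothetical `Expr` case classes rather than Spark's actual analyzer code: integer literals in a GROUP BY clause are read as select-list positions, so running the resolution rule a second time over already-resolved expressions misreads the literal 20230208 as position 20230208.

    ```
    object OrdinalResolutionSketch extends App {
      sealed trait Expr
      case class Column(name: String) extends Expr
      case class IntLiteral(value: Int) extends Expr

      // The select list of view1: SELECT uid, 20230208 AS ds
      val selectList: Vector[Expr] = Vector(Column("uid"), IntLiteral(20230208))

      // Replace each integer literal in GROUP BY with the matching select-list entry.
      def resolveGroupByOrdinals(groupBy: Seq[Expr]): Seq[Expr] = groupBy.map {
        case IntLiteral(pos) if pos >= 1 && pos <= selectList.size => selectList(pos - 1)
        case IntLiteral(pos) =>
          sys.error(s"[GROUP_BY_POS_OUT_OF_RANGE] GROUP BY position $pos is not in select list")
        case other => other
      }

      // First pass: GROUP BY 1, 2 becomes GROUP BY uid, 20230208.
      val resolved = resolveGroupByOrdinals(Seq(IntLiteral(1), IntLiteral(2)))
      println(resolved) // Vector(Column(uid), IntLiteral(20230208))

      // A second pass over the already-resolved expressions fails, because
      // IntLiteral(20230208) now looks like an out-of-range GROUP BY position.
      try resolveGroupByOrdinals(resolved)
      catch { case e: RuntimeException => println(e.getMessage) }
    }
    ```
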
    ### Why are the changes needed?
    
    Bug fix
    
    ### Does this PR introduce _any_ user-facing change?
    
    No, the regression has not been released yet.

    ### How was this patch tested?
    
    New UT
    
    Closes #39988 from gengliangwang/group_by_error.
    
    Authored-by: Gengliang Wang <gengli...@apache.org>
    Signed-off-by: Gengliang Wang <gengli...@apache.org>
    (cherry picked from commit 5267ad82d992ad80dccda8f59d99d84c3e85a32c)
    Signed-off-by: Gengliang Wang <gengli...@apache.org>
---
 .../sql/catalyst/analysis/CheckAnalysis.scala      |  3 +-
 .../scala/org/apache/spark/sql/SQLQuerySuite.scala | 34 ++++++++++++++++++++++
 2 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
index e95c21ad985..77948735dbe 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
@@ -142,6 +142,7 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog with QueryErrorsB
     }
     // Inline all CTEs in the plan to help check query plan structures in subqueries.
     checkAnalysis0(inlineCTE(plan))
+    plan.setAnalyzed()
   }
 
   def checkAnalysis0(plan: LogicalPlan): Unit = {
@@ -775,8 +776,6 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog with QueryErrorsB
           summary = o.origin.context.summary)
       case _ =>
     }
-
-    plan.setAnalyzed()
   }
 
   private def getAllExpressions(plan: LogicalPlan): Seq[Expression] = {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index f0566619e74..3350858b7e2 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -4546,6 +4546,40 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark
       Seq(Row(2), Row(1)))
   }
 
+  test("SPARK-42416: Dateset operations should not resolve the analyzed 
logical plan again") {
+    withTable("app") {
+      withView("view1") {
+        sql(
+          """
+            |CREATE TABLE app (
+            |  uid STRING,
+            |  st TIMESTAMP,
+            |  ds INT
+            |) USING parquet PARTITIONED BY (ds);
+            |""".stripMargin)
+
+        sql(
+          """
+            |create or replace temporary view view1 as WITH new_app AS (
+            |  SELECT a.* FROM app a)
+            |SELECT
+            |    uid,
+            |    20230208 AS ds
+            |  FROM
+            |    new_app
+            |  GROUP BY
+            |    1,
+            |    2
+            |""".stripMargin)
+        val df = sql("select uid from view1")
+        // If the logical plan in `df` is analyzed again, the 'group by 20230208' will be
+        // treated as ordinal again and there will be an error about GROUP BY position 20230208
+        // being out of range.
+        df.show()
+      }
+    }
+  }
+
   test("SPARK-39548: CreateView will make queries go into inline CTE code path 
thus" +
     "trigger a mis-clarified `window definition not found` issue") {
     sql(

