This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new daebb5996e2 [SPARK-45920][SQL] group by ordinal should be idempotent
daebb5996e2 is described below

commit daebb5996e20e831220b9a9cd69fb4cd23e53c7e
Author: Wenchen Fan <wenc...@databricks.com>
AuthorDate: Thu Nov 16 00:45:04 2023 -0800

    [SPARK-45920][SQL] group by ordinal should be idempotent
    
    ### What changes were proposed in this pull request?
    
    GROUP BY ordinal is not idempotent today. If the ordinal points to another 
integer literal and the plan gets analyzed again, we will re-do the ordinal 
resolution, which can lead to a wrong result or an index out-of-bounds error. 
This PR fixes it by using a hack: if the ordinal points to another integer 
literal, don't replace the ordinal.
    
    ### Why are the changes needed?
    
    Advanced users or Spark plugins may manipulate the logical plans directly. 
We need to make the framework more reliable.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No
    
    ### How was this patch tested?
    
    new test
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    no
    
    Closes #43797 from cloud-fan/group.
    
    Authored-by: Wenchen Fan <wenc...@databricks.com>
    Signed-off-by: Dongjoon Hyun <dh...@apple.com>
---
 .../spark/sql/catalyst/analysis/Analyzer.scala     | 14 ++++++++++++-
 .../SubstituteUnresolvedOrdinalsSuite.scala        | 23 ++++++++++++++++++++--
 .../analyzer-results/group-by-ordinal.sql.out      |  2 +-
 3 files changed, 35 insertions(+), 4 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index 780edf5d8af..14c8b740f68 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -2002,7 +2002,19 @@ class Analyzer(override val catalogManager: 
CatalogManager) extends RuleExecutor
               throw 
QueryCompilationErrors.groupByPositionRefersToAggregateFunctionError(
                 index, ordinalExpr)
             } else {
-              ordinalExpr
+              trimAliases(ordinalExpr) match {
+                // HACK ALERT: If the ordinal expression is also an integer 
literal, don't use it
+                //             but still keep the ordinal literal. The reason 
is we may repeatedly
+                //             analyze the plan. Using a different integer 
literal may lead to
+                //             a repeat GROUP BY ordinal resolution which is 
wrong. GROUP BY
+                //             constant is meaningless so whatever value does 
not matter here.
+                // TODO: (SPARK-45932) GROUP BY ordinal should pull out 
grouping expressions to
+                //       a Project, then the resolved ordinal expression is 
always
+                //       `AttributeReference`.
+                case Literal(_: Int, IntegerType) =>
+                  Literal(index)
+                case _ => ordinalExpr
+              }
             }
           } else {
             throw QueryCompilationErrors.groupByPositionRangeError(index, 
aggs.size)
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala
index b0d7ace646e..953b2c8bb10 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala
@@ -17,10 +17,11 @@
 
 package org.apache.spark.sql.catalyst.analysis
 
-import org.apache.spark.sql.catalyst.analysis.TestRelations.testRelation2
+import org.apache.spark.sql.catalyst.analysis.TestRelations.{testRelation, 
testRelation2}
 import org.apache.spark.sql.catalyst.dsl.expressions._
 import org.apache.spark.sql.catalyst.dsl.plans._
-import org.apache.spark.sql.catalyst.expressions.Literal
+import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, Literal}
+import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
 import org.apache.spark.sql.internal.SQLConf
 
 class SubstituteUnresolvedOrdinalsSuite extends AnalysisTest {
@@ -67,4 +68,22 @@ class SubstituteUnresolvedOrdinalsSuite extends AnalysisTest 
{
         testRelation2.groupBy(Literal(1), Literal(2))($"a", $"b"))
     }
   }
+
+  test("SPARK-45920: group by ordinal repeated analysis") {
+    val plan = testRelation.groupBy(Literal(1))(Literal(100).as("a")).analyze
+    comparePlans(
+      plan,
+      testRelation.groupBy(Literal(1))(Literal(100).as("a"))
+    )
+
+    val testRelationWithData = testRelation.copy(data = Seq(new 
GenericInternalRow(Array(1: Any))))
+    // Copy the plan to reset its `analyzed` flag, so that analyzer rules will 
re-apply.
+    val copiedPlan = plan.transform {
+      case _: LocalRelation => testRelationWithData
+    }
+    comparePlans(
+      copiedPlan.analyze, // repeated analysis
+      testRelationWithData.groupBy(Literal(1))(Literal(100).as("a"))
+    )
+  }
 }
diff --git 
a/sql/core/src/test/resources/sql-tests/analyzer-results/group-by-ordinal.sql.out
 
b/sql/core/src/test/resources/sql-tests/analyzer-results/group-by-ordinal.sql.out
index c8c34a856d4..1bcde5bd367 100644
--- 
a/sql/core/src/test/resources/sql-tests/analyzer-results/group-by-ordinal.sql.out
+++ 
b/sql/core/src/test/resources/sql-tests/analyzer-results/group-by-ordinal.sql.out
@@ -61,7 +61,7 @@ Aggregate [a#x, a#x], [a#x, 1 AS 1#x, sum(b#x) AS sum(b)#xL]
 -- !query
 select a, 1, sum(b) from data group by 1, 2
 -- !query analysis
-Aggregate [a#x, 1], [a#x, 1 AS 1#x, sum(b#x) AS sum(b)#xL]
+Aggregate [a#x, 2], [a#x, 1 AS 1#x, sum(b#x) AS sum(b)#xL]
 +- SubqueryAlias data
    +- View (`data`, [a#x,b#x])
       +- Project [cast(a#x as int) AS a#x, cast(b#x as int) AS b#x]


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to