This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-4.0 by this push:
     new 70e0e20b985f [SPARK-47895][SQL] group by alias should be idempotent
70e0e20b985f is described below

commit 70e0e20b985f7d497541826ea69ebd5c8c8c5c39
Author: Mihailo Milosevic <mihailo.milose...@databricks.com>
AuthorDate: Mon Mar 31 21:44:38 2025 +0800

    [SPARK-47895][SQL] group by alias should be idempotent
    
    ### What changes were proposed in this pull request?
    This is a followup of https://github.com/apache/spark/pull/43797. GROUP BY ALIAS has the same bug, and this PR applies the same fix to GROUP BY ALIAS.
    
    ### Why are the changes needed?
    For advanced users or Spark plugins, they may manipulate the logical plans directly. We need to make the framework more reliable.
    
    ### Does this PR introduce _any_ user-facing change?
    Yes, we will fix the error.
    
    ### How was this patch tested?
    Test added.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    No.
    
    Closes #50461 from mihailom-db/mihailom-db/fixGroupBy.
    
    Authored-by: Mihailo Milosevic <mihailo.milose...@databricks.com>
    Signed-off-by: Wenchen Fan <wenc...@databricks.com>
    (cherry picked from commit 02db87282dabca112b0d688560064f8b71e63d18)
    Signed-off-by: Wenchen Fan <wenc...@databricks.com>
---
 .../analysis/ResolveReferencesInAggregate.scala        | 14 +++++++++++++-
 .../analysis/SubstituteUnresolvedOrdinalsSuite.scala   | 18 ++++++++++++++++++
 2 files changed, 31 insertions(+), 1 deletion(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInAggregate.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInAggregate.scala
index 7ea90854932e..d1aee4ca2a9d 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInAggregate.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInAggregate.scala
@@ -116,7 +116,19 @@ class ResolveReferencesInAggregate(val catalogManager: 
CatalogManager) extends S
       groupExprs.map { g =>
         g.transformWithPruning(_.containsPattern(UNRESOLVED_ATTRIBUTE)) {
           case u: UnresolvedAttribute =>
-            selectList.find(ne => conf.resolver(ne.name, u.name)).getOrElse(u)
+            val (result, index) =
+              selectList.zipWithIndex.find(ne => conf.resolver(ne._1.name, 
u.name))
+                .getOrElse((u, -1))
+
+            trimAliases(result) match {
+              // HACK ALERT: If the expanded grouping expression is an integer 
literal, don't use it
+              //             but use an integer literal of the index. The 
reason is we may
+              //             repeatedly analyze the plan, and the original 
integer literal may cause
+              //             failures with a later GROUP BY ordinal 
resolution. GROUP BY constant is
+              //             meaningless so whatever value does not matter 
here.
+              case IntegerLiteral(_) => Literal(index + 1)
+              case _ => result
+            }
         }
       }
     } else {
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala
index 39cf298aec43..106a607ba6df 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala
@@ -104,4 +104,22 @@ class SubstituteUnresolvedOrdinalsSuite extends 
AnalysisTest {
       testRelationWithData.groupBy(Literal(1))(Literal(100).as("a"))
     )
   }
+
+  test("SPARK-47895: group by alias repeated analysis") {
+    val plan = testRelation.groupBy($"b")(Literal(100).as("b")).analyze
+    comparePlans(
+      plan,
+      testRelation.groupBy(Literal(1))(Literal(100).as("b"))
+    )
+
+    val testRelationWithData = testRelation.copy(data = Seq(new 
GenericInternalRow(Array(1: Any))))
+    // Copy the plan to reset its `analyzed` flag, so that analyzer rules will 
re-apply.
+    val copiedPlan = plan.transform {
+      case _: LocalRelation => testRelationWithData
+    }
+    comparePlans(
+      copiedPlan.analyze, // repeated analysis
+      testRelationWithData.groupBy(Literal(1))(Literal(100).as("b"))
+    )
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to