This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch branch-3.5 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.5 by this push: new f0054c5a10b [SPARK-45920][SQL][3.5] group by ordinal should be idempotent f0054c5a10b is described below commit f0054c5a10bf388688e7b2914cb639c96ffdd8f3 Author: Wenchen Fan <wenc...@databricks.com> AuthorDate: Thu Nov 16 08:16:20 2023 -0800 [SPARK-45920][SQL][3.5] group by ordinal should be idempotent backport https://github.com/apache/spark/pull/43797 ### What changes were proposed in this pull request? GROUP BY ordinal is not idempotent today. If the ordinal points to another integer literal and the plan get analyzed again, we will re-do the ordinal resolution which can lead to wrong result or index out-of-bound error. This PR fixes it by using a hack: if the ordinal points to another integer literal, don't replace the ordinal. ### Why are the changes needed? For advanced users or Spark plugins, they may manipulate the logical plans directly. We need to make the framework more reliable. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? new test ### Was this patch authored or co-authored using generative AI tooling? no Closes #43836 from cloud-fan/3.5-port. Authored-by: Wenchen Fan <wenc...@databricks.com> Signed-off-by: Dongjoon Hyun <dh...@apple.com> --- .../spark/sql/catalyst/analysis/Analyzer.scala | 14 ++++++++++++- .../SubstituteUnresolvedOrdinalsSuite.scala | 23 ++++++++++++++++++++-- .../analyzer-results/group-by-ordinal.sql.out | 2 +- 3 files changed, 35 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 80cb5d8c608..02b9c244543 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -1970,7 +1970,19 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor throw QueryCompilationErrors.groupByPositionRefersToAggregateFunctionError( index, ordinalExpr) } else { - ordinalExpr + trimAliases(ordinalExpr) match { + // HACK ALERT: If the ordinal expression is also an integer literal, don't use it + // but still keep the ordinal literal. The reason is we may repeatedly + // analyze the plan. Using a different integer literal may lead to + // a repeat GROUP BY ordinal resolution which is wrong. GROUP BY + // constant is meaningless so whatever value does not matter here. + // TODO: (SPARK-45932) GROUP BY ordinal should pull out grouping expressions to + // a Project, then the resolved ordinal expression is always + // `AttributeReference`. + case Literal(_: Int, IntegerType) => + Literal(index) + case _ => ordinalExpr + } } } else { throw QueryCompilationErrors.groupByPositionRangeError(index, aggs.size) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala index b0d7ace646e..953b2c8bb10 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala @@ -17,10 +17,11 @@ package org.apache.spark.sql.catalyst.analysis -import org.apache.spark.sql.catalyst.analysis.TestRelations.testRelation2 +import org.apache.spark.sql.catalyst.analysis.TestRelations.{testRelation, testRelation2} import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ -import org.apache.spark.sql.catalyst.expressions.Literal +import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, Literal} +import org.apache.spark.sql.catalyst.plans.logical.LocalRelation import org.apache.spark.sql.internal.SQLConf class SubstituteUnresolvedOrdinalsSuite extends AnalysisTest { @@ -67,4 +68,22 @@ class SubstituteUnresolvedOrdinalsSuite extends AnalysisTest { testRelation2.groupBy(Literal(1), Literal(2))($"a", $"b")) } } + + test("SPARK-45920: group by ordinal repeated analysis") { + val plan = testRelation.groupBy(Literal(1))(Literal(100).as("a")).analyze + comparePlans( + plan, + testRelation.groupBy(Literal(1))(Literal(100).as("a")) + ) + + val testRelationWithData = testRelation.copy(data = Seq(new GenericInternalRow(Array(1: Any)))) + // Copy the plan to reset its `analyzed` flag, so that analyzer rules will re-apply. + val copiedPlan = plan.transform { + case _: LocalRelation => testRelationWithData + } + comparePlans( + copiedPlan.analyze, // repeated analysis + testRelationWithData.groupBy(Literal(1))(Literal(100).as("a")) + ) + } } diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/group-by-ordinal.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/group-by-ordinal.sql.out index c8c34a856d4..1bcde5bd367 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/group-by-ordinal.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/group-by-ordinal.sql.out @@ -61,7 +61,7 @@ Aggregate [a#x, a#x], [a#x, 1 AS 1#x, sum(b#x) AS sum(b)#xL] -- !query select a, 1, sum(b) from data group by 1, 2 -- !query analysis -Aggregate [a#x, 1], [a#x, 1 AS 1#x, sum(b#x) AS sum(b)#xL] +Aggregate [a#x, 2], [a#x, 1 AS 1#x, sum(b#x) AS sum(b)#xL] +- SubqueryAlias data +- View (`data`, [a#x,b#x]) +- Project [cast(a#x as int) AS a#x, cast(b#x as int) AS b#x] --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org