This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch branch-3.5 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.5 by this push: new 8bcbf7701388 [SPARK-47561][SQL] Fix analyzer rule order issues about Alias 8bcbf7701388 is described below commit 8bcbf7701388a2da06369ae9317d7707624edba0 Author: Wenchen Fan <wenc...@databricks.com> AuthorDate: Tue Mar 26 07:45:54 2024 -0700 [SPARK-47561][SQL] Fix analyzer rule order issues about Alias ### What changes were proposed in this pull request? We found two analyzer rule execution order issues in our internal workloads: - `CreateStruct.apply` creates `NamePlaceholder` for unresolved `NamedExpression`. However, with certain rule execution order, the `NamedExpression` may be removed (e.g. remove unnecessary `Alias`) before `NamePlaceholder` is resolved, then `NamePlaceholder` can't be resolved anymore. - UNPIVOT uses `UnresolvedAlias` to wrap `UnresolvedAttribute`. There is a conflict about how to determine the final alias name. If `ResolveAliases` runs first, then `UnresolvedAlias` will be removed and eventually the alias will be `b` for nested column `a.b`. If `ResolveReferences` runs first, then we resolve `a.b` first and then `UnresolvedAlias` will determine the alias as `a.b` not `b`. This PR fixes the two issues - `CreateStruct.apply` should determine the field name immediately if the input is `Alias` - The parser rule for UNPIVOT should follow how we parse SELECT and return `UnresolvedAttribute` directly without the `UnresolvedAlias` wrapper. It's a bit risky to fix the order issue between `ResolveAliases` and `ResolveReferences` as it can change the final query schema, we will save it for later. ### Why are the changes needed? fix unstable analyzer behavior with different rule execution orders. ### Does this PR introduce _any_ user-facing change? Yes, some failed queries can run now. The issue for UNPIVOT only affects the error message. ### How was this patch tested? verified by our internal workloads. The repro query is quite complicated to trigger a certain rule execution order so we won't add tests for it. The fix is quite obvious. ### Was this patch authored or co-authored using generative AI tooling? no Closes #45718 from cloud-fan/rule. Authored-by: Wenchen Fan <wenc...@databricks.com> Signed-off-by: Dongjoon Hyun <dh...@apple.com> --- .../catalyst/expressions/complexTypeCreator.scala | 1 + .../spark/sql/catalyst/parser/AstBuilder.scala | 2 +- .../sql/catalyst/parser/UnpivotParserSuite.scala | 39 ++++++++++------------ 3 files changed, 20 insertions(+), 22 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala index 205121913121..c95a0987330d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala @@ -372,6 +372,7 @@ object CreateStruct { // alias name inside CreateNamedStruct. case (u: UnresolvedAttribute, _) => Seq(Literal(u.nameParts.last), u) case (u @ UnresolvedExtractValue(_, e: Literal), _) if e.dataType == StringType => Seq(e, u) + case (a: Alias, _) => Seq(Literal(a.name), a) case (e: NamedExpression, _) if e.resolved => Seq(Literal(e.name), e) case (e: NamedExpression, _) => Seq(NamePlaceholder, e) case (e, index) => Seq(Literal(s"col${index + 1}"), e) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index 90fbdd94dc38..5d68aed9245a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -1291,7 +1291,7 @@ class AstBuilder extends DataTypeAstBuilder with SQLConfHelper with Logging { * Create an Unpivot column. */ override def visitUnpivotColumn(ctx: UnpivotColumnContext): NamedExpression = withOrigin(ctx) { - UnresolvedAlias(UnresolvedAttribute(visitMultipartIdentifier(ctx.multipartIdentifier))) + UnresolvedAttribute(visitMultipartIdentifier(ctx.multipartIdentifier)) } /** diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/UnpivotParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/UnpivotParserSuite.scala index c680e08c1c83..3012ef6f1544 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/UnpivotParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/UnpivotParserSuite.scala @@ -39,7 +39,7 @@ class UnpivotParserSuite extends AnalysisTest { "SELECT * FROM t UNPIVOT (val FOR col in (a, b))", Unpivot( None, - Some(Seq(Seq(UnresolvedAlias($"a")), Seq(UnresolvedAlias($"b")))), + Some(Seq(Seq($"a"), Seq($"b"))), None, "col", Seq("val"), @@ -59,7 +59,7 @@ class UnpivotParserSuite extends AnalysisTest { sql, Unpivot( None, - Some(Seq(Seq(UnresolvedAlias($"a")), Seq(UnresolvedAlias($"b")))), + Some(Seq(Seq($"a"), Seq($"b"))), Some(Seq(Some("A"), None)), "col", Seq("val"), @@ -76,7 +76,7 @@ class UnpivotParserSuite extends AnalysisTest { "SELECT * FROM t UNPIVOT ((val1, val2) FOR col in ((a, b), (c, d)))", Unpivot( None, - Some(Seq(Seq($"a", $"b").map(UnresolvedAlias(_)), Seq($"c", $"d").map(UnresolvedAlias(_)))), + Some(Seq(Seq($"a", $"b"), Seq($"c", $"d"))), None, "col", Seq("val1", "val2"), @@ -96,10 +96,7 @@ class UnpivotParserSuite extends AnalysisTest { sql, Unpivot( None, - Some(Seq( - Seq($"a", $"b").map(UnresolvedAlias(_)), - Seq($"c", $"d").map(UnresolvedAlias(_)) - )), + Some(Seq(Seq($"a", $"b"), Seq($"c", $"d"))), Some(Seq(Some("first"), None)), "col", Seq("val1", "val2"), @@ -132,7 +129,7 @@ class UnpivotParserSuite extends AnalysisTest { sql, Unpivot( None, - Some(Seq(Seq(UnresolvedAlias($"a")), Seq(UnresolvedAlias($"b")))), + Some(Seq(Seq($"a"), Seq($"b"))), None, "col", Seq("val"), @@ -169,7 +166,7 @@ class UnpivotParserSuite extends AnalysisTest { "SELECT * FROM t UNPIVOT EXCLUDE NULLS (val FOR col in (a, b))", Unpivot( None, - Some(Seq(Seq(UnresolvedAlias($"a")), Seq(UnresolvedAlias($"b")))), + Some(Seq(Seq($"a"), Seq($"b"))), None, "col", Seq("val"), @@ -184,7 +181,7 @@ class UnpivotParserSuite extends AnalysisTest { "SELECT * FROM t UNPIVOT INCLUDE NULLS (val FOR col in (a, b))", Unpivot( None, - Some(Seq(Seq(UnresolvedAlias($"a")), Seq(UnresolvedAlias($"b")))), + Some(Seq(Seq($"a"), Seq($"b"))), None, "col", Seq("val"), @@ -199,7 +196,7 @@ class UnpivotParserSuite extends AnalysisTest { "SELECT * FROM t1 UNPIVOT (val FOR col in (a, b)) JOIN t2", Unpivot( None, - Some(Seq(Seq(UnresolvedAlias($"a")), Seq(UnresolvedAlias($"b")))), + Some(Seq(Seq($"a"), Seq($"b"))), None, "col", Seq("val"), @@ -211,7 +208,7 @@ class UnpivotParserSuite extends AnalysisTest { "SELECT * FROM t1 JOIN t2 UNPIVOT (val FOR col in (a, b))", Unpivot( None, - Some(Seq(Seq(UnresolvedAlias($"a")), Seq(UnresolvedAlias($"b")))), + Some(Seq(Seq($"a"), Seq($"b"))), None, "col", Seq("val"), @@ -224,7 +221,7 @@ class UnpivotParserSuite extends AnalysisTest { table("t1").join( Unpivot( None, - Some(Seq(Seq(UnresolvedAlias($"a")), Seq(UnresolvedAlias($"b")))), + Some(Seq(Seq($"a"), Seq($"b"))), None, "col", Seq("val"), @@ -239,7 +236,7 @@ class UnpivotParserSuite extends AnalysisTest { "SELECT * FROM t1 UNPIVOT (val FOR col in (a, b)), t2", Unpivot( None, - Some(Seq(Seq(UnresolvedAlias($"a")), Seq(UnresolvedAlias($"b")))), + Some(Seq(Seq($"a"), Seq($"b"))), None, "col", Seq("val"), @@ -251,7 +248,7 @@ class UnpivotParserSuite extends AnalysisTest { "SELECT * FROM t1, t2 UNPIVOT (val FOR col in (a, b))", Unpivot( None, - Some(Seq(Seq(UnresolvedAlias($"a")), Seq(UnresolvedAlias($"b")))), + Some(Seq(Seq($"a"), Seq($"b"))), None, "col", Seq("val"), @@ -267,7 +264,7 @@ class UnpivotParserSuite extends AnalysisTest { table("t1").join( Unpivot( None, - Some(Seq(Seq(UnresolvedAlias($"a")), Seq(UnresolvedAlias($"b")))), + Some(Seq(Seq($"a"), Seq($"b"))), None, "col", Seq("val"), @@ -282,7 +279,7 @@ class UnpivotParserSuite extends AnalysisTest { table("t1").join( Unpivot( None, - Some(Seq(Seq(UnresolvedAlias($"a")), Seq(UnresolvedAlias($"b")))), + Some(Seq(Seq($"a"), Seq($"b"))), None, "col", Seq("val"), @@ -296,7 +293,7 @@ class UnpivotParserSuite extends AnalysisTest { "SELECT * FROM t1, t2 JOIN t3 UNPIVOT (val FOR col in (a, b))", Unpivot( None, - Some(Seq(Seq(UnresolvedAlias($"a")), Seq(UnresolvedAlias($"b")))), + Some(Seq(Seq($"a"), Seq($"b"))), None, "col", Seq("val"), @@ -311,7 +308,7 @@ class UnpivotParserSuite extends AnalysisTest { table("t1").join( Unpivot( None, - Some(Seq(Seq(UnresolvedAlias($"a")), Seq(UnresolvedAlias($"b")))), + Some(Seq(Seq($"a"), Seq($"b"))), None, "col", Seq("val"), @@ -326,13 +323,13 @@ class UnpivotParserSuite extends AnalysisTest { "SELECT * FROM t1 UNPIVOT (val FOR col in (a, b)) UNPIVOT (val FOR col in (a, b))", Unpivot( None, - Some(Seq(Seq(UnresolvedAlias($"a")), Seq(UnresolvedAlias($"b")))), + Some(Seq(Seq($"a"), Seq($"b"))), None, "col", Seq("val"), Unpivot( None, - Some(Seq(Seq(UnresolvedAlias($"a")), Seq(UnresolvedAlias($"b")))), + Some(Seq(Seq($"a"), Seq($"b"))), None, "col", Seq("val"), --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org