This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new bbeb8d7417ba [SPARK-46640][SQL] Fix RemoveRedundantAlias by excluding subquery attributes bbeb8d7417ba is described below commit bbeb8d7417bafa09ad5202347175a47b3217e27f Author: Nikhil Sheoran <125331115+nikhilsheoran...@users.noreply.github.com> AuthorDate: Fri Jan 12 10:20:49 2024 +0800 [SPARK-46640][SQL] Fix RemoveRedundantAlias by excluding subquery attributes ### What changes were proposed in this pull request? - In `RemoveRedundantAliases`, we have an `excluded` AttributeSet argument denoting the references for which we should not remove aliases. For a query with subquery expressions, adding the attributes references by the subquery in the `excluded` set prevents rewrites that might remove presumedly redundant aliases. (Changes in RemoveRedundantAlias) - Added a configuration flag to disable this fix, if not needed. - Added a unit test with Filter exists subquery expression to show how the alias would have been removed. ### Why are the changes needed? - `RemoveRedundantAliases` does not take into account the outer attributes of a `SubqueryExpression` when considering redundant aliases, potentially removing them if it thinks they are redundant. - This can cause scenarios where a subquery expression has conditions like `a#x = a#x` i.e. both the attribute names and the expression ID(s) are the same. This can then lead to conflicting expression ID(s) error. - For example, in the query example below, the `RemoveRedundantAliases` would remove the alias `a#0 as a#1` and replace `a#1` with `a#0` in the Filter exists subquery expression which would create an issue if the subquery expression had an attribute with reference `a#0` (possible due to different scan relation instances possibly having the same attribute ID(s) (Ref: #40662) ``` Filter exists [a#1 && (a#1 = b#2)] : +- LocalRelation <empty>, [b#2] +- Project [a#0 AS a#1] +- LocalRelation <empty>, [a#0] ``` becomes ``` Filter exists [a#0 && (a#0 = b#2)] : +- LocalRelation <empty>, [b#2] +- LocalRelation <empty>, [a#0] ``` - The changes are needed to fix this bug. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? - Added a unit test with Filter exists subquery expression to show how the alias would have been removed. ### Was this patch authored or co-authored using generative AI tooling? No Closes #44645 from nikhilsheoran-db/SPARK-46640. Authored-by: Nikhil Sheoran <125331115+nikhilsheoran...@users.noreply.github.com> Signed-off-by: Wenchen Fan <wenc...@databricks.com> --- .../spark/sql/catalyst/optimizer/Optimizer.scala | 12 +++++- .../org/apache/spark/sql/internal/SQLConf.scala | 9 ++++ .../RemoveRedundantAliasAndProjectSuite.scala | 48 ++++++++++++++++++++++ 3 files changed, 68 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 61791b35df85..8fcc7c7c26b4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -584,10 +584,20 @@ object RemoveRedundantAliases extends Rule[LogicalPlan] { } case _ => + val subQueryAttributes = if (conf.getConf(SQLConf + .EXCLUDE_SUBQUERY_EXP_REFS_FROM_REMOVE_REDUNDANT_ALIASES)) { + // Collect the references for all the subquery expressions in the plan. + AttributeSet.fromAttributeSets(plan.expressions.collect { + case e: SubqueryExpression => e.references + }) + } else { + AttributeSet.empty + } + // Remove redundant aliases in the subtree(s). val currentNextAttrPairs = mutable.Buffer.empty[(Attribute, Attribute)] val newNode = plan.mapChildren { child => - val newChild = removeRedundantAliases(child, excluded) + val newChild = removeRedundantAliases(child, excluded ++ subQueryAttributes) currentNextAttrPairs ++= createAttributeMapping(child, newChild) newChild } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 1928e74363cb..743a2e20c885 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -4513,6 +4513,15 @@ object SQLConf { .booleanConf .createWithDefault(true) + val EXCLUDE_SUBQUERY_EXP_REFS_FROM_REMOVE_REDUNDANT_ALIASES = + buildConf("spark.sql.optimizer.excludeSubqueryRefsFromRemoveRedundantAliases.enabled") + .internal() + .doc("When true, exclude the references from the subquery expressions (in, exists, etc.) " + + s"while removing redundant aliases.") + .version("4.0.0") + .booleanConf + .createWithDefault(true) + val TIME_TRAVEL_TIMESTAMP_KEY = buildConf("spark.sql.timeTravelTimestampKey") .doc("The option name to specify the time travel timestamp when reading a table.") diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RemoveRedundantAliasAndProjectSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RemoveRedundantAliasAndProjectSuite.scala index cd19e5062ae1..8a0a0466ca74 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RemoveRedundantAliasAndProjectSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RemoveRedundantAliasAndProjectSuite.scala @@ -23,6 +23,7 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules._ +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.MetadataBuilder class RemoveRedundantAliasAndProjectSuite extends PlanTest { @@ -130,4 +131,51 @@ class RemoveRedundantAliasAndProjectSuite extends PlanTest { correlated = false) comparePlans(optimized, expected) } + + test("SPARK-46640: do not remove outer references from a subquery expression") { + val a = $"a".int + val a_alias = Alias(a, "a")() + val a_alias_attr = a_alias.toAttribute + val b = $"b".int + + // The original input query + // Filter exists [a#1 && (a#1 = b#2)] + // : +- LocalRelation <empty>, [b#2] + // +- Project [a#0 AS a#1] + // +- LocalRelation <empty>, [a#0] + val query = Filter( + Exists( + LocalRelation(b), + outerAttrs = Seq(a_alias_attr), + joinCond = Seq(EqualTo(a_alias_attr, b)) + ), + Project(Seq(a_alias), LocalRelation(a)) + ) + + // The alias would not be removed if excluding subquery references is enabled. + val expectedWhenExcluded = query + + // The alias would have been removed if excluding subquery references is disabled. + // Filter exists [a#0 && (a#0 = b#2)] + // : +- LocalRelation <empty>, [b#2] + // +- LocalRelation <empty>, [a#0] + val expectedWhenNotExcluded = Filter( + Exists( + LocalRelation(b), + outerAttrs = Seq(a), + joinCond = Seq(EqualTo(a, b)) + ), + LocalRelation(a) + ) + + withSQLConf(SQLConf.EXCLUDE_SUBQUERY_EXP_REFS_FROM_REMOVE_REDUNDANT_ALIASES.key -> "true") { + val optimized = Optimize.execute(query) + comparePlans(optimized, expectedWhenExcluded) + } + + withSQLConf(SQLConf.EXCLUDE_SUBQUERY_EXP_REFS_FROM_REMOVE_REDUNDANT_ALIASES.key -> "false") { + val optimized = Optimize.execute(query) + comparePlans(optimized, expectedWhenNotExcluded) + } + } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org