This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch branch-3.3 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.3 by this push: new c25624b4d0c [SPARK-36718][SQL][FOLLOWUP] Improve the extract-only check in CollapseProject c25624b4d0c is described below commit c25624b4d0c2d77f0a6db7e70ecf750e9a1143f2 Author: Wenchen Fan <wenc...@databricks.com> AuthorDate: Tue May 17 15:56:47 2022 +0800 [SPARK-36718][SQL][FOLLOWUP] Improve the extract-only check in CollapseProject ### What changes were proposed in this pull request? This is a followup of https://github.com/apache/spark/pull/36510 , to fix a corner case: if the `CreateStruct` is only referenced once in non-extract expressions, we should still allow collapsing the projects. ### Why are the changes needed? completely fix the perf regression ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? a new test Closes #36572 from cloud-fan/regression. Authored-by: Wenchen Fan <wenc...@databricks.com> Signed-off-by: Wenchen Fan <wenc...@databricks.com> (cherry picked from commit 98fad57221d4dffc6f1fe28d9aca1093172ecf72) Signed-off-by: Wenchen Fan <wenc...@databricks.com> --- .../apache/spark/sql/catalyst/optimizer/Optimizer.scala | 16 +++++++++------- .../sql/catalyst/optimizer/CollapseProjectSuite.scala | 11 +++++++++++ 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 759a7044f15..94e9d3cdd14 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -991,20 +991,22 @@ object CollapseProject extends Rule[LogicalPlan] with AliasHelper { val producer = producerMap.getOrElse(reference, reference) producer.deterministic && (count == 1 || alwaysInline || { val relatedConsumers = consumers.filter(_.references.contains(reference)) - val 
extractOnly = relatedConsumers.forall(isExtractOnly(_, reference)) + // It's still extract-only if there is only one reference in non-extract expressions, + // as we won't duplicate the expensive CreateStruct-like expressions. + val extractOnly = relatedConsumers.map(refCountInNonExtract(_, reference)).sum <= 1 shouldInline(producer, extractOnly) }) } } - private def isExtractOnly(expr: Expression, ref: Attribute): Boolean = { - def hasRefInNonExtractValue(e: Expression): Boolean = e match { - case a: Attribute => a.semanticEquals(ref) + private def refCountInNonExtract(expr: Expression, ref: Attribute): Int = { + def refCount(e: Expression): Int = e match { + case a: Attribute if a.semanticEquals(ref) => 1 // The first child of `ExtractValue` is the complex type to be extracted. - case e: ExtractValue if e.children.head.semanticEquals(ref) => false - case _ => e.children.exists(hasRefInNonExtractValue) + case e: ExtractValue if e.children.head.semanticEquals(ref) => 0 + case _ => e.children.map(refCount).sum } - !hasRefInNonExtractValue(expr) + refCount(expr) } /** diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CollapseProjectSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CollapseProjectSuite.scala index f6c3209726b..ba5c5572e24 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CollapseProjectSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CollapseProjectSuite.scala @@ -141,6 +141,17 @@ class CollapseProjectSuite extends PlanTest { .select(($"a" + ($"a" + 1)).as("add")) .analyze comparePlans(optimized2, expected2) + + // referencing `CreateStruct` only once in non-extract expression is OK. 
+ val query3 = testRelation + .select(namedStruct("a", $"a", "a_plus_1", $"a" + 1).as("struct")) + .select($"struct", $"struct".getField("a")) + .analyze + val optimized3 = Optimize.execute(query3) + val expected3 = testRelation + .select(namedStruct("a", $"a", "a_plus_1", $"a" + 1).as("struct"), $"a".as("struct.a")) + .analyze + comparePlans(optimized3, expected3) } test("preserve top-level alias metadata while collapsing projects") { --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org