[spark] branch master updated: [SPARK-36718][SQL][FOLLOWUP] Fix the `isExtractOnly` check in CollapseProject

dongjoon Wed, 11 May 2022 21:59:31 -0700

This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new 547f032d04b [SPARK-36718][SQL][FOLLOWUP] Fix the `isExtractOnly` check 
in CollapseProject
547f032d04b is described below

commit 547f032d04bd2cf06c54b5a4a2f984f5166beb7d
Author: Wenchen Fan <cloud0...@gmail.com>
AuthorDate: Wed May 11 21:58:14 2022 -0700

    [SPARK-36718][SQL][FOLLOWUP] Fix the `isExtractOnly` check in 
CollapseProject
    
    ### What changes were proposed in this pull request?
    
    This PR fixes a perf regression in Spark 3.3 caused by 
https://github.com/apache/spark/pull/33958
    
    In `CollapseProject`, we want to treat `CreateStruct` and its friends as 
cheap expressions if they are only referenced by `ExtractValue`, but the check 
is too conservative, which causes a perf regression. This PR fixes this check. 
Now "extract-only" means: the attribute only appears as a child of 
`ExtractValue`, but the consumer expression can be in any shape.
    
    ### Why are the changes needed?
    
    Fixes perf regression
    
    ### Does this PR introduce _any_ user-facing change?
    
    No
    
    ### How was this patch tested?
    
    new tests
    
    Closes #36510 from cloud-fan/bug.
    
    Lead-authored-by: Wenchen Fan <cloud0...@gmail.com>
    Co-authored-by: Wenchen Fan <wenc...@databricks.com>
    Signed-off-by: Dongjoon Hyun <dongj...@apache.org>
---
 .../spark/sql/catalyst/optimizer/Optimizer.scala       | 14 ++++++++------
 .../sql/catalyst/optimizer/CollapseProjectSuite.scala  | 18 +++++++++++++++---
 2 files changed, 23 insertions(+), 9 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
index 3a36e506f4e..9215609f154 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -1014,12 +1014,14 @@ object CollapseProject extends Rule[LogicalPlan] with 
AliasHelper {
       }
   }
 
-  @scala.annotation.tailrec
-  private def isExtractOnly(expr: Expression, ref: Attribute): Boolean = expr 
match {
-    case a: Alias => isExtractOnly(a.child, ref)
-    case e: ExtractValue => isExtractOnly(e.children.head, ref)
-    case a: Attribute => a.semanticEquals(ref)
-    case _ => false
+  private def isExtractOnly(expr: Expression, ref: Attribute): Boolean = {
+    def hasRefInNonExtractValue(e: Expression): Boolean = e match {
+      case a: Attribute => a.semanticEquals(ref)
+      // The first child of `ExtractValue` is the complex type to be extracted.
+      case e: ExtractValue if e.children.head.semanticEquals(ref) => false
+      case _ => e.children.exists(hasRefInNonExtractValue)
+    }
+    !hasRefInNonExtractValue(expr)
   }
 
   /**
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CollapseProjectSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CollapseProjectSuite.scala
index 93646b6f1bc..dd075837d51 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CollapseProjectSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CollapseProjectSuite.scala
@@ -30,7 +30,8 @@ class CollapseProjectSuite extends PlanTest {
   object Optimize extends RuleExecutor[LogicalPlan] {
     val batches =
       Batch("Subqueries", FixedPoint(10), EliminateSubqueryAliases) ::
-      Batch("CollapseProject", Once, CollapseProject) :: Nil
+      Batch("CollapseProject", Once, CollapseProject) ::
+      Batch("SimplifyExtractValueOps", Once, SimplifyExtractValueOps) :: Nil
   }
 
   val testRelation = LocalRelation($"a".int, $"b".int)
@@ -125,12 +126,23 @@ class CollapseProjectSuite extends PlanTest {
 
   test("SPARK-36718: do not collapse project if non-cheap expressions will be 
repeated") {
     val query = testRelation
-      .select(($"a" + 1).as(Symbol("a_plus_1")))
-      .select(($"a_plus_1" + $"a_plus_1").as(Symbol("a_2_plus_2")))
+      .select(($"a" + 1).as("a_plus_1"))
+      .select(($"a_plus_1" + $"a_plus_1").as("a_2_plus_2"))
       .analyze
 
     val optimized = Optimize.execute(query)
     comparePlans(optimized, query)
+
+    // CreateStruct is an exception if it's only referenced by ExtractValue.
+    val query2 = testRelation
+      .select(namedStruct("a", $"a", "a_plus_1", $"a" + 1).as("struct"))
+      .select(($"struct".getField("a") + 
$"struct".getField("a_plus_1")).as("add"))
+      .analyze
+    val optimized2 = Optimize.execute(query2)
+    val expected2 = testRelation
+      .select(($"a" + ($"a" + 1)).as("add"))
+      .analyze
+    comparePlans(optimized2, expected2)
   }
 
   test("preserve top-level alias metadata while collapsing projects") {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

[spark] branch master updated: [SPARK-36718][SQL][FOLLOWUP] Fix the `isExtractOnly` check in CollapseProject

Reply via email to