This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-3.3
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.3 by this push:
     new 052d60c28a8 [SPARK-40228][SQL][3.3] Do not simplify multiLike if child 
is not a cheap expression
052d60c28a8 is described below

commit 052d60c28a8fd0e4e33051aa0682d3df4d979ae8
Author: Yuming Wang <yumw...@ebay.com>
AuthorDate: Fri Sep 9 16:48:34 2022 -0700

    [SPARK-40228][SQL][3.3] Do not simplify multiLike if child is not a cheap 
expression
    
    This PR backports https://github.com/apache/spark/pull/37672 to branch-3.3.
    
    The original PR's description:
    
    ### What changes were proposed in this pull request?
    
    Do not simplify multiLike if child is not a cheap expression.
    
    ### Why are the changes needed?
    
    1. Simplifying multiLike in these cases cannot benefit the query because the 
simplified filter cannot be pushed down.
    2. Skipping the simplification reduces the number of times these expressions are evaluated.
    
    For example:
    ```sql
    select * from t1 where substr(name, 1, 5) like any('%a', 'b%', '%c%');
    ```
    ```
    == Physical Plan ==
    *(1) Filter ((EndsWith(substr(name#0, 1, 5), a) OR 
StartsWith(substr(name#0, 1, 5), b)) OR Contains(substr(name#0, 1, 5), c))
       +- *(1) ColumnarToRow
          +- FileScan parquet default.t1[name#0] Batched: true, DataFilters: 
[((EndsWith(substr(name#0, 1, 5), a) OR StartsWith(substr(name#0, 1, 5), b)) OR 
Contains(substr(n..., Format: Parquet, PartitionFilters: [], PushedFilters: [], 
ReadSchema: struct<name:string>
    ```
    
    ### Does this PR introduce _any_ user-facing change?
    
    No.
    
    ### How was this patch tested?
    
    Unit test.
    
    Closes #37813 from wangyum/SPARK-40228-branch-3.3.
    
    Authored-by: Yuming Wang <yumw...@ebay.com>
    Signed-off-by: Dongjoon Hyun <dongj...@apache.org>
---
 .../org/apache/spark/sql/catalyst/optimizer/Optimizer.scala | 13 +++++++++++++
 .../apache/spark/sql/catalyst/optimizer/expressions.scala   | 12 ++++++++----
 .../sql/catalyst/optimizer/LikeSimplificationSuite.scala    | 13 +++++++++++++
 3 files changed, 34 insertions(+), 4 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
index 3f756ea459c..9794a310b6d 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -1075,6 +1075,19 @@ object CollapseProject extends Rule[LogicalPlan] with 
AliasHelper {
     case _ => false
   }
 
+  /**
+   * Check if the given expression is cheap that we can inline it.
+   */
+  def isCheap(e: Expression): Boolean = e match {
+    case _: Attribute | _: OuterReference => true
+    case _ if e.foldable => true
+    // PythonUDF is handled by the rule ExtractPythonUDFs
+    case _: PythonUDF => true
+    // Alias and ExtractValue are very cheap.
+    case _: Alias | _: ExtractValue => e.children.forall(isCheap)
+    case _ => false
+  }
+
   /**
    * Return all the references of the given expression without deduplication, 
which is different
    * from `Expression.references`.
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala
index 158734597f7..a3d826aff51 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala
@@ -773,10 +773,14 @@ object LikeSimplification extends Rule[LogicalPlan] {
       } else {
         simplifyLike(input, pattern.toString, escapeChar).getOrElse(l)
       }
-    case l @ LikeAll(child, patterns) => simplifyMultiLike(child, patterns, l)
-    case l @ NotLikeAll(child, patterns) => simplifyMultiLike(child, patterns, 
l)
-    case l @ LikeAny(child, patterns) => simplifyMultiLike(child, patterns, l)
-    case l @ NotLikeAny(child, patterns) => simplifyMultiLike(child, patterns, 
l)
+    case l @ LikeAll(child, patterns) if CollapseProject.isCheap(child) =>
+      simplifyMultiLike(child, patterns, l)
+    case l @ NotLikeAll(child, patterns) if CollapseProject.isCheap(child) =>
+      simplifyMultiLike(child, patterns, l)
+    case l @ LikeAny(child, patterns) if CollapseProject.isCheap(child) =>
+      simplifyMultiLike(child, patterns, l)
+    case l @ NotLikeAny(child, patterns) if CollapseProject.isCheap(child) =>
+      simplifyMultiLike(child, patterns, l)
   }
 }
 
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LikeSimplificationSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LikeSimplificationSuite.scala
index c06c92f9c15..2d3be86fa28 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LikeSimplificationSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LikeSimplificationSuite.scala
@@ -24,6 +24,7 @@ import org.apache.spark.sql.catalyst.plans.PlanTest
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.rules._
 import org.apache.spark.sql.types.{BooleanType, StringType}
+import org.apache.spark.unsafe.types.UTF8String
 
 class LikeSimplificationSuite extends PlanTest {
 
@@ -232,4 +233,16 @@ class LikeSimplificationSuite extends PlanTest {
 
     comparePlans(optimized, correctAnswer)
   }
+
+  test("SPARK-40228: Simplify multiLike if child is foldable expression") {
+    comparePlans(Optimize.execute(testRelation.where("a" likeAny("abc%", "", 
"ab")).analyze),
+      testRelation.where(StartsWith("a", "abc") || EqualTo("a", "") || 
EqualTo("a", "ab") ||
+        LikeAny("a", Seq.empty[UTF8String])).analyze)
+  }
+
+  test("SPARK-40228: Do not simplify multiLike if child is not a cheap 
expression") {
+    val originalQuery = testRelation.where($"a".substring(1, 5) 
likeAny("abc%", "", "ab")).analyze
+
+    comparePlans(Optimize.execute(originalQuery), originalQuery)
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to