This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch branch-3.3 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.3 by this push: new 052d60c28a8 [SPARK-40228][SQL][3.3] Do not simplify multiLike if child is not a cheap expression 052d60c28a8 is described below commit 052d60c28a8fd0e4e33051aa0682d3df4d979ae8 Author: Yuming Wang <yumw...@ebay.com> AuthorDate: Fri Sep 9 16:48:34 2022 -0700 [SPARK-40228][SQL][3.3] Do not simplify multiLike if child is not a cheap expression This PR backports https://github.com/apache/spark/pull/37672 to branch-3.3. The original PR's description: ### What changes were proposed in this pull request? Do not simplify multiLike if the child is not a cheap expression. ### Why are the changes needed? 1. Simplifying multiLike in these cases cannot benefit the query because it cannot be pushed down. 2. Skipping the simplification reduces the number of evaluations of these expressions. For example, with the simplification applied, `substr(name, 1, 5)` appears three times in the filter and nothing is pushed down: ```sql select * from t1 where substr(name, 1, 5) like any('%a', 'b%', '%c%'); ``` ``` == Physical Plan == *(1) Filter ((EndsWith(substr(name#0, 1, 5), a) OR StartsWith(substr(name#0, 1, 5), b)) OR Contains(substr(name#0, 1, 5), c)) +- *(1) ColumnarToRow +- FileScan parquet default.t1[name#0] Batched: true, DataFilters: [((EndsWith(substr(name#0, 1, 5), a) OR StartsWith(substr(name#0, 1, 5), b)) OR Contains(substr(n..., Format: Parquet, PartitionFilters: [], PushedFilters: [], ReadSchema: struct<name:string> ``` ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Unit test. Closes #37813 from wangyum/SPARK-40228-branch-3.3. 
Authored-by: Yuming Wang <yumw...@ebay.com> Signed-off-by: Dongjoon Hyun <dongj...@apache.org> --- .../org/apache/spark/sql/catalyst/optimizer/Optimizer.scala | 13 +++++++++++++ .../apache/spark/sql/catalyst/optimizer/expressions.scala | 12 ++++++++---- .../sql/catalyst/optimizer/LikeSimplificationSuite.scala | 13 +++++++++++++ 3 files changed, 34 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 3f756ea459c..9794a310b6d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -1075,6 +1075,19 @@ object CollapseProject extends Rule[LogicalPlan] with AliasHelper { case _ => false } + /** + * Check if the given expression is cheap that we can inline it. + */ + def isCheap(e: Expression): Boolean = e match { + case _: Attribute | _: OuterReference => true + case _ if e.foldable => true + // PythonUDF is handled by the rule ExtractPythonUDFs + case _: PythonUDF => true + // Alias and ExtractValue are very cheap. + case _: Alias | _: ExtractValue => e.children.forall(isCheap) + case _ => false + } + /** * Return all the references of the given expression without deduplication, which is different * from `Expression.references`. 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala index 158734597f7..a3d826aff51 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala @@ -773,10 +773,14 @@ object LikeSimplification extends Rule[LogicalPlan] { } else { simplifyLike(input, pattern.toString, escapeChar).getOrElse(l) } - case l @ LikeAll(child, patterns) => simplifyMultiLike(child, patterns, l) - case l @ NotLikeAll(child, patterns) => simplifyMultiLike(child, patterns, l) - case l @ LikeAny(child, patterns) => simplifyMultiLike(child, patterns, l) - case l @ NotLikeAny(child, patterns) => simplifyMultiLike(child, patterns, l) + case l @ LikeAll(child, patterns) if CollapseProject.isCheap(child) => + simplifyMultiLike(child, patterns, l) + case l @ NotLikeAll(child, patterns) if CollapseProject.isCheap(child) => + simplifyMultiLike(child, patterns, l) + case l @ LikeAny(child, patterns) if CollapseProject.isCheap(child) => + simplifyMultiLike(child, patterns, l) + case l @ NotLikeAny(child, patterns) if CollapseProject.isCheap(child) => + simplifyMultiLike(child, patterns, l) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LikeSimplificationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LikeSimplificationSuite.scala index c06c92f9c15..2d3be86fa28 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LikeSimplificationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LikeSimplificationSuite.scala @@ -24,6 +24,7 @@ import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules._ import 
org.apache.spark.sql.types.{BooleanType, StringType} +import org.apache.spark.unsafe.types.UTF8String class LikeSimplificationSuite extends PlanTest { @@ -232,4 +233,16 @@ class LikeSimplificationSuite extends PlanTest { comparePlans(optimized, correctAnswer) } + + test("SPARK-40228: Simplify multiLike if child is foldable expression") { + comparePlans(Optimize.execute(testRelation.where("a" likeAny("abc%", "", "ab")).analyze), + testRelation.where(StartsWith("a", "abc") || EqualTo("a", "") || EqualTo("a", "ab") || + LikeAny("a", Seq.empty[UTF8String])).analyze) + } + + test("SPARK-40228: Do not simplify multiLike if child is not a cheap expression") { + val originalQuery = testRelation.where($"a".substring(1, 5) likeAny("abc%", "", "ab")).analyze + + comparePlans(Optimize.execute(originalQuery), originalQuery) + } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org