This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 964e091 [SPARK-36703][SQL] Remove the Sort if it is the child of RepartitionByExpression 964e091 is described below commit 964e091f71a387613d41ef494c356f187c1f1ea5 Author: Yuming Wang <yumw...@ebay.com> AuthorDate: Wed Dec 29 17:00:23 2021 +0800 [SPARK-36703][SQL] Remove the Sort if it is the child of RepartitionByExpression ### What changes were proposed in this pull request? This pr removes the `Sort` if it is the child of `RepartitionByExpression`. For example: ```sql spark.range(10).selectExpr("cast(id AS string)").sort("id").repartition($"id").explain() ``` Before this PR: ``` == Optimized Logical Plan == RepartitionByExpression [id#2] +- Sort [id#2 ASC NULLS FIRST], true +- Project [cast(id#0L as string) AS id#2] +- Range (0, 10, step=1, splits=Some(2)) ``` After this PR: ``` == Optimized Logical Plan == RepartitionByExpression [id#2] +- Project [cast(id#0L as string) AS id#2] +- Range (0, 10, step=1, splits=Some(2)) ``` ### Why are the changes needed? Remove useless sort. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Unit test. Closes #33946 from wangyum/SPARK-36703. Authored-by: Yuming Wang <yumw...@ebay.com> Signed-off-by: Wenchen Fan <wenc...@databricks.com> --- .../org/apache/spark/sql/catalyst/optimizer/Optimizer.scala | 10 +++++----- .../sql/catalyst/optimizer/CollapseRepartitionSuite.scala | 12 ++++++++++++ 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 12e6888..56a2b9b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -24,7 +24,7 @@ import org.apache.spark.sql.catalyst.catalog.{InMemoryCatalog, SessionCatalog} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.catalyst.plans._ -import org.apache.spark.sql.catalyst.plans.logical._ +import org.apache.spark.sql.catalyst.plans.logical.{RepartitionOperation, _} import org.apache.spark.sql.catalyst.rules._ import org.apache.spark.sql.catalyst.trees.AlwaysProcess import org.apache.spark.sql.catalyst.trees.TreePattern._ @@ -1062,10 +1062,10 @@ object CollapseRepartition extends Rule[LogicalPlan] { case (false, true) => if (r.numPartitions >= child.numPartitions) child else r case _ => r.copy(child = child.child) } - // Case 2: When a RepartitionByExpression has a child of Repartition or RepartitionByExpression - // we can remove the child. - case r @ RepartitionByExpression(_, child: RepartitionOperation, _) => - r.copy(child = child.child) + // Case 2: When a RepartitionByExpression has a child of global Sort, Repartition or + // RepartitionByExpression we can remove the child. + case r @ RepartitionByExpression(_, child @ (Sort(_, true, _) | _: RepartitionOperation), _) => + r.withNewChildren(child.children) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CollapseRepartitionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CollapseRepartitionSuite.scala index 8cc8dec..177545f 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CollapseRepartitionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CollapseRepartitionSuite.scala @@ -195,4 +195,16 @@ class CollapseRepartitionSuite extends PlanTest { comparePlans(optimized1, correctAnswer) comparePlans(optimized2, correctAnswer) } + + test("SPARK-36703: Remove the global Sort if it is the child of RepartitionByExpression") { + val originalQuery1 = testRelation + .orderBy('a.asc, 'b.asc) + .distribute('a)(20) + comparePlans(Optimize.execute(originalQuery1.analyze), testRelation.distribute('a)(20).analyze) + + val originalQuery2 = testRelation.distribute('a)(10) + .sortBy('a.asc, 'b.asc) + .distribute('a)(20) + comparePlans(Optimize.execute(originalQuery2.analyze), originalQuery2.analyze) + } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org