[spark] branch master updated: [SPARK-36703][SQL] Remove the Sort if it is the child of RepartitionByExpression

wenchen Wed, 29 Dec 2021 01:01:30 -0800

This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new 964e091  [SPARK-36703][SQL] Remove the Sort if it is the child of 
RepartitionByExpression
964e091 is described below

commit 964e091f71a387613d41ef494c356f187c1f1ea5
Author: Yuming Wang <yumw...@ebay.com>
AuthorDate: Wed Dec 29 17:00:23 2021 +0800

    [SPARK-36703][SQL] Remove the Sort if it is the child of 
RepartitionByExpression
    
    ### What changes were proposed in this pull request?
    
    This PR removes a global `Sort` if it is the direct child of `RepartitionByExpression`.
    For example:
    ```scala
    spark.range(10).selectExpr("cast(id AS string)").sort("id").repartition($"id").explain()
    ```
    Before this PR:
    ```
    == Optimized Logical Plan ==
    RepartitionByExpression [id#2]
    +- Sort [id#2 ASC NULLS FIRST], true
       +- Project [cast(id#0L as string) AS id#2]
          +- Range (0, 10, step=1, splits=Some(2))
    ```
    After this PR:
    ```
    == Optimized Logical Plan ==
    RepartitionByExpression [id#2]
    +- Project [cast(id#0L as string) AS id#2]
       +- Range (0, 10, step=1, splits=Some(2))
    ```
    
    ### Why are the changes needed?
    
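    Remove a useless sort: `RepartitionByExpression` redistributes the rows, so the ordering
    produced by a global `Sort` directly beneath it is not preserved and the sort is wasted work.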
    
    ### Does this PR introduce _any_ user-facing change?
    
    No.
    
    ### How was this patch tested?
    
    Added a unit test to `CollapseRepartitionSuite`.
    
    Closes #33946 from wangyum/SPARK-36703.
    
    Authored-by: Yuming Wang <yumw...@ebay.com>
    Signed-off-by: Wenchen Fan <wenc...@databricks.com>
---
 .../org/apache/spark/sql/catalyst/optimizer/Optimizer.scala  | 10 +++++-----
 .../sql/catalyst/optimizer/CollapseRepartitionSuite.scala    | 12 ++++++++++++
 2 files changed, 17 insertions(+), 5 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
index 12e6888..56a2b9b 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -24,7 +24,7 @@ import 
org.apache.spark.sql.catalyst.catalog.{InMemoryCatalog, SessionCatalog}
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.aggregate._
 import org.apache.spark.sql.catalyst.plans._
-import org.apache.spark.sql.catalyst.plans.logical._
+import org.apache.spark.sql.catalyst.plans.logical.{RepartitionOperation, _}
 import org.apache.spark.sql.catalyst.rules._
 import org.apache.spark.sql.catalyst.trees.AlwaysProcess
 import org.apache.spark.sql.catalyst.trees.TreePattern._
@@ -1062,10 +1062,10 @@ object CollapseRepartition extends Rule[LogicalPlan] {
       case (false, true) => if (r.numPartitions >= child.numPartitions) child 
else r
       case _ => r.copy(child = child.child)
     }
-    // Case 2: When a RepartitionByExpression has a child of Repartition or 
RepartitionByExpression
-    // we can remove the child.
-    case r @ RepartitionByExpression(_, child: RepartitionOperation, _) =>
-      r.copy(child = child.child)
+    // Case 2: When a RepartitionByExpression has a child of global Sort, 
Repartition or
+    // RepartitionByExpression we can remove the child.
+    case r @ RepartitionByExpression(_, child @ (Sort(_, true, _) | _: 
RepartitionOperation), _) =>
+      r.withNewChildren(child.children)
   }
 }
 
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CollapseRepartitionSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CollapseRepartitionSuite.scala
index 8cc8dec..177545f 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CollapseRepartitionSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CollapseRepartitionSuite.scala
@@ -195,4 +195,16 @@ class CollapseRepartitionSuite extends PlanTest {
     comparePlans(optimized1, correctAnswer)
     comparePlans(optimized2, correctAnswer)
   }
+
+  test("SPARK-36703: Remove the global Sort if it is the child of 
RepartitionByExpression") {
+    val originalQuery1 = testRelation
+      .orderBy('a.asc, 'b.asc)
+      .distribute('a)(20)
+    comparePlans(Optimize.execute(originalQuery1.analyze), 
testRelation.distribute('a)(20).analyze)
+
+    val originalQuery2 = testRelation.distribute('a)(10)
+      .sortBy('a.asc, 'b.asc)
+      .distribute('a)(20)
+    comparePlans(Optimize.execute(originalQuery2.analyze), 
originalQuery2.analyze)
+  }
 }

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

[spark] branch master updated: [SPARK-36703][SQL] Remove the Sort if it is the child of RepartitionByExpression

Reply via email to