cloud-fan commented on a change in pull request #26946: [SPARK-30036][SQL] Fix: REPARTITION hint does not work with order by URL: https://github.com/apache/spark/pull/26946#discussion_r361093943
########## File path: sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala ########## @@ -421,6 +421,52 @@ class PlannerSuite extends SharedSparkSession { } } + test("SPARK-30036: Romove unnecessary RoundRobinPartitioning " + + "if SortExec is followed by RoundRobinPartitioning") { + val distribution = OrderedDistribution(SortOrder(Literal(1), Ascending) :: Nil) + val partitioning = RoundRobinPartitioning(5) + assert(!partitioning.satisfies(distribution)) + + val inputPlan = SortExec(SortOrder(Literal(1), Ascending) :: Nil, + global = true, + child = ShuffleExchangeExec( + partitioning, + DummySparkPlan(outputPartitioning = partitioning))) + val outputPlan = EnsureRequirements(spark.sessionState.conf).apply(inputPlan) + assert(outputPlan.find { + case ShuffleExchangeExec(_: RoundRobinPartitioning, _, _) => true + case _ => false + }.isEmpty, + "RoundRobinPartitioning should be changed to RangePartitioning") + + val query = testData.select('key, 'value).repartition(2).sort('key.asc) + assert(query.rdd.getNumPartitions == 2) + assert(query.rdd.collectPartitions()(0).map(_.get(0)).toSeq == (1 to 50)) + } + + test("SPARK-30036: Romove unnecessary HashPartitioning " + + "if SortExec is followed by HashPartitioning") { + val distribution = OrderedDistribution(SortOrder(Literal(1), Ascending) :: Nil) + val partitioning = HashPartitioning(Literal(1) :: Nil, 5) + assert(!partitioning.satisfies(distribution)) + + val inputPlan = SortExec(SortOrder(Literal(1), Ascending) :: Nil, + global = true, + child = ShuffleExchangeExec( + partitioning, + DummySparkPlan(outputPartitioning = partitioning))) + val outputPlan = EnsureRequirements(spark.sessionState.conf).apply(inputPlan) + assert(outputPlan.find { + case ShuffleExchangeExec(_: HashPartitioning, _, _) => true + case _ => false + }.isEmpty, + "HashPartitioning should be changed to RangePartitioning") + + val query = testData.select('key, 'value).repartition(5, 'key).sort('key.asc) Review comment: I'm not 
very sure about this. `df.repartition` is a low-level API that allows users to hash-partition the data. There is also `df.repartitionByRange` for range partitioning. We shouldn't break users' expectations. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org