GitHub user gatorsmile commented on a diff in the pull request: https://github.com/apache/spark/pull/21049#discussion_r180963692 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala --- @@ -307,6 +309,32 @@ object RemoveRedundantProject extends Rule[LogicalPlan] { } } +/** + * Remove [[Sort]] in subqueries that do not affect the set of rows produced, only their + * order. Subqueries produce unordered sets of rows so sorting their output is unnecessary. + */ +object RemoveSubquerySorts extends Rule[LogicalPlan] { + + /** + * Removes all [[Sort]] operators from a plan that are accessible from the root operator via + * 0 or more [[Project]], [[Filter]] or [[View]] operators. + */ + private def removeTopLevelSorts(plan: LogicalPlan): LogicalPlan = { + plan match { + case Sort(_, _, child) => removeTopLevelSorts(child) + case Project(fields, child) => Project(fields, removeTopLevelSorts(child)) + case Filter(condition, child) => Filter(condition, removeTopLevelSorts(child)) + case View(tbl, output, child) => View(tbl, output, removeTopLevelSorts(child)) + case _ => plan + } + } + + def apply(plan: LogicalPlan): LogicalPlan = plan transform { + case Subquery(child) => Subquery(removeTopLevelSorts(child)) + case SubqueryAlias(name, child) => SubqueryAlias(name, removeTopLevelSorts(child)) --- End diff -- `SubqueryAlias` is not the kind of subquery this rule should match. It is just an alias for a query, table, or view, so removing a `Sort` beneath it can drop an ordering that the user explicitly requested. For example, the `orderBy` in the following Dataset sits directly under a `SubqueryAlias` and must be preserved: ```Scala Seq((1, 2, "1"), (3, 4, "3")).toDF("int", "int2", "str_sort").orderBy('int.asc).as('df1) ```
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org