[spark] branch master updated: [SPARK-41637][SQL] ORDER BY ALL

wenchen Thu, 22 Dec 2022 20:53:01 -0800

This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new 6d77aa54d04 [SPARK-41637][SQL] ORDER BY ALL
6d77aa54d04 is described below

commit 6d77aa54d04a8f8ee7533b146bc24f5fa0518f7d
Author: Reynold Xin <r...@databricks.com>
AuthorDate: Fri Dec 23 12:52:36 2022 +0800

    [SPARK-41637][SQL] ORDER BY ALL
    
    ### What changes were proposed in this pull request?
    This patch adds ORDER BY ALL support to SQL. ORDER BY ALL is a syntactic 
sugar to sort the output by all the fields, from left to right. It also allows 
specifying asc/desc as well as null ordering.
    
    ### Why are the changes needed?
    It's a good convenience sugar added initially by DuckDB to avoid repeating 
the fields.
    
    ### Does this PR introduce _any_ user-facing change?
    Yes. See above.
    
    ### How was this patch tested?
    Added SQL tests.
    
    Closes #39144 from rxin/orderbyall.
    
    Authored-by: Reynold Xin <r...@databricks.com>
    Signed-off-by: Wenchen Fan <wenc...@databricks.com>
---
 .../spark/sql/catalyst/analysis/Analyzer.scala     |   1 +
 .../sql/catalyst/analysis/ResolveOrderByAll.scala  |  81 +++++++++
 .../sql/catalyst/rules/RuleIdCollection.scala      |   1 +
 .../resources/sql-tests/inputs/order-by-all.sql    |  44 +++++
 .../sql-tests/results/order-by-all.sql.out         | 202 +++++++++++++++++++++
 5 files changed, 329 insertions(+)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index ea3b81722af..ccaf07a4d52 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -297,6 +297,7 @@ class Analyzer(override val catalogManager: CatalogManager)
       ResolveGroupingAnalytics ::
       ResolvePivot ::
       ResolveUnpivot ::
+      ResolveOrderByAll ::
       ResolveGroupByAll ::
       ResolveOrdinalInOrderByAndGroupBy ::
       ResolveAggAliasInGroupBy ::
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveOrderByAll.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveOrderByAll.scala
new file mode 100644
index 00000000000..7cf584dadcf
--- /dev/null
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveOrderByAll.scala
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.analysis
+
+import org.apache.spark.sql.catalyst.expressions.SortOrder
+import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Sort}
+import org.apache.spark.sql.catalyst.rules.Rule
+import org.apache.spark.sql.catalyst.trees.TreePattern.{SORT, 
UNRESOLVED_ATTRIBUTE}
+
+/**
+ * Resolve "order by all" in the following SQL pattern:
+ *  `select col1, col2 from table order by all`.
+ *
+ * It orders the query result by all columns, from left to right. The query 
above becomes:
+ *
+ *  `select col1, col2 from table order by col1, col2`
+ *
+ * This should also support specifying asc/desc, and nulls first/last.
+ */
+object ResolveOrderByAll extends Rule[LogicalPlan] {
+
+  val ALL = "ALL"
+
+  /**
+   * An extractor to pull out the SortOrder field in the ORDER BY ALL clause. 
We pull out that
+   * SortOrder object so we can pass its direction and null ordering.
+   */
+  object OrderByAll {
+    def unapply(s: Sort): Option[SortOrder] = {
+      // This only applies to global ordering.
+      if (!s.global) {
+        return None
+      }
+      // Don't do this if we have more than one order field. That means it's 
not order by all.
+      if (s.order.size != 1) {
+        return None
+      }
+      // Don't do this if there's a child field called ALL. That should take 
precedence.
+      if (s.child.output.exists(_.name.toUpperCase() == ALL)) {
+        return None
+      }
+
+      s.order.find { so =>
+        so.child match {
+          case a: UnresolvedAttribute => a.name.toUpperCase() == ALL
+          case _ => false
+        }
+      }
+    }
+  }
+
+  override def apply(plan: LogicalPlan): LogicalPlan = 
plan.resolveOperatorsUpWithPruning(
+    _.containsAllPatterns(UNRESOLVED_ATTRIBUTE, SORT), ruleId) {
+    // This only makes sense if the child is resolved.
+    case s: Sort if s.child.resolved =>
+      s match {
+        case OrderByAll(sortOrder) =>
+          // Replace a single order by all with N fields, where N = child's 
output, while
+          // retaining the same asc/desc and nulls ordering.
+          val order = s.child.output.map(a => sortOrder.copy(child = a))
+          s.copy(order = order)
+        case _ =>
+          s
+      }
+  }
+}
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleIdCollection.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleIdCollection.scala
index 32550b8552d..41aa68f0ec6 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleIdCollection.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleIdCollection.scala
@@ -91,6 +91,7 @@ object RuleIdCollection {
       "org.apache.spark.sql.catalyst.analysis.ResolveInlineTables" ::
       "org.apache.spark.sql.catalyst.analysis.ResolveLambdaVariables" ::
       
"org.apache.spark.sql.catalyst.analysis.ResolveLateralColumnAliasReference" ::
+      "org.apache.spark.sql.catalyst.analysis.ResolveOrderByAll" ::
       "org.apache.spark.sql.catalyst.analysis.ResolveTimeZone" ::
       "org.apache.spark.sql.catalyst.analysis.ResolveUnion" ::
       "org.apache.spark.sql.catalyst.analysis.SubstituteUnresolvedOrdinals" ::
diff --git a/sql/core/src/test/resources/sql-tests/inputs/order-by-all.sql 
b/sql/core/src/test/resources/sql-tests/inputs/order-by-all.sql
new file mode 100644
index 00000000000..37dfb1d3499
--- /dev/null
+++ b/sql/core/src/test/resources/sql-tests/inputs/order-by-all.sql
@@ -0,0 +1,44 @@
+create temporary view data as select * from values
+  (0, 1),
+  (0, 2),
+  (1, 3),
+  (1, NULL)
+  as data(g, i);
+
+-- most basic test with only 1 column
+select g from data order by all;
+
+-- two columns
+select * from data order by all;
+
+-- test case insensitive
+select * from data order by aLl;
+
+-- asc/desc
+select * from data order by all asc;
+select * from data order by all desc;
+
+-- nulls first / last
+select * from data order by all nulls first;
+select * from data order by all nulls last;
+
+-- combining nulls first/last/asc/desc
+select * from data order by all asc nulls first;
+select * from data order by all desc nulls first;
+select * from data order by all asc nulls last;
+select * from data order by all desc nulls last;
+
+-- set operations from duckdb
+select * from data union all select * from data order by all;
+select * from data union select * from data order by all;
+
+-- limit
+select * from data order by all limit 2;
+
+-- precedence: if there's a column already named all, reference that, instead 
of expanding.
+-- result should be 1, 2, 3, and not 3, 2, 1
+select * from values("z", 1), ("y", 2), ("x", 3) AS T(col1, all) order by all;
+
+-- shouldn't work in window functions
+select name, dept, rank() over (partition by dept order by all) as rank
+from values('Lisa', 'Sales', 10000, 35) as T(name, dept, salary, age);
diff --git a/sql/core/src/test/resources/sql-tests/results/order-by-all.sql.out 
b/sql/core/src/test/resources/sql-tests/results/order-by-all.sql.out
new file mode 100644
index 00000000000..7612b3c30d4
--- /dev/null
+++ b/sql/core/src/test/resources/sql-tests/results/order-by-all.sql.out
@@ -0,0 +1,202 @@
+-- Automatically generated by SQLQueryTestSuite
+-- !query
+create temporary view data as select * from values
+  (0, 1),
+  (0, 2),
+  (1, 3),
+  (1, NULL)
+  as data(g, i)
+-- !query schema
+struct<>
+-- !query output
+
+
+
+-- !query
+select g from data order by all
+-- !query schema
+struct<g:int>
+-- !query output
+0
+0
+1
+1
+
+
+-- !query
+select * from data order by all
+-- !query schema
+struct<g:int,i:int>
+-- !query output
+0      1
+0      2
+1      NULL
+1      3
+
+
+-- !query
+select * from data order by aLl
+-- !query schema
+struct<g:int,i:int>
+-- !query output
+0      1
+0      2
+1      NULL
+1      3
+
+
+-- !query
+select * from data order by all asc
+-- !query schema
+struct<g:int,i:int>
+-- !query output
+0      1
+0      2
+1      NULL
+1      3
+
+
+-- !query
+select * from data order by all desc
+-- !query schema
+struct<g:int,i:int>
+-- !query output
+1      3
+1      NULL
+0      2
+0      1
+
+
+-- !query
+select * from data order by all nulls first
+-- !query schema
+struct<g:int,i:int>
+-- !query output
+0      1
+0      2
+1      NULL
+1      3
+
+
+-- !query
+select * from data order by all nulls last
+-- !query schema
+struct<g:int,i:int>
+-- !query output
+0      1
+0      2
+1      3
+1      NULL
+
+
+-- !query
+select * from data order by all asc nulls first
+-- !query schema
+struct<g:int,i:int>
+-- !query output
+0      1
+0      2
+1      NULL
+1      3
+
+
+-- !query
+select * from data order by all desc nulls first
+-- !query schema
+struct<g:int,i:int>
+-- !query output
+1      NULL
+1      3
+0      2
+0      1
+
+
+-- !query
+select * from data order by all asc nulls last
+-- !query schema
+struct<g:int,i:int>
+-- !query output
+0      1
+0      2
+1      3
+1      NULL
+
+
+-- !query
+select * from data order by all desc nulls last
+-- !query schema
+struct<g:int,i:int>
+-- !query output
+1      3
+1      NULL
+0      2
+0      1
+
+
+-- !query
+select * from data union all select * from data order by all
+-- !query schema
+struct<g:int,i:int>
+-- !query output
+0      1
+0      1
+0      2
+0      2
+1      NULL
+1      NULL
+1      3
+1      3
+
+
+-- !query
+select * from data union select * from data order by all
+-- !query schema
+struct<g:int,i:int>
+-- !query output
+0      1
+0      2
+1      NULL
+1      3
+
+
+-- !query
+select * from data order by all limit 2
+-- !query schema
+struct<g:int,i:int>
+-- !query output
+0      1
+0      2
+
+
+-- !query
+select * from values("z", 1), ("y", 2), ("x", 3) AS T(col1, all) order by all
+-- !query schema
+struct<col1:string,all:int>
+-- !query output
+z      1
+y      2
+x      3
+
+
+-- !query
+select name, dept, rank() over (partition by dept order by all) as rank
+from values('Lisa', 'Sales', 10000, 35) as T(name, dept, salary, age)
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+{
+  "errorClass" : "UNRESOLVED_COLUMN.WITH_SUGGESTION",
+  "sqlState" : "42000",
+  "messageParameters" : {
+    "objectName" : "`all`",
+    "proposal" : "`T`.`age`, `T`.`name`, `T`.`dept`, `T`.`salary`"
+  },
+  "queryContext" : [ {
+    "objectType" : "",
+    "objectName" : "",
+    "startIndex" : 60,
+    "stopIndex" : 62,
+    "fragment" : "all"
+  } ]
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

[spark] branch master updated: [SPARK-41637][SQL] ORDER BY ALL

Reply via email to