This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new d3237b22b5 fix: schema error when parsing order-by expressions (#10234)
d3237b22b5 is described below
commit d3237b22b50c3217ba572aa93aa905e65398728c
Author: Jonah Gao <[email protected]>
AuthorDate: Wed May 1 19:15:24 2024 +0800
fix: schema error when parsing order-by expressions (#10234)
* fix: schema error when parsing order-by expressions
* add test from issue
* improve order_by_to_sort_expr
* add test
* Update datafusion/sqllogictest/test_files/order.slt
Co-authored-by: Andrew Lamb <[email protected]>
* fix tests
* add test
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
datafusion/sql/src/expr/function.rs | 19 +++++--
datafusion/sql/src/expr/mod.rs | 1 +
datafusion/sql/src/expr/order_by.rs | 43 ++++++++++++---
datafusion/sql/src/query.rs | 79 +++++++++++++++++----------
datafusion/sql/src/select.rs | 24 ++++++--
datafusion/sql/src/set_expr.rs | 2 +-
datafusion/sql/src/statement.rs | 2 +-
datafusion/sql/tests/sql_integration.rs | 2 +-
datafusion/sqllogictest/test_files/order.slt | 67 ++++++++++++++++++++++-
datafusion/sqllogictest/test_files/select.slt | 2 +-
10 files changed, 191 insertions(+), 50 deletions(-)
diff --git a/datafusion/sql/src/expr/function.rs
b/datafusion/sql/src/expr/function.rs
index 68cba15634..3adf296078 100644
--- a/datafusion/sql/src/expr/function.rs
+++ b/datafusion/sql/src/expr/function.rs
@@ -150,6 +150,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
planner_context,
// Numeric literals in window function ORDER BY are treated as
constants
false,
+ None,
)?;
let func_deps = schema.functional_dependencies();
@@ -219,8 +220,13 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
} else {
// User defined aggregate functions (UDAF) have precedence in case
it has the same name as a scalar built-in function
if let Some(fm) = self.context_provider.get_aggregate_meta(&name) {
- let order_by =
- self.order_by_to_sort_expr(&order_by, schema,
planner_context, true)?;
+ let order_by = self.order_by_to_sort_expr(
+ &order_by,
+ schema,
+ planner_context,
+ true,
+ None,
+ )?;
let order_by = (!order_by.is_empty()).then_some(order_by);
let args = self.function_args_to_expr(args, schema,
planner_context)?;
// TODO: Support filter and distinct for UDAFs
@@ -236,8 +242,13 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
// next, aggregate built-ins
if let Ok(fun) = AggregateFunction::from_str(&name) {
- let order_by =
- self.order_by_to_sort_expr(&order_by, schema,
planner_context, true)?;
+ let order_by = self.order_by_to_sort_expr(
+ &order_by,
+ schema,
+ planner_context,
+ true,
+ None,
+ )?;
let order_by = (!order_by.is_empty()).then_some(order_by);
let args = self.function_args_to_expr(args, schema,
planner_context)?;
let filter: Option<Box<Expr>> = filter
diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs
index 13f559a0eb..ed5421edfb 100644
--- a/datafusion/sql/src/expr/mod.rs
+++ b/datafusion/sql/src/expr/mod.rs
@@ -699,6 +699,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
input_schema,
planner_context,
true,
+ None,
)?)
} else {
None
diff --git a/datafusion/sql/src/expr/order_by.rs
b/datafusion/sql/src/expr/order_by.rs
index 4ccdf6c2d4..4dd81517e9 100644
--- a/datafusion/sql/src/expr/order_by.rs
+++ b/datafusion/sql/src/expr/order_by.rs
@@ -24,16 +24,39 @@ use sqlparser::ast::{Expr as SQLExpr, OrderByExpr, Value};
impl<'a, S: ContextProvider> SqlToRel<'a, S> {
/// Convert sql [OrderByExpr] to `Vec<Expr>`.
///
- /// If `literal_to_column` is true, treat any numeric literals (e.g. `2`)
as a 1 based index
- /// into the SELECT list (e.g. `SELECT a, b FROM table ORDER BY 2`).
+ /// `input_schema` and `additional_schema` are used to resolve column
references in the order-by expressions.
+ /// `input_schema` is the schema of the input logical plan, typically
derived from the SELECT list.
+ ///
+ /// Usually order-by expressions can only reference the input plan's
columns.
+ /// But the `SELECT ... FROM ... ORDER BY ...` syntax is a special case.
Besides the input schema,
+ /// it can reference an `additional_schema` derived from the `FROM` clause.
+ ///
+ /// If `literal_to_column` is true, treat any numeric literals (e.g. `2`)
as a 1 based index into the
+ /// SELECT list (e.g. `SELECT a, b FROM table ORDER BY 2`). Literals only
reference the `input_schema`.
+ ///
/// If false, interpret numeric literals as constant values.
pub(crate) fn order_by_to_sort_expr(
&self,
exprs: &[OrderByExpr],
- schema: &DFSchema,
+ input_schema: &DFSchema,
planner_context: &mut PlannerContext,
literal_to_column: bool,
+ additional_schema: Option<&DFSchema>,
) -> Result<Vec<Expr>> {
+ if exprs.is_empty() {
+ return Ok(vec![]);
+ }
+
+ let mut combined_schema;
+ let order_by_schema = match additional_schema {
+ Some(schema) => {
+ combined_schema = input_schema.clone();
+ combined_schema.merge(schema);
+ &combined_schema
+ }
+ None => input_schema,
+ };
+
let mut expr_vec = vec![];
for e in exprs {
let OrderByExpr {
@@ -52,17 +75,23 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
return plan_err!(
"Order by index starts at 1 for column indexes"
);
- } else if schema.fields().len() < field_index {
+ } else if input_schema.fields().len() < field_index {
return plan_err!(
"Order by column out of bounds, specified: {},
max: {}",
field_index,
- schema.fields().len()
+ input_schema.fields().len()
);
}
-
Expr::Column(Column::from(schema.qualified_field(field_index - 1)))
+ Expr::Column(Column::from(
+ input_schema.qualified_field(field_index - 1),
+ ))
}
- e => self.sql_expr_to_logical_expr(e.clone(), schema,
planner_context)?,
+ e => self.sql_expr_to_logical_expr(
+ e.clone(),
+ order_by_schema,
+ planner_context,
+ )?,
};
let asc = asc.unwrap_or(true);
expr_vec.push(Expr::Sort(Sort::new(
diff --git a/datafusion/sql/src/query.rs b/datafusion/sql/src/query.rs
index d5d3bcc4a1..fdc739646a 100644
--- a/datafusion/sql/src/query.rs
+++ b/datafusion/sql/src/query.rs
@@ -25,7 +25,7 @@ use datafusion_expr::{
Operator,
};
use sqlparser::ast::{
- Expr as SQLExpr, Offset as SQLOffset, OrderByExpr, Query, SetExpr, Value,
+ Expr as SQLExpr, Offset as SQLOffset, Query, SelectInto, SetExpr, Value,
};
impl<'a, S: ContextProvider> SqlToRel<'a, S> {
@@ -46,29 +46,35 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
query: Query,
planner_context: &mut PlannerContext,
) -> Result<LogicalPlan> {
- let mut set_expr = query.body;
if let Some(with) = query.with {
self.plan_with_clause(with, planner_context)?;
}
- // Take the `SelectInto` for later processing.
- let select_into = match set_expr.as_mut() {
- SetExpr::Select(select) => select.into.take(),
- _ => None,
- };
- let plan = self.set_expr_to_plan(*set_expr, planner_context)?;
- let plan = self.order_by(plan, query.order_by, planner_context)?;
- let mut plan = self.limit(plan, query.offset, query.limit)?;
- if let Some(into) = select_into {
- plan =
LogicalPlan::Ddl(DdlStatement::CreateMemoryTable(CreateMemoryTable {
- name: self.object_name_to_table_reference(into.name)?,
- constraints: Constraints::empty(),
- input: Arc::new(plan),
- if_not_exists: false,
- or_replace: false,
- column_defaults: vec![],
- }))
+
+ let set_expr = *query.body;
+ match set_expr {
+ SetExpr::Select(mut select) => {
+ let select_into = select.into.take();
+ // Order-by expressions may refer to columns in the `FROM`
clause,
+ // so we need to process `SELECT` and `ORDER BY` together.
+ let plan =
+ self.select_to_plan(*select, query.order_by,
planner_context)?;
+ let plan = self.limit(plan, query.offset, query.limit)?;
+ // Process the `SELECT INTO` after `LIMIT`.
+ self.select_into(plan, select_into)
+ }
+ other => {
+ let plan = self.set_expr_to_plan(other, planner_context)?;
+ let order_by_rex = self.order_by_to_sort_expr(
+ &query.order_by,
+ plan.schema(),
+ planner_context,
+ true,
+ None,
+ )?;
+ let plan = self.order_by(plan, order_by_rex)?;
+ self.limit(plan, query.offset, query.limit)
+ }
}
- Ok(plan)
}
/// Wrap a plan in a limit
@@ -114,26 +120,43 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
}
/// Wrap the logical in a sort
- fn order_by(
+ pub(super) fn order_by(
&self,
plan: LogicalPlan,
- order_by: Vec<OrderByExpr>,
- planner_context: &mut PlannerContext,
+ order_by: Vec<Expr>,
) -> Result<LogicalPlan> {
if order_by.is_empty() {
return Ok(plan);
}
- let order_by_rex =
- self.order_by_to_sort_expr(&order_by, plan.schema(),
planner_context, true)?;
-
if let LogicalPlan::Distinct(Distinct::On(ref distinct_on)) = plan {
// In case of `DISTINCT ON` we must capture the sort expressions
since during the plan
// optimization we're effectively doing a `first_value`
aggregation according to them.
- let distinct_on =
distinct_on.clone().with_sort_expr(order_by_rex)?;
+ let distinct_on = distinct_on.clone().with_sort_expr(order_by)?;
Ok(LogicalPlan::Distinct(Distinct::On(distinct_on)))
} else {
- LogicalPlanBuilder::from(plan).sort(order_by_rex)?.build()
+ LogicalPlanBuilder::from(plan).sort(order_by)?.build()
+ }
+ }
+
+ /// Wrap the logical plan in a `SelectInto`
+ fn select_into(
+ &self,
+ plan: LogicalPlan,
+ select_into: Option<SelectInto>,
+ ) -> Result<LogicalPlan> {
+ match select_into {
+ Some(into) => Ok(LogicalPlan::Ddl(DdlStatement::CreateMemoryTable(
+ CreateMemoryTable {
+ name: self.object_name_to_table_reference(into.name)?,
+ constraints: Constraints::empty(),
+ input: Arc::new(plan),
+ if_not_exists: false,
+ or_replace: false,
+ column_defaults: vec![],
+ },
+ ))),
+ _ => Ok(plan),
}
}
}
diff --git a/datafusion/sql/src/select.rs b/datafusion/sql/src/select.rs
index fdcef0ef6a..730e84cd09 100644
--- a/datafusion/sql/src/select.rs
+++ b/datafusion/sql/src/select.rs
@@ -29,7 +29,7 @@ use datafusion_common::{not_impl_err, plan_err,
DataFusionError, Result};
use datafusion_common::{Column, UnnestOptions};
use datafusion_expr::expr::{Alias, Unnest};
use datafusion_expr::expr_rewriter::{
- normalize_col, normalize_col_with_schemas_and_ambiguity_check,
+ normalize_col, normalize_col_with_schemas_and_ambiguity_check,
normalize_cols,
};
use datafusion_expr::utils::{
expand_qualified_wildcard, expand_wildcard, expr_as_column_expr,
expr_to_columns,
@@ -39,8 +39,8 @@ use datafusion_expr::{
Expr, Filter, GroupingSet, LogicalPlan, LogicalPlanBuilder, Partitioning,
};
use sqlparser::ast::{
- Distinct, Expr as SQLExpr, GroupByExpr, ReplaceSelectItem,
WildcardAdditionalOptions,
- WindowType,
+ Distinct, Expr as SQLExpr, GroupByExpr, OrderByExpr, ReplaceSelectItem,
+ WildcardAdditionalOptions, WindowType,
};
use sqlparser::ast::{NamedWindowDefinition, Select, SelectItem,
TableWithJoins};
@@ -49,6 +49,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
pub(super) fn select_to_plan(
&self,
mut select: Select,
+ order_by: Vec<OrderByExpr>,
planner_context: &mut PlannerContext,
) -> Result<LogicalPlan> {
// check for unsupported syntax first
@@ -94,6 +95,17 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
let mut combined_schema = base_plan.schema().as_ref().clone();
combined_schema.merge(projected_plan.schema());
+ // Order-by expressions prioritize referencing columns from the select
list,
+ // then from the FROM clause.
+ let order_by_rex = self.order_by_to_sort_expr(
+ &order_by,
+ projected_plan.schema().as_ref(),
+ planner_context,
+ true,
+ Some(base_plan.schema().as_ref()),
+ )?;
+ let order_by_rex = normalize_cols(order_by_rex, &projected_plan)?;
+
// this alias map is resolved and looked up in both having exprs and
group by exprs
let alias_map = extract_aliases(&select_exprs);
@@ -248,9 +260,9 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
.collect::<Result<Vec<_>>>()?;
// Build the final plan
- return LogicalPlanBuilder::from(base_plan)
+ LogicalPlanBuilder::from(base_plan)
.distinct_on(on_expr, select_exprs, None)?
- .build();
+ .build()
}
}?;
@@ -274,6 +286,8 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
plan
};
+ let plan = self.order_by(plan, order_by_rex)?;
+
Ok(plan)
}
diff --git a/datafusion/sql/src/set_expr.rs b/datafusion/sql/src/set_expr.rs
index cbe41c33c7..248aad8469 100644
--- a/datafusion/sql/src/set_expr.rs
+++ b/datafusion/sql/src/set_expr.rs
@@ -27,7 +27,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
planner_context: &mut PlannerContext,
) -> Result<LogicalPlan> {
match set_expr {
- SetExpr::Select(s) => self.select_to_plan(*s, planner_context),
+ SetExpr::Select(s) => self.select_to_plan(*s, vec![],
planner_context),
SetExpr::Values(v) => self.sql_values_to_plan(v, planner_context),
SetExpr::SetOperation {
op,
diff --git a/datafusion/sql/src/statement.rs b/datafusion/sql/src/statement.rs
index c81217aa70..137bb5fb20 100644
--- a/datafusion/sql/src/statement.rs
+++ b/datafusion/sql/src/statement.rs
@@ -942,7 +942,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
for expr in order_exprs {
// Convert each OrderByExpr to a SortExpr:
let expr_vec =
- self.order_by_to_sort_expr(&expr, schema, planner_context,
true)?;
+ self.order_by_to_sort_expr(&expr, schema, planner_context,
true, None)?;
// Verify that columns of all SortExprs exist in the schema:
for expr in expr_vec.iter() {
for column in expr.to_columns()?.iter() {
diff --git a/datafusion/sql/tests/sql_integration.rs
b/datafusion/sql/tests/sql_integration.rs
index 319aa5b5fd..0af0604963 100644
--- a/datafusion/sql/tests/sql_integration.rs
+++ b/datafusion/sql/tests/sql_integration.rs
@@ -3646,7 +3646,7 @@ fn test_select_distinct_order_by() {
let sql = "SELECT distinct '1' from person order by id";
let expected =
- "Error during planning: For SELECT DISTINCT, ORDER BY expressions id
must appear in select list";
+ "Error during planning: For SELECT DISTINCT, ORDER BY expressions
person.id must appear in select list";
// It should return error.
let result = logical_plan(sql);
diff --git a/datafusion/sqllogictest/test_files/order.slt
b/datafusion/sqllogictest/test_files/order.slt
index 1030ee1b2c..0b43b6e31a 100644
--- a/datafusion/sqllogictest/test_files/order.slt
+++ b/datafusion/sqllogictest/test_files/order.slt
@@ -374,11 +374,11 @@ select * from t SORT BY time;
# distinct on a column not in the select list should not work
-statement error For SELECT DISTINCT, ORDER BY expressions time must appear in
select list
+statement error DataFusion error: Error during planning: For SELECT DISTINCT,
ORDER BY expressions t\.time must appear in select list
SELECT DISTINCT value FROM t ORDER BY time;
# distinct on an expression of a column not in the select list should not work
-statement error For SELECT DISTINCT, ORDER BY expressions time must appear in
select list
+statement error DataFusion error: Error during planning: For SELECT DISTINCT,
ORDER BY expressions t\.time must appear in select list
SELECT DISTINCT date_trunc('hour', time) FROM t ORDER BY time;
# distinct on a column that is in the select list but aliasted should work
@@ -888,6 +888,69 @@ select column2, column1 from foo ORDER BY column2 desc,
column1 desc;
[1] 4
[0] 2
+# Test issue: https://github.com/apache/datafusion/issues/10013
+# There is both a HAVING clause and an ORDER BY clause in the query.
+query I
+SELECT
+ SUM(column1)
+FROM foo
+HAVING SUM(column1) > 0
+ORDER BY SUM(column1)
+----
+16
+
+# ORDER BY with a GROUP BY clause
+query I
+SELECT SUM(column1)
+ FROM foo
+GROUP BY column2
+ORDER BY SUM(column1)
+----
+0
+2
+2
+2
+3
+3
+4
+
+# ORDER BY with a GROUP BY clause and a HAVING clause
+query I
+SELECT
+ SUM(column1)
+FROM foo
+GROUP BY column2
+HAVING SUM(column1) < 3
+ORDER BY SUM(column1)
+----
+0
+2
+2
+2
+
+# ORDER BY without a HAVING clause
+query I
+SELECT SUM(column1) FROM foo ORDER BY SUM(column1)
+----
+16
+
+# Order by unprojected aggregate expressions is not supported
+query error DataFusion error: This feature is not implemented: Physical plan
does not support logical expression AggregateFunction
+SELECT column2 FROM foo ORDER BY SUM(column1)
+
+statement ok
+create table ambiguity_test(a int, b int) as values (1,20), (2,10);
+
+# Order-by expressions prioritize referencing columns from the select list.
+query II
+select a as b, b as c2 from ambiguity_test order by b
+----
+1 20
+2 10
+
# Cleanup
statement ok
drop table foo;
+
+statement ok
+drop table ambiguity_test;
diff --git a/datafusion/sqllogictest/test_files/select.slt
b/datafusion/sqllogictest/test_files/select.slt
index 6da6fcd0d8..f91cfd6be6 100644
--- a/datafusion/sqllogictest/test_files/select.slt
+++ b/datafusion/sqllogictest/test_files/select.slt
@@ -488,7 +488,7 @@ select '1' from foo order by column1;
1
# foo distinct order by
-statement error DataFusion error: Error during planning: For SELECT DISTINCT,
ORDER BY expressions column1 must appear in select list
+statement error DataFusion error: Error during planning: For SELECT DISTINCT,
ORDER BY expressions foo\.column1 must appear in select list
select distinct '1' from foo order by column1;
# distincts for float nan
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]