This is an automated email from the ASF dual-hosted git repository.
github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 82cd7f3cdb fix: optimizer `common_sub_expression_eliminate` fails in a
window function (#17852)
82cd7f3cdb is described below
commit 82cd7f3cdb8dbe0b63b8b62f54543641598655a0
Author: Khanh Duong <[email protected]>
AuthorDate: Mon Oct 6 12:47:22 2025 +0900
fix: optimizer `common_sub_expression_eliminate` fails in a window function
(#17852)
* fix: prevent modifying output schema in `scalar_subquery_to_join`
* docs: explain why we need a projection after transforming to a join
* test: window function fails if its child subquery is rewritten to join
* fix: disable `skip_failed_rules` rule for _query has type error_ test
---
.../optimizer/src/scalar_subquery_to_join.rs | 254 +++++++++++----------
datafusion/sqllogictest/test_files/window.slt | 23 +-
2 files changed, 159 insertions(+), 118 deletions(-)
diff --git a/datafusion/optimizer/src/scalar_subquery_to_join.rs
b/datafusion/optimizer/src/scalar_subquery_to_join.rs
index 2f9a2f6bb9..48d1182527 100644
--- a/datafusion/optimizer/src/scalar_subquery_to_join.rs
+++ b/datafusion/optimizer/src/scalar_subquery_to_join.rs
@@ -37,6 +37,8 @@ use datafusion_expr::utils::conjunction;
use datafusion_expr::{expr, EmptyRelation, Expr, LogicalPlan,
LogicalPlanBuilder};
/// Optimizer rule for rewriting subquery filters to joins
+/// and places additional projection on top of the filter, to preserve
+/// original schema.
#[derive(Default, Debug)]
pub struct ScalarSubqueryToJoin {}
@@ -123,8 +125,13 @@ impl OptimizerRule for ScalarSubqueryToJoin {
return
Ok(Transformed::no(LogicalPlan::Filter(filter)));
}
}
+
+ // Preserve original schema as new Join might have more fields
than what Filter & parents expect.
+ let projection =
+
filter.input.schema().columns().into_iter().map(Expr::from);
let new_plan = LogicalPlanBuilder::from(cur_input)
.filter(rewrite_expr)?
+ .project(projection)?
.build()?;
Ok(Transformed::yes(new_plan))
}
@@ -452,18 +459,19 @@ mod tests {
plan,
@r"
Projection: customer.c_custkey [c_custkey:Int64]
- Filter: Int32(1) < __scalar_sq_1.max(orders.o_custkey) AND Int32(1)
< __scalar_sq_2.max(orders.o_custkey) [c_custkey:Int64, c_name:Utf8,
max(orders.o_custkey):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N,
max(orders.o_custkey):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N]
- Left Join: Filter: __scalar_sq_2.o_custkey = customer.c_custkey
[c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N,
o_custkey:Int64;N, __always_true:Boolean;N, max(orders.o_custkey):Int64;N,
o_custkey:Int64;N, __always_true:Boolean;N]
- Left Join: Filter: __scalar_sq_1.o_custkey = customer.c_custkey
[c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N,
o_custkey:Int64;N, __always_true:Boolean;N]
- TableScan: customer [c_custkey:Int64, c_name:Utf8]
- SubqueryAlias: __scalar_sq_1 [max(orders.o_custkey):Int64;N,
o_custkey:Int64, __always_true:Boolean]
+ Projection: customer.c_custkey, customer.c_name [c_custkey:Int64,
c_name:Utf8]
+ Filter: Int32(1) < __scalar_sq_1.max(orders.o_custkey) AND
Int32(1) < __scalar_sq_2.max(orders.o_custkey) [c_custkey:Int64, c_name:Utf8,
max(orders.o_custkey):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N,
max(orders.o_custkey):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N]
+ Left Join: Filter: __scalar_sq_2.o_custkey = customer.c_custkey
[c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N,
o_custkey:Int64;N, __always_true:Boolean;N, max(orders.o_custkey):Int64;N,
o_custkey:Int64;N, __always_true:Boolean;N]
+ Left Join: Filter: __scalar_sq_1.o_custkey =
customer.c_custkey [c_custkey:Int64, c_name:Utf8,
max(orders.o_custkey):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N]
+ TableScan: customer [c_custkey:Int64, c_name:Utf8]
+ SubqueryAlias: __scalar_sq_1 [max(orders.o_custkey):Int64;N,
o_custkey:Int64, __always_true:Boolean]
+ Projection: max(orders.o_custkey), orders.o_custkey,
__always_true [max(orders.o_custkey):Int64;N, o_custkey:Int64,
__always_true:Boolean]
+ Aggregate: groupBy=[[orders.o_custkey, Boolean(true) AS
__always_true]], aggr=[[max(orders.o_custkey)]] [o_custkey:Int64,
__always_true:Boolean, max(orders.o_custkey):Int64;N]
+ TableScan: orders [o_orderkey:Int64, o_custkey:Int64,
o_orderstatus:Utf8, o_totalprice:Float64;N]
+ SubqueryAlias: __scalar_sq_2 [max(orders.o_custkey):Int64;N,
o_custkey:Int64, __always_true:Boolean]
Projection: max(orders.o_custkey), orders.o_custkey,
__always_true [max(orders.o_custkey):Int64;N, o_custkey:Int64,
__always_true:Boolean]
Aggregate: groupBy=[[orders.o_custkey, Boolean(true) AS
__always_true]], aggr=[[max(orders.o_custkey)]] [o_custkey:Int64,
__always_true:Boolean, max(orders.o_custkey):Int64;N]
TableScan: orders [o_orderkey:Int64, o_custkey:Int64,
o_orderstatus:Utf8, o_totalprice:Float64;N]
- SubqueryAlias: __scalar_sq_2 [max(orders.o_custkey):Int64;N,
o_custkey:Int64, __always_true:Boolean]
- Projection: max(orders.o_custkey), orders.o_custkey,
__always_true [max(orders.o_custkey):Int64;N, o_custkey:Int64,
__always_true:Boolean]
- Aggregate: groupBy=[[orders.o_custkey, Boolean(true) AS
__always_true]], aggr=[[max(orders.o_custkey)]] [o_custkey:Int64,
__always_true:Boolean, max(orders.o_custkey):Int64;N]
- TableScan: orders [o_orderkey:Int64, o_custkey:Int64,
o_orderstatus:Utf8, o_totalprice:Float64;N]
"
)
}
@@ -506,19 +514,21 @@ mod tests {
plan,
@r"
Projection: customer.c_custkey [c_custkey:Int64]
- Filter: customer.c_acctbal < __scalar_sq_1.sum(orders.o_totalprice)
[c_custkey:Int64, c_name:Utf8, sum(orders.o_totalprice):Float64;N,
o_custkey:Int64;N, __always_true:Boolean;N]
- Left Join: Filter: __scalar_sq_1.o_custkey = customer.c_custkey
[c_custkey:Int64, c_name:Utf8, sum(orders.o_totalprice):Float64;N,
o_custkey:Int64;N, __always_true:Boolean;N]
- TableScan: customer [c_custkey:Int64, c_name:Utf8]
- SubqueryAlias: __scalar_sq_1
[sum(orders.o_totalprice):Float64;N, o_custkey:Int64, __always_true:Boolean]
- Projection: sum(orders.o_totalprice), orders.o_custkey,
__always_true [sum(orders.o_totalprice):Float64;N, o_custkey:Int64,
__always_true:Boolean]
- Aggregate: groupBy=[[orders.o_custkey, Boolean(true) AS
__always_true]], aggr=[[sum(orders.o_totalprice)]] [o_custkey:Int64,
__always_true:Boolean, sum(orders.o_totalprice):Float64;N]
- Filter: orders.o_totalprice <
__scalar_sq_2.sum(lineitem.l_extendedprice) [o_orderkey:Int64, o_custkey:Int64,
o_orderstatus:Utf8, o_totalprice:Float64;N,
sum(lineitem.l_extendedprice):Float64;N, l_orderkey:Int64;N,
__always_true:Boolean;N]
- Left Join: Filter: __scalar_sq_2.l_orderkey =
orders.o_orderkey [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8,
o_totalprice:Float64;N, sum(lineitem.l_extendedprice):Float64;N,
l_orderkey:Int64;N, __always_true:Boolean;N]
- TableScan: orders [o_orderkey:Int64, o_custkey:Int64,
o_orderstatus:Utf8, o_totalprice:Float64;N]
- SubqueryAlias: __scalar_sq_2
[sum(lineitem.l_extendedprice):Float64;N, l_orderkey:Int64,
__always_true:Boolean]
- Projection: sum(lineitem.l_extendedprice),
lineitem.l_orderkey, __always_true [sum(lineitem.l_extendedprice):Float64;N,
l_orderkey:Int64, __always_true:Boolean]
- Aggregate: groupBy=[[lineitem.l_orderkey,
Boolean(true) AS __always_true]], aggr=[[sum(lineitem.l_extendedprice)]]
[l_orderkey:Int64, __always_true:Boolean,
sum(lineitem.l_extendedprice):Float64;N]
- TableScan: lineitem [l_orderkey:Int64,
l_partkey:Int64, l_suppkey:Int64, l_linenumber:Int32, l_quantity:Float64,
l_extendedprice:Float64]
+ Projection: customer.c_custkey, customer.c_name [c_custkey:Int64,
c_name:Utf8]
+ Filter: customer.c_acctbal <
__scalar_sq_1.sum(orders.o_totalprice) [c_custkey:Int64, c_name:Utf8,
sum(orders.o_totalprice):Float64;N, o_custkey:Int64;N, __always_true:Boolean;N]
+ Left Join: Filter: __scalar_sq_1.o_custkey = customer.c_custkey
[c_custkey:Int64, c_name:Utf8, sum(orders.o_totalprice):Float64;N,
o_custkey:Int64;N, __always_true:Boolean;N]
+ TableScan: customer [c_custkey:Int64, c_name:Utf8]
+ SubqueryAlias: __scalar_sq_1
[sum(orders.o_totalprice):Float64;N, o_custkey:Int64, __always_true:Boolean]
+ Projection: sum(orders.o_totalprice), orders.o_custkey,
__always_true [sum(orders.o_totalprice):Float64;N, o_custkey:Int64,
__always_true:Boolean]
+ Aggregate: groupBy=[[orders.o_custkey, Boolean(true) AS
__always_true]], aggr=[[sum(orders.o_totalprice)]] [o_custkey:Int64,
__always_true:Boolean, sum(orders.o_totalprice):Float64;N]
+ Projection: orders.o_orderkey, orders.o_custkey,
orders.o_orderstatus, orders.o_totalprice [o_orderkey:Int64, o_custkey:Int64,
o_orderstatus:Utf8, o_totalprice:Float64;N]
+ Filter: orders.o_totalprice <
__scalar_sq_2.sum(lineitem.l_extendedprice) [o_orderkey:Int64, o_custkey:Int64,
o_orderstatus:Utf8, o_totalprice:Float64;N,
sum(lineitem.l_extendedprice):Float64;N, l_orderkey:Int64;N,
__always_true:Boolean;N]
+ Left Join: Filter: __scalar_sq_2.l_orderkey =
orders.o_orderkey [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8,
o_totalprice:Float64;N, sum(lineitem.l_extendedprice):Float64;N,
l_orderkey:Int64;N, __always_true:Boolean;N]
+ TableScan: orders [o_orderkey:Int64,
o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]
+ SubqueryAlias: __scalar_sq_2
[sum(lineitem.l_extendedprice):Float64;N, l_orderkey:Int64,
__always_true:Boolean]
+ Projection: sum(lineitem.l_extendedprice),
lineitem.l_orderkey, __always_true [sum(lineitem.l_extendedprice):Float64;N,
l_orderkey:Int64, __always_true:Boolean]
+ Aggregate: groupBy=[[lineitem.l_orderkey,
Boolean(true) AS __always_true]], aggr=[[sum(lineitem.l_extendedprice)]]
[l_orderkey:Int64, __always_true:Boolean,
sum(lineitem.l_extendedprice):Float64;N]
+ TableScan: lineitem [l_orderkey:Int64,
l_partkey:Int64, l_suppkey:Int64, l_linenumber:Int32, l_quantity:Float64,
l_extendedprice:Float64]
"
)
}
@@ -547,14 +557,15 @@ mod tests {
plan,
@r"
Projection: customer.c_custkey [c_custkey:Int64]
- Filter: customer.c_custkey = __scalar_sq_1.max(orders.o_custkey)
[c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N,
o_custkey:Int64;N, __always_true:Boolean;N]
- Left Join: Filter: customer.c_custkey = __scalar_sq_1.o_custkey
[c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N,
o_custkey:Int64;N, __always_true:Boolean;N]
- TableScan: customer [c_custkey:Int64, c_name:Utf8]
- SubqueryAlias: __scalar_sq_1 [max(orders.o_custkey):Int64;N,
o_custkey:Int64, __always_true:Boolean]
- Projection: max(orders.o_custkey), orders.o_custkey,
__always_true [max(orders.o_custkey):Int64;N, o_custkey:Int64,
__always_true:Boolean]
- Aggregate: groupBy=[[orders.o_custkey, Boolean(true) AS
__always_true]], aggr=[[max(orders.o_custkey)]] [o_custkey:Int64,
__always_true:Boolean, max(orders.o_custkey):Int64;N]
- Filter: orders.o_orderkey = Int32(1) [o_orderkey:Int64,
o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]
- TableScan: orders [o_orderkey:Int64, o_custkey:Int64,
o_orderstatus:Utf8, o_totalprice:Float64;N]
+ Projection: customer.c_custkey, customer.c_name [c_custkey:Int64,
c_name:Utf8]
+ Filter: customer.c_custkey = __scalar_sq_1.max(orders.o_custkey)
[c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N,
o_custkey:Int64;N, __always_true:Boolean;N]
+ Left Join: Filter: customer.c_custkey = __scalar_sq_1.o_custkey
[c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N,
o_custkey:Int64;N, __always_true:Boolean;N]
+ TableScan: customer [c_custkey:Int64, c_name:Utf8]
+ SubqueryAlias: __scalar_sq_1 [max(orders.o_custkey):Int64;N,
o_custkey:Int64, __always_true:Boolean]
+ Projection: max(orders.o_custkey), orders.o_custkey,
__always_true [max(orders.o_custkey):Int64;N, o_custkey:Int64,
__always_true:Boolean]
+ Aggregate: groupBy=[[orders.o_custkey, Boolean(true) AS
__always_true]], aggr=[[max(orders.o_custkey)]] [o_custkey:Int64,
__always_true:Boolean, max(orders.o_custkey):Int64;N]
+ Filter: orders.o_orderkey = Int32(1) [o_orderkey:Int64,
o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]
+ TableScan: orders [o_orderkey:Int64, o_custkey:Int64,
o_orderstatus:Utf8, o_totalprice:Float64;N]
"
)
}
@@ -583,13 +594,14 @@ mod tests {
plan,
@r"
Projection: customer.c_custkey [c_custkey:Int64]
- Filter: customer.c_custkey = __scalar_sq_1.max(orders.o_custkey)
[c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N]
- Left Join: Filter: Boolean(true) [c_custkey:Int64, c_name:Utf8,
max(orders.o_custkey):Int64;N]
- TableScan: customer [c_custkey:Int64, c_name:Utf8]
- SubqueryAlias: __scalar_sq_1 [max(orders.o_custkey):Int64;N]
- Projection: max(orders.o_custkey)
[max(orders.o_custkey):Int64;N]
- Aggregate: groupBy=[[]], aggr=[[max(orders.o_custkey)]]
[max(orders.o_custkey):Int64;N]
- TableScan: orders [o_orderkey:Int64, o_custkey:Int64,
o_orderstatus:Utf8, o_totalprice:Float64;N]
+ Projection: customer.c_custkey, customer.c_name [c_custkey:Int64,
c_name:Utf8]
+ Filter: customer.c_custkey = __scalar_sq_1.max(orders.o_custkey)
[c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N]
+ Left Join: Filter: Boolean(true) [c_custkey:Int64, c_name:Utf8,
max(orders.o_custkey):Int64;N]
+ TableScan: customer [c_custkey:Int64, c_name:Utf8]
+ SubqueryAlias: __scalar_sq_1 [max(orders.o_custkey):Int64;N]
+ Projection: max(orders.o_custkey)
[max(orders.o_custkey):Int64;N]
+ Aggregate: groupBy=[[]], aggr=[[max(orders.o_custkey)]]
[max(orders.o_custkey):Int64;N]
+ TableScan: orders [o_orderkey:Int64, o_custkey:Int64,
o_orderstatus:Utf8, o_totalprice:Float64;N]
"
)
}
@@ -614,14 +626,15 @@ mod tests {
plan,
@r"
Projection: customer.c_custkey [c_custkey:Int64]
- Filter: customer.c_custkey = __scalar_sq_1.max(orders.o_custkey)
[c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N]
- Left Join: Filter: Boolean(true) [c_custkey:Int64, c_name:Utf8,
max(orders.o_custkey):Int64;N]
- TableScan: customer [c_custkey:Int64, c_name:Utf8]
- SubqueryAlias: __scalar_sq_1 [max(orders.o_custkey):Int64;N]
- Projection: max(orders.o_custkey)
[max(orders.o_custkey):Int64;N]
- Aggregate: groupBy=[[]], aggr=[[max(orders.o_custkey)]]
[max(orders.o_custkey):Int64;N]
- Filter: orders.o_custkey = orders.o_custkey
[o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]
- TableScan: orders [o_orderkey:Int64, o_custkey:Int64,
o_orderstatus:Utf8, o_totalprice:Float64;N]
+ Projection: customer.c_custkey, customer.c_name [c_custkey:Int64,
c_name:Utf8]
+ Filter: customer.c_custkey = __scalar_sq_1.max(orders.o_custkey)
[c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N]
+ Left Join: Filter: Boolean(true) [c_custkey:Int64, c_name:Utf8,
max(orders.o_custkey):Int64;N]
+ TableScan: customer [c_custkey:Int64, c_name:Utf8]
+ SubqueryAlias: __scalar_sq_1 [max(orders.o_custkey):Int64;N]
+ Projection: max(orders.o_custkey)
[max(orders.o_custkey):Int64;N]
+ Aggregate: groupBy=[[]], aggr=[[max(orders.o_custkey)]]
[max(orders.o_custkey):Int64;N]
+ Filter: orders.o_custkey = orders.o_custkey
[o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]
+ TableScan: orders [o_orderkey:Int64, o_custkey:Int64,
o_orderstatus:Utf8, o_totalprice:Float64;N]
"
)
}
@@ -774,13 +787,14 @@ mod tests {
plan,
@r"
Projection: customer.c_custkey [c_custkey:Int64]
- Filter: customer.c_custkey = __scalar_sq_1.max(orders.o_custkey) +
Int32(1) [c_custkey:Int64, c_name:Utf8, max(orders.o_custkey) +
Int32(1):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N]
- Left Join: Filter: customer.c_custkey = __scalar_sq_1.o_custkey
[c_custkey:Int64, c_name:Utf8, max(orders.o_custkey) + Int32(1):Int64;N,
o_custkey:Int64;N, __always_true:Boolean;N]
- TableScan: customer [c_custkey:Int64, c_name:Utf8]
- SubqueryAlias: __scalar_sq_1 [max(orders.o_custkey) +
Int32(1):Int64;N, o_custkey:Int64, __always_true:Boolean]
- Projection: max(orders.o_custkey) + Int32(1),
orders.o_custkey, __always_true [max(orders.o_custkey) + Int32(1):Int64;N,
o_custkey:Int64, __always_true:Boolean]
- Aggregate: groupBy=[[orders.o_custkey, Boolean(true) AS
__always_true]], aggr=[[max(orders.o_custkey)]] [o_custkey:Int64,
__always_true:Boolean, max(orders.o_custkey):Int64;N]
- TableScan: orders [o_orderkey:Int64, o_custkey:Int64,
o_orderstatus:Utf8, o_totalprice:Float64;N]
+ Projection: customer.c_custkey, customer.c_name [c_custkey:Int64,
c_name:Utf8]
+ Filter: customer.c_custkey = __scalar_sq_1.max(orders.o_custkey) +
Int32(1) [c_custkey:Int64, c_name:Utf8, max(orders.o_custkey) +
Int32(1):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N]
+ Left Join: Filter: customer.c_custkey = __scalar_sq_1.o_custkey
[c_custkey:Int64, c_name:Utf8, max(orders.o_custkey) + Int32(1):Int64;N,
o_custkey:Int64;N, __always_true:Boolean;N]
+ TableScan: customer [c_custkey:Int64, c_name:Utf8]
+ SubqueryAlias: __scalar_sq_1 [max(orders.o_custkey) +
Int32(1):Int64;N, o_custkey:Int64, __always_true:Boolean]
+ Projection: max(orders.o_custkey) + Int32(1),
orders.o_custkey, __always_true [max(orders.o_custkey) + Int32(1):Int64;N,
o_custkey:Int64, __always_true:Boolean]
+ Aggregate: groupBy=[[orders.o_custkey, Boolean(true) AS
__always_true]], aggr=[[max(orders.o_custkey)]] [o_custkey:Int64,
__always_true:Boolean, max(orders.o_custkey):Int64;N]
+ TableScan: orders [o_orderkey:Int64, o_custkey:Int64,
o_orderstatus:Utf8, o_totalprice:Float64;N]
"
)
}
@@ -877,13 +891,14 @@ mod tests {
plan,
@r"
Projection: customer.c_custkey [c_custkey:Int64]
- Filter: customer.c_custkey >= __scalar_sq_1.max(orders.o_custkey)
AND customer.c_custkey = Int32(1) [c_custkey:Int64, c_name:Utf8,
max(orders.o_custkey):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N]
- Left Join: Filter: customer.c_custkey = __scalar_sq_1.o_custkey
[c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N,
o_custkey:Int64;N, __always_true:Boolean;N]
- TableScan: customer [c_custkey:Int64, c_name:Utf8]
- SubqueryAlias: __scalar_sq_1 [max(orders.o_custkey):Int64;N,
o_custkey:Int64, __always_true:Boolean]
- Projection: max(orders.o_custkey), orders.o_custkey,
__always_true [max(orders.o_custkey):Int64;N, o_custkey:Int64,
__always_true:Boolean]
- Aggregate: groupBy=[[orders.o_custkey, Boolean(true) AS
__always_true]], aggr=[[max(orders.o_custkey)]] [o_custkey:Int64,
__always_true:Boolean, max(orders.o_custkey):Int64;N]
- TableScan: orders [o_orderkey:Int64, o_custkey:Int64,
o_orderstatus:Utf8, o_totalprice:Float64;N]
+ Projection: customer.c_custkey, customer.c_name [c_custkey:Int64,
c_name:Utf8]
+ Filter: customer.c_custkey >= __scalar_sq_1.max(orders.o_custkey)
AND customer.c_custkey = Int32(1) [c_custkey:Int64, c_name:Utf8,
max(orders.o_custkey):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N]
+ Left Join: Filter: customer.c_custkey = __scalar_sq_1.o_custkey
[c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N,
o_custkey:Int64;N, __always_true:Boolean;N]
+ TableScan: customer [c_custkey:Int64, c_name:Utf8]
+ SubqueryAlias: __scalar_sq_1 [max(orders.o_custkey):Int64;N,
o_custkey:Int64, __always_true:Boolean]
+ Projection: max(orders.o_custkey), orders.o_custkey,
__always_true [max(orders.o_custkey):Int64;N, o_custkey:Int64,
__always_true:Boolean]
+ Aggregate: groupBy=[[orders.o_custkey, Boolean(true) AS
__always_true]], aggr=[[max(orders.o_custkey)]] [o_custkey:Int64,
__always_true:Boolean, max(orders.o_custkey):Int64;N]
+ TableScan: orders [o_orderkey:Int64, o_custkey:Int64,
o_orderstatus:Utf8, o_totalprice:Float64;N]
"
)
}
@@ -914,13 +929,14 @@ mod tests {
plan,
@r"
Projection: customer.c_custkey [c_custkey:Int64]
- Filter: customer.c_custkey = __scalar_sq_1.max(orders.o_custkey) AND
customer.c_custkey = Int32(1) [c_custkey:Int64, c_name:Utf8,
max(orders.o_custkey):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N]
- Left Join: Filter: customer.c_custkey = __scalar_sq_1.o_custkey
[c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N,
o_custkey:Int64;N, __always_true:Boolean;N]
- TableScan: customer [c_custkey:Int64, c_name:Utf8]
- SubqueryAlias: __scalar_sq_1 [max(orders.o_custkey):Int64;N,
o_custkey:Int64, __always_true:Boolean]
- Projection: max(orders.o_custkey), orders.o_custkey,
__always_true [max(orders.o_custkey):Int64;N, o_custkey:Int64,
__always_true:Boolean]
- Aggregate: groupBy=[[orders.o_custkey, Boolean(true) AS
__always_true]], aggr=[[max(orders.o_custkey)]] [o_custkey:Int64,
__always_true:Boolean, max(orders.o_custkey):Int64;N]
- TableScan: orders [o_orderkey:Int64, o_custkey:Int64,
o_orderstatus:Utf8, o_totalprice:Float64;N]
+ Projection: customer.c_custkey, customer.c_name [c_custkey:Int64,
c_name:Utf8]
+ Filter: customer.c_custkey = __scalar_sq_1.max(orders.o_custkey)
AND customer.c_custkey = Int32(1) [c_custkey:Int64, c_name:Utf8,
max(orders.o_custkey):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N]
+ Left Join: Filter: customer.c_custkey = __scalar_sq_1.o_custkey
[c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N,
o_custkey:Int64;N, __always_true:Boolean;N]
+ TableScan: customer [c_custkey:Int64, c_name:Utf8]
+ SubqueryAlias: __scalar_sq_1 [max(orders.o_custkey):Int64;N,
o_custkey:Int64, __always_true:Boolean]
+ Projection: max(orders.o_custkey), orders.o_custkey,
__always_true [max(orders.o_custkey):Int64;N, o_custkey:Int64,
__always_true:Boolean]
+ Aggregate: groupBy=[[orders.o_custkey, Boolean(true) AS
__always_true]], aggr=[[max(orders.o_custkey)]] [o_custkey:Int64,
__always_true:Boolean, max(orders.o_custkey):Int64;N]
+ TableScan: orders [o_orderkey:Int64, o_custkey:Int64,
o_orderstatus:Utf8, o_totalprice:Float64;N]
"
)
}
@@ -952,13 +968,14 @@ mod tests {
plan,
@r"
Projection: customer.c_custkey [c_custkey:Int64]
- Filter: customer.c_custkey = __scalar_sq_1.max(orders.o_custkey) OR
customer.c_custkey = Int32(1) [c_custkey:Int64, c_name:Utf8,
max(orders.o_custkey):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N]
- Left Join: Filter: customer.c_custkey = __scalar_sq_1.o_custkey
[c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N,
o_custkey:Int64;N, __always_true:Boolean;N]
- TableScan: customer [c_custkey:Int64, c_name:Utf8]
- SubqueryAlias: __scalar_sq_1 [max(orders.o_custkey):Int64;N,
o_custkey:Int64, __always_true:Boolean]
- Projection: max(orders.o_custkey), orders.o_custkey,
__always_true [max(orders.o_custkey):Int64;N, o_custkey:Int64,
__always_true:Boolean]
- Aggregate: groupBy=[[orders.o_custkey, Boolean(true) AS
__always_true]], aggr=[[max(orders.o_custkey)]] [o_custkey:Int64,
__always_true:Boolean, max(orders.o_custkey):Int64;N]
- TableScan: orders [o_orderkey:Int64, o_custkey:Int64,
o_orderstatus:Utf8, o_totalprice:Float64;N]
+ Projection: customer.c_custkey, customer.c_name [c_custkey:Int64,
c_name:Utf8]
+ Filter: customer.c_custkey = __scalar_sq_1.max(orders.o_custkey)
OR customer.c_custkey = Int32(1) [c_custkey:Int64, c_name:Utf8,
max(orders.o_custkey):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N]
+ Left Join: Filter: customer.c_custkey = __scalar_sq_1.o_custkey
[c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N,
o_custkey:Int64;N, __always_true:Boolean;N]
+ TableScan: customer [c_custkey:Int64, c_name:Utf8]
+ SubqueryAlias: __scalar_sq_1 [max(orders.o_custkey):Int64;N,
o_custkey:Int64, __always_true:Boolean]
+ Projection: max(orders.o_custkey), orders.o_custkey,
__always_true [max(orders.o_custkey):Int64;N, o_custkey:Int64,
__always_true:Boolean]
+ Aggregate: groupBy=[[orders.o_custkey, Boolean(true) AS
__always_true]], aggr=[[max(orders.o_custkey)]] [o_custkey:Int64,
__always_true:Boolean, max(orders.o_custkey):Int64;N]
+ TableScan: orders [o_orderkey:Int64, o_custkey:Int64,
o_orderstatus:Utf8, o_totalprice:Float64;N]
"
)
}
@@ -983,13 +1000,14 @@ mod tests {
plan,
@r"
Projection: test.c [c:UInt32]
- Filter: test.c < __scalar_sq_1.min(sq.c) [a:UInt32, b:UInt32,
c:UInt32, min(sq.c):UInt32;N, a:UInt32;N, __always_true:Boolean;N]
- Left Join: Filter: test.a = __scalar_sq_1.a [a:UInt32, b:UInt32,
c:UInt32, min(sq.c):UInt32;N, a:UInt32;N, __always_true:Boolean;N]
- TableScan: test [a:UInt32, b:UInt32, c:UInt32]
- SubqueryAlias: __scalar_sq_1 [min(sq.c):UInt32;N, a:UInt32,
__always_true:Boolean]
- Projection: min(sq.c), sq.a, __always_true
[min(sq.c):UInt32;N, a:UInt32, __always_true:Boolean]
- Aggregate: groupBy=[[sq.a, Boolean(true) AS __always_true]],
aggr=[[min(sq.c)]] [a:UInt32, __always_true:Boolean, min(sq.c):UInt32;N]
- TableScan: sq [a:UInt32, b:UInt32, c:UInt32]
+ Projection: test.a, test.b, test.c [a:UInt32, b:UInt32, c:UInt32]
+ Filter: test.c < __scalar_sq_1.min(sq.c) [a:UInt32, b:UInt32,
c:UInt32, min(sq.c):UInt32;N, a:UInt32;N, __always_true:Boolean;N]
+ Left Join: Filter: test.a = __scalar_sq_1.a [a:UInt32,
b:UInt32, c:UInt32, min(sq.c):UInt32;N, a:UInt32;N, __always_true:Boolean;N]
+ TableScan: test [a:UInt32, b:UInt32, c:UInt32]
+ SubqueryAlias: __scalar_sq_1 [min(sq.c):UInt32;N, a:UInt32,
__always_true:Boolean]
+ Projection: min(sq.c), sq.a, __always_true
[min(sq.c):UInt32;N, a:UInt32, __always_true:Boolean]
+ Aggregate: groupBy=[[sq.a, Boolean(true) AS
__always_true]], aggr=[[min(sq.c)]] [a:UInt32, __always_true:Boolean,
min(sq.c):UInt32;N]
+ TableScan: sq [a:UInt32, b:UInt32, c:UInt32]
"
)
}
@@ -1013,13 +1031,14 @@ mod tests {
plan,
@r"
Projection: customer.c_custkey [c_custkey:Int64]
- Filter: customer.c_custkey < __scalar_sq_1.max(orders.o_custkey)
[c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N]
- Left Join: Filter: Boolean(true) [c_custkey:Int64, c_name:Utf8,
max(orders.o_custkey):Int64;N]
- TableScan: customer [c_custkey:Int64, c_name:Utf8]
- SubqueryAlias: __scalar_sq_1 [max(orders.o_custkey):Int64;N]
- Projection: max(orders.o_custkey)
[max(orders.o_custkey):Int64;N]
- Aggregate: groupBy=[[]], aggr=[[max(orders.o_custkey)]]
[max(orders.o_custkey):Int64;N]
- TableScan: orders [o_orderkey:Int64, o_custkey:Int64,
o_orderstatus:Utf8, o_totalprice:Float64;N]
+ Projection: customer.c_custkey, customer.c_name [c_custkey:Int64,
c_name:Utf8]
+ Filter: customer.c_custkey < __scalar_sq_1.max(orders.o_custkey)
[c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N]
+ Left Join: Filter: Boolean(true) [c_custkey:Int64, c_name:Utf8,
max(orders.o_custkey):Int64;N]
+ TableScan: customer [c_custkey:Int64, c_name:Utf8]
+ SubqueryAlias: __scalar_sq_1 [max(orders.o_custkey):Int64;N]
+ Projection: max(orders.o_custkey)
[max(orders.o_custkey):Int64;N]
+ Aggregate: groupBy=[[]], aggr=[[max(orders.o_custkey)]]
[max(orders.o_custkey):Int64;N]
+ TableScan: orders [o_orderkey:Int64, o_custkey:Int64,
o_orderstatus:Utf8, o_totalprice:Float64;N]
"
)
}
@@ -1042,13 +1061,14 @@ mod tests {
plan,
@r"
Projection: customer.c_custkey [c_custkey:Int64]
- Filter: customer.c_custkey = __scalar_sq_1.max(orders.o_custkey)
[c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N]
- Left Join: Filter: Boolean(true) [c_custkey:Int64, c_name:Utf8,
max(orders.o_custkey):Int64;N]
- TableScan: customer [c_custkey:Int64, c_name:Utf8]
- SubqueryAlias: __scalar_sq_1 [max(orders.o_custkey):Int64;N]
- Projection: max(orders.o_custkey)
[max(orders.o_custkey):Int64;N]
- Aggregate: groupBy=[[]], aggr=[[max(orders.o_custkey)]]
[max(orders.o_custkey):Int64;N]
- TableScan: orders [o_orderkey:Int64, o_custkey:Int64,
o_orderstatus:Utf8, o_totalprice:Float64;N]
+ Projection: customer.c_custkey, customer.c_name [c_custkey:Int64,
c_name:Utf8]
+ Filter: customer.c_custkey = __scalar_sq_1.max(orders.o_custkey)
[c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N]
+ Left Join: Filter: Boolean(true) [c_custkey:Int64, c_name:Utf8,
max(orders.o_custkey):Int64;N]
+ TableScan: customer [c_custkey:Int64, c_name:Utf8]
+ SubqueryAlias: __scalar_sq_1 [max(orders.o_custkey):Int64;N]
+ Projection: max(orders.o_custkey)
[max(orders.o_custkey):Int64;N]
+ Aggregate: groupBy=[[]], aggr=[[max(orders.o_custkey)]]
[max(orders.o_custkey):Int64;N]
+ TableScan: orders [o_orderkey:Int64, o_custkey:Int64,
o_orderstatus:Utf8, o_totalprice:Float64;N]
"
)
}
@@ -1092,18 +1112,19 @@ mod tests {
plan,
@r"
Projection: customer.c_custkey [c_custkey:Int64]
- Filter: customer.c_custkey BETWEEN
__scalar_sq_1.min(orders.o_custkey) AND __scalar_sq_2.max(orders.o_custkey)
[c_custkey:Int64, c_name:Utf8, min(orders.o_custkey):Int64;N,
o_custkey:Int64;N, __always_true:Boolean;N, max(orders.o_custkey):Int64;N,
o_custkey:Int64;N, __always_true:Boolean;N]
- Left Join: Filter: customer.c_custkey = __scalar_sq_2.o_custkey
[c_custkey:Int64, c_name:Utf8, min(orders.o_custkey):Int64;N,
o_custkey:Int64;N, __always_true:Boolean;N, max(orders.o_custkey):Int64;N,
o_custkey:Int64;N, __always_true:Boolean;N]
- Left Join: Filter: customer.c_custkey = __scalar_sq_1.o_custkey
[c_custkey:Int64, c_name:Utf8, min(orders.o_custkey):Int64;N,
o_custkey:Int64;N, __always_true:Boolean;N]
- TableScan: customer [c_custkey:Int64, c_name:Utf8]
- SubqueryAlias: __scalar_sq_1 [min(orders.o_custkey):Int64;N,
o_custkey:Int64, __always_true:Boolean]
- Projection: min(orders.o_custkey), orders.o_custkey,
__always_true [min(orders.o_custkey):Int64;N, o_custkey:Int64,
__always_true:Boolean]
- Aggregate: groupBy=[[orders.o_custkey, Boolean(true) AS
__always_true]], aggr=[[min(orders.o_custkey)]] [o_custkey:Int64,
__always_true:Boolean, min(orders.o_custkey):Int64;N]
+ Projection: customer.c_custkey, customer.c_name [c_custkey:Int64,
c_name:Utf8]
+ Filter: customer.c_custkey BETWEEN
__scalar_sq_1.min(orders.o_custkey) AND __scalar_sq_2.max(orders.o_custkey)
[c_custkey:Int64, c_name:Utf8, min(orders.o_custkey):Int64;N,
o_custkey:Int64;N, __always_true:Boolean;N, max(orders.o_custkey):Int64;N,
o_custkey:Int64;N, __always_true:Boolean;N]
+ Left Join: Filter: customer.c_custkey = __scalar_sq_2.o_custkey
[c_custkey:Int64, c_name:Utf8, min(orders.o_custkey):Int64;N,
o_custkey:Int64;N, __always_true:Boolean;N, max(orders.o_custkey):Int64;N,
o_custkey:Int64;N, __always_true:Boolean;N]
+ Left Join: Filter: customer.c_custkey =
__scalar_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8,
min(orders.o_custkey):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N]
+ TableScan: customer [c_custkey:Int64, c_name:Utf8]
+ SubqueryAlias: __scalar_sq_1 [min(orders.o_custkey):Int64;N,
o_custkey:Int64, __always_true:Boolean]
+ Projection: min(orders.o_custkey), orders.o_custkey,
__always_true [min(orders.o_custkey):Int64;N, o_custkey:Int64,
__always_true:Boolean]
+ Aggregate: groupBy=[[orders.o_custkey, Boolean(true) AS
__always_true]], aggr=[[min(orders.o_custkey)]] [o_custkey:Int64,
__always_true:Boolean, min(orders.o_custkey):Int64;N]
+ TableScan: orders [o_orderkey:Int64, o_custkey:Int64,
o_orderstatus:Utf8, o_totalprice:Float64;N]
+ SubqueryAlias: __scalar_sq_2 [max(orders.o_custkey):Int64;N,
o_custkey:Int64, __always_true:Boolean]
+ Projection: max(orders.o_custkey), orders.o_custkey,
__always_true [max(orders.o_custkey):Int64;N, o_custkey:Int64,
__always_true:Boolean]
+ Aggregate: groupBy=[[orders.o_custkey, Boolean(true) AS
__always_true]], aggr=[[max(orders.o_custkey)]] [o_custkey:Int64,
__always_true:Boolean, max(orders.o_custkey):Int64;N]
TableScan: orders [o_orderkey:Int64, o_custkey:Int64,
o_orderstatus:Utf8, o_totalprice:Float64;N]
- SubqueryAlias: __scalar_sq_2 [max(orders.o_custkey):Int64;N,
o_custkey:Int64, __always_true:Boolean]
- Projection: max(orders.o_custkey), orders.o_custkey,
__always_true [max(orders.o_custkey):Int64;N, o_custkey:Int64,
__always_true:Boolean]
- Aggregate: groupBy=[[orders.o_custkey, Boolean(true) AS
__always_true]], aggr=[[max(orders.o_custkey)]] [o_custkey:Int64,
__always_true:Boolean, max(orders.o_custkey):Int64;N]
- TableScan: orders [o_orderkey:Int64, o_custkey:Int64,
o_orderstatus:Utf8, o_totalprice:Float64;N]
"
)
}
@@ -1139,18 +1160,19 @@ mod tests {
plan,
@r"
Projection: customer.c_custkey [c_custkey:Int64]
- Filter: customer.c_custkey BETWEEN
__scalar_sq_1.min(orders.o_custkey) AND __scalar_sq_2.max(orders.o_custkey)
[c_custkey:Int64, c_name:Utf8, min(orders.o_custkey):Int64;N,
max(orders.o_custkey):Int64;N]
- Left Join: Filter: Boolean(true) [c_custkey:Int64, c_name:Utf8,
min(orders.o_custkey):Int64;N, max(orders.o_custkey):Int64;N]
- Left Join: Filter: Boolean(true) [c_custkey:Int64, c_name:Utf8,
min(orders.o_custkey):Int64;N]
- TableScan: customer [c_custkey:Int64, c_name:Utf8]
- SubqueryAlias: __scalar_sq_1 [min(orders.o_custkey):Int64;N]
- Projection: min(orders.o_custkey)
[min(orders.o_custkey):Int64;N]
- Aggregate: groupBy=[[]], aggr=[[min(orders.o_custkey)]]
[min(orders.o_custkey):Int64;N]
+ Projection: customer.c_custkey, customer.c_name [c_custkey:Int64,
c_name:Utf8]
+ Filter: customer.c_custkey BETWEEN
__scalar_sq_1.min(orders.o_custkey) AND __scalar_sq_2.max(orders.o_custkey)
[c_custkey:Int64, c_name:Utf8, min(orders.o_custkey):Int64;N,
max(orders.o_custkey):Int64;N]
+ Left Join: Filter: Boolean(true) [c_custkey:Int64, c_name:Utf8,
min(orders.o_custkey):Int64;N, max(orders.o_custkey):Int64;N]
+ Left Join: Filter: Boolean(true) [c_custkey:Int64,
c_name:Utf8, min(orders.o_custkey):Int64;N]
+ TableScan: customer [c_custkey:Int64, c_name:Utf8]
+ SubqueryAlias: __scalar_sq_1 [min(orders.o_custkey):Int64;N]
+ Projection: min(orders.o_custkey)
[min(orders.o_custkey):Int64;N]
+ Aggregate: groupBy=[[]], aggr=[[min(orders.o_custkey)]]
[min(orders.o_custkey):Int64;N]
+ TableScan: orders [o_orderkey:Int64, o_custkey:Int64,
o_orderstatus:Utf8, o_totalprice:Float64;N]
+ SubqueryAlias: __scalar_sq_2 [max(orders.o_custkey):Int64;N]
+ Projection: max(orders.o_custkey)
[max(orders.o_custkey):Int64;N]
+ Aggregate: groupBy=[[]], aggr=[[max(orders.o_custkey)]]
[max(orders.o_custkey):Int64;N]
TableScan: orders [o_orderkey:Int64, o_custkey:Int64,
o_orderstatus:Utf8, o_totalprice:Float64;N]
- SubqueryAlias: __scalar_sq_2 [max(orders.o_custkey):Int64;N]
- Projection: max(orders.o_custkey)
[max(orders.o_custkey):Int64;N]
- Aggregate: groupBy=[[]], aggr=[[max(orders.o_custkey)]]
[max(orders.o_custkey):Int64;N]
- TableScan: orders [o_orderkey:Int64, o_custkey:Int64,
o_orderstatus:Utf8, o_totalprice:Float64;N]
"
)
}
diff --git a/datafusion/sqllogictest/test_files/window.slt
b/datafusion/sqllogictest/test_files/window.slt
index f07ca0de0f..f1a708d84d 100644
--- a/datafusion/sqllogictest/test_files/window.slt
+++ b/datafusion/sqllogictest/test_files/window.slt
@@ -2602,7 +2602,7 @@ SELECT
FROM aggregate_test_100;
statement ok
-set datafusion.optimizer.skip_failed_rules = true
+set datafusion.optimizer.skip_failed_rules = false
# Error is returned from the logical plan.
query error Cannot cast Utf8\("1 DAY"\) to Int8
@@ -6122,4 +6122,23 @@ FROM (
a21, a22, a23, a24, a25, a26, a27, a28, a29, a30, a31, a32, a33, a34,
a35, a36, a37, a38, a39, a40
);
----
-1
\ No newline at end of file
+1
+
+# window_with_subquery_rewritten_to_join
+# the optimizer `scalar_subquery_to_join` rewrites
+# `WHERE acctbal > ( SELECT AVG(acctbal) FROM suppliers)` into a Join,
+# breaking the input schema passed to the window function above.
+# See: https://github.com/apache/datafusion/issues/17770
+query I
+WITH suppliers AS (
+ SELECT *
+ FROM (VALUES (1, 10.0), (1, 20.0)) AS t(nation, acctbal)
+)
+SELECT
+ ROW_NUMBER() OVER (PARTITION BY nation ORDER BY acctbal DESC) AS rn
+FROM suppliers AS s
+WHERE acctbal > (
+ SELECT AVG(acctbal) FROM suppliers
+);
+----
+1
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]