This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new a8847e1a82 fix: Support `NOT <field> IN (<subquery>)` via anti join
(#10936)
a8847e1a82 is described below
commit a8847e1a825f8e588362430d99363672d8c3e7db
Author: Andrey Koshchiy <[email protected]>
AuthorDate: Mon Jun 17 23:28:12 2024 +0300
fix: Support `NOT <field> IN (<subquery>)` via anti join (#10936)
* fix: rewriting NOT IN (<subquery>) to anti join
* add wrapped_not_not_in_subquery test
---
.../src/decorrelate_predicate_subquery.rs | 86 ++++++++++++++++++++--
datafusion/sqllogictest/test_files/subquery.slt | 13 ++++
2 files changed, 94 insertions(+), 5 deletions(-)
diff --git a/datafusion/optimizer/src/decorrelate_predicate_subquery.rs
b/datafusion/optimizer/src/decorrelate_predicate_subquery.rs
index 88ce300e5c..e5e97b693c 100644
--- a/datafusion/optimizer/src/decorrelate_predicate_subquery.rs
+++ b/datafusion/optimizer/src/decorrelate_predicate_subquery.rs
@@ -33,7 +33,7 @@ use
datafusion_expr::expr_rewriter::create_col_from_scalar_expr;
use datafusion_expr::logical_plan::{JoinType, Subquery};
use datafusion_expr::utils::{conjunction, split_conjunction,
split_conjunction_owned};
use datafusion_expr::{
- exists, in_subquery, not_exists, not_in_subquery, BinaryExpr, Expr, Filter,
+ exists, in_subquery, not, not_exists, not_in_subquery, BinaryExpr, Expr,
Filter,
LogicalPlan, LogicalPlanBuilder, Operator,
};
@@ -79,6 +79,25 @@ impl DecorrelatePredicateSubquery {
let mut others = vec![];
for it in filters.into_iter() {
match it {
+ Expr::Not(not_expr) => match *not_expr {
+ Expr::InSubquery(InSubquery {
+ expr,
+ subquery,
+ negated,
+ }) => {
+ let new_subquery = self.rewrite_subquery(subquery,
config)?;
+ subqueries.push(SubqueryInfo::new_with_in_expr(
+ new_subquery,
+ *expr,
+ !negated,
+ ));
+ }
+ Expr::Exists(Exists { subquery, negated }) => {
+ let new_subquery = self.rewrite_subquery(subquery,
config)?;
+ subqueries.push(SubqueryInfo::new(new_subquery,
!negated));
+ }
+ expr => others.push(not(expr)),
+ },
Expr::InSubquery(InSubquery {
expr,
subquery,
@@ -126,9 +145,17 @@ impl OptimizerRule for DecorrelatePredicateSubquery {
};
// if there are no subqueries in the predicate, return the original
plan
- let has_subqueries = split_conjunction(&filter.predicate)
- .iter()
- .any(|expr| matches!(expr, Expr::InSubquery(_) | Expr::Exists(_)));
+ let has_subqueries =
+ split_conjunction(&filter.predicate)
+ .iter()
+ .any(|expr| match expr {
+ Expr::Not(not_expr) => {
+ matches!(not_expr.as_ref(), Expr::InSubquery(_) |
Expr::Exists(_))
+ }
+ Expr::InSubquery(_) | Expr::Exists(_) => true,
+ _ => false,
+ });
+
if !has_subqueries {
return Ok(Transformed::no(LogicalPlan::Filter(filter)));
}
@@ -351,7 +378,7 @@ mod tests {
use crate::test::*;
use arrow::datatypes::DataType;
- use datafusion_expr::{and, binary_expr, col, lit, or, out_ref_col};
+ use datafusion_expr::{and, binary_expr, col, lit, not, or, out_ref_col};
fn assert_optimized_plan_equal(plan: LogicalPlan, expected: &str) ->
Result<()> {
assert_optimized_plan_eq_display_indent(
@@ -1099,6 +1126,55 @@ mod tests {
Ok(())
}
+ #[test]
+ fn wrapped_not_in_subquery() -> Result<()> {
+ let table_scan = test_table_scan()?;
+ let plan = LogicalPlanBuilder::from(table_scan)
+ .filter(not(in_subquery(col("c"),
test_subquery_with_name("sq")?)))?
+ .project(vec![col("test.b")])?
+ .build()?;
+
+ let expected = "Projection: test.b [b:UInt32]\
+ \n LeftAnti Join: Filter: test.c = __correlated_sq_1.c [a:UInt32,
b:UInt32, c:UInt32]\
+ \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\
+ \n SubqueryAlias: __correlated_sq_1 [c:UInt32]\
+ \n Projection: sq.c [c:UInt32]\
+ \n TableScan: sq [a:UInt32, b:UInt32, c:UInt32]";
+
+ assert_optimized_plan_eq_display_indent(
+ Arc::new(DecorrelatePredicateSubquery::new()),
+ plan,
+ expected,
+ );
+ Ok(())
+ }
+
+ #[test]
+ fn wrapped_not_not_in_subquery() -> Result<()> {
+ let table_scan = test_table_scan()?;
+ let plan = LogicalPlanBuilder::from(table_scan)
+ .filter(not(not_in_subquery(
+ col("c"),
+ test_subquery_with_name("sq")?,
+ )))?
+ .project(vec![col("test.b")])?
+ .build()?;
+
+ let expected = "Projection: test.b [b:UInt32]\
+ \n LeftSemi Join: Filter: test.c = __correlated_sq_1.c [a:UInt32,
b:UInt32, c:UInt32]\
+ \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\
+ \n SubqueryAlias: __correlated_sq_1 [c:UInt32]\
+ \n Projection: sq.c [c:UInt32]\
+ \n TableScan: sq [a:UInt32, b:UInt32, c:UInt32]";
+
+ assert_optimized_plan_eq_display_indent(
+ Arc::new(DecorrelatePredicateSubquery::new()),
+ plan,
+ expected,
+ );
+ Ok(())
+ }
+
#[test]
fn in_subquery_both_side_expr() -> Result<()> {
let table_scan = test_table_scan()?;
diff --git a/datafusion/sqllogictest/test_files/subquery.slt
b/datafusion/sqllogictest/test_files/subquery.slt
index 73fc30c9ae..eb0904b230 100644
--- a/datafusion/sqllogictest/test_files/subquery.slt
+++ b/datafusion/sqllogictest/test_files/subquery.slt
@@ -127,6 +127,19 @@ where t1.t1_id + 12 not in (
----
22 b 2
+# wrapped_not_in_subquery_to_join_with_correlated_outer_filter
+query ITI rowsort
+select t1.t1_id,
+ t1.t1_name,
+ t1.t1_int
+from t1
+where not t1.t1_id + 12 in (
+ select t2.t2_id + 1 from t2 where t1.t1_int > 0
+ )
+----
+22 b 2
+
+
# in subquery with two parentheses, see #5529
query ITI rowsort
select t1.t1_id,
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]