This is an automated email from the ASF dual-hosted git repository.
yjhjstz pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/cloudberry.git
The following commit(s) were added to refs/heads/main by this push:
new f020573d35 Fix wrong join_rel size estimates for anti join. (#934)
f020573d35 is described below
commit f020573d3598b440526f2fdbeac46b974ba8bd17
Author: Tender Wang <[email protected]>
AuthorDate: Wed Mar 19 03:18:08 2025 +0800
Fix wrong join_rel size estimates for anti join. (#934)
* Fix wrong join_rel size estimates for anti join.
In Postgres planner, we set join_rel selectivity in
calc_joinrel_size_estimate()
for anti join type. Maybe it's codes historical reasons, we have another
place
to set anti join selectivity in clauselist_selectivity_ext().
This can lead to wrong join_rel size estimates for anti join type. We can
remove
the selectivity computation in clauselist_selectivity_ext(). So we can make
codes
be consistent with upstream.
* Fix notin plan diff.
* Fix join_gp_optimizer.out plan diff.
---
src/backend/optimizer/path/clausesel.c | 9 +-------
src/test/regress/expected/join_gp.out | 28 +++++++++++++++++++++++++
src/test/regress/expected/join_gp_optimizer.out | 25 ++++++++++++++++++++++
src/test/regress/expected/notin.out | 26 +++++++++++------------
src/test/regress/sql/join_gp.sql | 12 +++++++++++
5 files changed, 79 insertions(+), 21 deletions(-)
diff --git a/src/backend/optimizer/path/clausesel.c
b/src/backend/optimizer/path/clausesel.c
index 34b37d6b51..f4a6aad7ed 100644
--- a/src/backend/optimizer/path/clausesel.c
+++ b/src/backend/optimizer/path/clausesel.c
@@ -406,14 +406,7 @@ clauselist_selectivity_ext(PlannerInfo *root,
}
pfree(rgsel);
- /*
- * For Anti Semi Join, selectivity is determined by the fraction of
- * tuples that do no match
- */
- if (JOIN_ANTI == jointype || JOIN_LASJ_NOTIN == jointype)
- {
- s1 = (1 - s1);
- }
+
return s1;
}
diff --git a/src/test/regress/expected/join_gp.out
b/src/test/regress/expected/join_gp.out
index 1dbbd4eaa4..6e2a3af395 100644
--- a/src/test/regress/expected/join_gp.out
+++ b/src/test/regress/expected/join_gp.out
@@ -47,6 +47,34 @@ select * from l l1 join l l2 on l1.a = l2.a left join l l3
on l1.a = l3.a and l1
2 | 2 | 2
(5 rows)
+--
+-- test anti_join/left_anti_semi_join selectivities
+--
+create table aj_t1(a int, b int, c int) distributed by (a);
+create table aj_t2(a int, b int, c int) distributed by (a);
+insert into aj_t1 values(1,1,1);
+insert into aj_t2 values(1,1,1),(2,2,2);
+explain(costs off) select t1.a from aj_t1 t1 where not exists (select 1 from
aj_t2 t2 where t1.b = t2.b and t1.c = t2.c);
+ QUERY PLAN
+------------------------------------------------------------------
+ Gather Motion 3:1 (slice1; segments: 3)
+ -> Hash Anti Join
+ Hash Cond: ((t1.b = t2.b) AND (t1.c = t2.c))
+ -> Redistribute Motion 3:3 (slice2; segments: 3)
+ Hash Key: t1.b, t1.c
+ -> Seq Scan on aj_t1 t1
+ -> Hash
+ -> Redistribute Motion 3:3 (slice3; segments: 3)
+ Hash Key: t2.b, t2.c
+ -> Seq Scan on aj_t2 t2
+ Optimizer: Postgres query optimizer
+(11 rows)
+
+select t1.a from aj_t1 t1 where not exists (select 1 from aj_t2 t2 where t1.b
= t2.b and t1.c = t2.c);
+ a
+---
+(0 rows)
+
--
-- test hash join
--
diff --git a/src/test/regress/expected/join_gp_optimizer.out
b/src/test/regress/expected/join_gp_optimizer.out
index c37b860b7b..4138130b0a 100644
--- a/src/test/regress/expected/join_gp_optimizer.out
+++ b/src/test/regress/expected/join_gp_optimizer.out
@@ -47,6 +47,31 @@ select * from l l1 join l l2 on l1.a = l2.a left join l l3
on l1.a = l3.a and l1
2 | 2 | 2
(5 rows)
+--
+-- test anti_join/left_anti_semi_join selectivities
+--
+create table aj_t1(a int, b int, c int) distributed by (a);
+create table aj_t2(a int, b int, c int) distributed by (a);
+insert into aj_t1 values(1,1,1);
+insert into aj_t2 values(1,1,1),(2,2,2);
+explain(costs off) select t1.a from aj_t1 t1 where not exists (select 1 from
aj_t2 t2 where t1.b = t2.b and t1.c = t2.c);
+ QUERY PLAN
+---------------------------------------------------------------
+ Gather Motion 3:1 (slice1; segments: 3)
+ -> Hash Anti Join
+ Hash Cond: ((t1.b = t2.b) AND (t1.c = t2.c))
+ -> Seq Scan on aj_t1 t1
+ -> Hash
+ -> Broadcast Motion 3:3 (slice2; segments: 3)
+ -> Seq Scan on aj_t2 t2
+ Optimizer: GPORCA
+(8 rows)
+
+select t1.a from aj_t1 t1 where not exists (select 1 from aj_t2 t2 where t1.b
= t2.b and t1.c = t2.c);
+ a
+---
+(0 rows)
+
--
-- test hash join
--
diff --git a/src/test/regress/expected/notin.out
b/src/test/regress/expected/notin.out
index f0b404e735..5b1688fe00 100644
--- a/src/test/regress/expected/notin.out
+++ b/src/test/regress/expected/notin.out
@@ -317,19 +317,19 @@ select c1 from t1 where c1 > 6 and c1 not in
--
explain select c1 from t1,t2 where c1 not in
(select c3 from t3) and c1 = c2;
- QUERY PLAN
--------------------------------------------------------------------------------------------------------
- Gather Motion 3:1 (slice1; segments: 3) (cost=5.38..8.62 rows=4 width=4)
- -> Hash Join (cost=5.38..8.62 rows=2 width=4)
- Hash Cond: (t1.c1 = t2.c2)
- -> Hash Left Anti Semi (Not-In) Join (cost=2.26..5.46 rows=2
width=4)
- Hash Cond: (t1.c1 = t3.c3)
- -> Seq Scan on t1 (cost=0.00..3.10 rows=4 width=4)
- -> Hash (cost=2.15..2.15 rows=3 width=4)
- -> Broadcast Motion 3:3 (slice2; segments: 3)
(cost=0.00..2.15 rows=3 width=4)
- -> Seq Scan on t3 (cost=0.00..2.03 rows=1 width=4)
- -> Hash (cost=3.05..3.05 rows=2 width=4)
- -> Seq Scan on t2 (cost=0.00..3.05 rows=2 width=4)
+ QUERY PLAN
+-------------------------------------------------------------------------------------------------
+ Gather Motion 3:1 (slice1; segments: 3) (cost=2.12..3.25 rows=4 width=4)
+ -> Hash Left Anti Semi (Not-In) Join (cost=2.12..3.20 rows=1 width=4)
+ Hash Cond: (t1.c1 = t3.c3)
+ -> Hash Join (cost=1.04..2.10 rows=2 width=4)
+ Hash Cond: (t1.c1 = t2.c2)
+ -> Seq Scan on t1 (cost=0.00..1.03 rows=3 width=4)
+ -> Hash (cost=1.02..1.02 rows=2 width=4)
+ -> Seq Scan on t2 (cost=0.00..1.02 rows=2 width=4)
+ -> Hash (cost=1.05..1.05 rows=3 width=4)
+ -> Broadcast Motion 3:3 (slice2; segments: 3)
(cost=0.00..1.05 rows=3 width=4)
+ -> Seq Scan on t3 (cost=0.00..1.01 rows=1 width=4)
Optimizer: Postgres query optimizer
(12 rows)
diff --git a/src/test/regress/sql/join_gp.sql b/src/test/regress/sql/join_gp.sql
index 3061391c99..fae1fdc7c5 100644
--- a/src/test/regress/sql/join_gp.sql
+++ b/src/test/regress/sql/join_gp.sql
@@ -24,6 +24,18 @@ create temp table l(a int);
insert into l values (1), (1), (2);
select * from l l1 join l l2 on l1.a = l2.a left join l l3 on l1.a = l3.a and
l1.a = 2 order by 1,2,3;
+--
+-- test anti_join/left_anti_semi_join selectivities
+--
+create table aj_t1(a int, b int, c int) distributed by (a);
+create table aj_t2(a int, b int, c int) distributed by (a);
+insert into aj_t1 values(1,1,1);
+insert into aj_t2 values(1,1,1),(2,2,2);
+
+explain(costs off) select t1.a from aj_t1 t1 where not exists (select 1 from
aj_t2 t2 where t1.b = t2.b and t1.c = t2.c);
+
+select t1.a from aj_t1 t1 where not exists (select 1 from aj_t2 t2 where t1.b
= t2.b and t1.c = t2.c);
+
--
-- test hash join
--
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]