This is an automated email from the ASF dual-hosted git repository.
englefly pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 0e9fad4fe9 [stats](nereids) improve Anti join stats estimation #22444
0e9fad4fe9 is described below
commit 0e9fad4fe9e95e07b2337e8e5baa24528756e3a9
Author: minghong <[email protected]>
AuthorDate: Fri Aug 4 12:48:39 2023 +0800
[stats](nereids) improve Anti join stats estimation #22444
No impact on TPC-H.
TPC-DS queries 16, 69, and 94 are improved.
---
.../apache/doris/nereids/stats/JoinEstimation.java | 7 ++--
.../nereids_tpcds_shape_sf100_p0/shape/query16.out | 38 +++++++++++-----------
.../nereids_tpcds_shape_sf100_p0/shape/query69.out | 5 +--
.../nereids_tpcds_shape_sf100_p0/shape/query94.out | 7 ++--
4 files changed, 31 insertions(+), 26 deletions(-)
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java
index 36dc90b343..9c42acf5fd 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java
@@ -44,6 +44,7 @@ import java.util.stream.Collectors;
* TODO: Update other props in the ColumnStats properly.
*/
public class JoinEstimation {
+ private static double DEFAULT_ANTI_JOIN_SELECTIVITY_COEFFICIENT = 0.3;
private static EqualTo normalizeHashJoinCondition(EqualTo equalTo,
Statistics leftStats, Statistics rightStats) {
boolean changeOrder = equalTo.left().getInputSlots().stream().anyMatch(
@@ -221,7 +222,8 @@ public class JoinEstimation {
if (join.getJoinType().isSemiJoin()) {
rowCount = semiRowCount;
} else {
- rowCount = leftStats.getRowCount() - semiRowCount;
+ rowCount = Math.max(leftStats.getRowCount() - semiRowCount,
+ leftStats.getRowCount() * DEFAULT_ANTI_JOIN_SELECTIVITY_COEFFICIENT);
}
} else {
//right semi or anti
@@ -230,7 +232,8 @@ public class JoinEstimation {
if (join.getJoinType().isSemiJoin()) {
rowCount = semiRowCount;
} else {
- rowCount = rightStats.getRowCount() - semiRowCount;
+ rowCount = Math.max(rightStats.getRowCount() - semiRowCount,
+ rightStats.getRowCount() * DEFAULT_ANTI_JOIN_SELECTIVITY_COEFFICIENT);
}
}
return Math.max(1, rowCount);
diff --git a/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query16.out b/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query16.out
index 967e3b6063..4b580416f2 100644
--- a/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query16.out
+++ b/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query16.out
@@ -8,32 +8,32 @@ PhysicalResultSink
----------PhysicalDistribute
------------hashAgg[LOCAL]
--------------PhysicalProject
-----------------hashJoin[INNER_JOIN](cs1.cs_call_center_sk =
call_center.cc_call_center_sk)
-------------------PhysicalProject
---------------------filter(cc_county IN ('Ziebach County', 'Luce County',
'Richland County', 'Daviess County', 'Barrow County'))
-----------------------PhysicalOlapScan[call_center]
+----------------hashJoin[INNER_JOIN](cs1.cs_ship_date_sk = date_dim.d_date_sk)
------------------PhysicalDistribute
--------------------PhysicalProject
----------------------hashJoin[RIGHT_SEMI_JOIN](cs1.cs_order_number =
cs2.cs_order_number)( not (cs_warehouse_sk = cs_warehouse_sk))
------------------------PhysicalDistribute
--------------------------PhysicalProject
----------------------------PhysicalOlapScan[catalog_sales]
-------------------------PhysicalDistribute
---------------------------hashJoin[INNER_JOIN](cs1.cs_ship_date_sk =
date_dim.d_date_sk)
-----------------------------PhysicalProject
-------------------------------filter((cast(d_date as DATETIMEV2(0)) <=
cast(days_add(cast('2002-4-01' as DATEV2), INTERVAL 60 DAY) as
DATETIMEV2(0)))(date_dim.d_date >= 2002-04-01))
---------------------------------PhysicalOlapScan[date_dim]
+------------------------hashJoin[INNER_JOIN](cs1.cs_call_center_sk =
call_center.cc_call_center_sk)
+--------------------------hashJoin[RIGHT_ANTI_JOIN](cs1.cs_order_number =
cr1.cr_order_number)
----------------------------PhysicalDistribute
-------------------------------hashJoin[RIGHT_ANTI_JOIN](cs1.cs_order_number =
cr1.cr_order_number)
+------------------------------PhysicalProject
+--------------------------------PhysicalOlapScan[catalog_returns]
+----------------------------PhysicalDistribute
+------------------------------hashJoin[INNER_JOIN](cs1.cs_ship_addr_sk =
customer_address.ca_address_sk)
+--------------------------------PhysicalProject
+----------------------------------PhysicalOlapScan[catalog_sales]
--------------------------------PhysicalDistribute
----------------------------------PhysicalProject
-------------------------------------PhysicalOlapScan[catalog_returns]
---------------------------------PhysicalDistribute
-----------------------------------hashJoin[INNER_JOIN](cs1.cs_ship_addr_sk =
customer_address.ca_address_sk)
-------------------------------------PhysicalProject
---------------------------------------PhysicalOlapScan[catalog_sales]
-------------------------------------PhysicalDistribute
---------------------------------------PhysicalProject
-----------------------------------------filter((cast(ca_state as VARCHAR(*)) =
'WV'))
-------------------------------------------PhysicalOlapScan[customer_address]
+------------------------------------filter((cast(ca_state as VARCHAR(*)) =
'WV'))
+--------------------------------------PhysicalOlapScan[customer_address]
+--------------------------PhysicalDistribute
+----------------------------PhysicalProject
+------------------------------filter(cc_county IN ('Ziebach County', 'Luce
County', 'Richland County', 'Daviess County', 'Barrow County'))
+--------------------------------PhysicalOlapScan[call_center]
+------------------PhysicalDistribute
+--------------------PhysicalProject
+----------------------filter((cast(d_date as DATETIMEV2(0)) <=
cast(days_add(cast('2002-4-01' as DATEV2), INTERVAL 60 DAY) as
DATETIMEV2(0)))(date_dim.d_date >= 2002-04-01))
+------------------------PhysicalOlapScan[date_dim]
diff --git a/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query69.out b/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query69.out
index 653e6e166b..96ae616ceb 100644
--- a/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query69.out
+++ b/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query69.out
@@ -32,8 +32,9 @@ PhysicalResultSink
------------------------------------PhysicalOlapScan[date_dim]
------------------------PhysicalDistribute
--------------------------hashJoin[INNER_JOIN](customer_demographics.cd_demo_sk
= c.c_current_cdemo_sk)
-----------------------------PhysicalProject
-------------------------------PhysicalOlapScan[customer_demographics]
+----------------------------PhysicalDistribute
+------------------------------PhysicalProject
+--------------------------------PhysicalOlapScan[customer_demographics]
----------------------------PhysicalDistribute
------------------------------hashJoin[RIGHT_ANTI_JOIN](c.c_customer_sk =
web_sales.ws_bill_customer_sk)
--------------------------------PhysicalDistribute
diff --git a/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query94.out b/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query94.out
index b613247de9..26640d46a8 100644
--- a/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query94.out
+++ b/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query94.out
@@ -9,9 +9,6 @@ PhysicalResultSink
------------hashAgg[LOCAL]
--------------PhysicalProject
----------------hashJoin[INNER_JOIN](ws1.ws_ship_date_sk = date_dim.d_date_sk)
-------------------PhysicalProject
---------------------filter((date_dim.d_date >= 2000-02-01)(cast(d_date as
DATETIMEV2(0)) <= cast(days_add(cast('2000-2-01' as DATEV2), INTERVAL 60 DAY)
as DATETIMEV2(0))))
-----------------------PhysicalOlapScan[date_dim]
------------------PhysicalDistribute
--------------------PhysicalProject
----------------------hashJoin[RIGHT_SEMI_JOIN](ws1.ws_order_number =
ws2.ws_order_number)( not (ws_warehouse_sk = ws_warehouse_sk))
@@ -35,4 +32,8 @@ PhysicalResultSink
--------------------------------PhysicalProject
----------------------------------filter((cast(web_company_name as VARCHAR(*))
= 'pri'))
------------------------------------PhysicalOlapScan[web_site]
+------------------PhysicalDistribute
+--------------------PhysicalProject
+----------------------filter((date_dim.d_date >= 2000-02-01)(cast(d_date as
DATETIMEV2(0)) <= cast(days_add(cast('2000-2-01' as DATEV2), INTERVAL 60 DAY)
as DATETIMEV2(0))))
+------------------------PhysicalOlapScan[date_dim]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]