This is an automated email from the ASF dual-hosted git repository.

kxiao pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git

commit 2eb30166c0ab6dbcbacf18f956e3871c522009bd
Author: LiBinfeng <[email protected]>
AuthorDate: Tue Oct 24 22:25:20 2023 +0800

    [Fix](Nereids) fix column statistic derive in outer join estimation (#25586)
    
    Problem:
    During join estimation, the ndv statistics of the upper join's output slots could be derived incorrectly.
    Example:
    We have two tables:
    tableA (a1[ndv = 10.0]) tableB(b1[ndv = 0.0], b2[ndv = 10.0])
    tableA left join tableB on A.a1 = B.b1, where B.b1 has an ndv of zero.
    The problem is that after join estimation, the ndv of B.b2 is changed to 1.0.
    Reason:
    When estimating an outer join, we can assume its row count behaves like an inner join's.
    But we also derived its output column statistics the same way as for an inner join.
    Solved:
    When estimating an outer join, the output slot statistics are now updated separately.
---
 .../apache/doris/nereids/stats/JoinEstimation.java | 11 +++--
 .../doris/nereids/stats/JoinEstimateTest.java      | 54 ++++++++++++++++++++++
 2 files changed, 62 insertions(+), 3 deletions(-)

diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java
index ef4575e3308..800886c177f 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java
@@ -279,6 +279,11 @@ public class JoinEstimation {
      */
     public static Statistics estimate(Statistics leftStats, Statistics 
rightStats, Join join) {
         JoinType joinType = join.getJoinType();
+        Statistics crossJoinStats = new StatisticsBuilder()
+                .setRowCount(Math.max(1, leftStats.getRowCount()) * 
Math.max(1, rightStats.getRowCount()))
+                .putColumnStatistics(leftStats.columnStatistics())
+                .putColumnStatistics(rightStats.columnStatistics())
+                .build();
         if (joinType.isSemiOrAntiJoin()) {
             return estimateSemiOrAnti(leftStats, rightStats, join);
         } else if (joinType == JoinType.INNER_JOIN) {
@@ -289,15 +294,15 @@ public class JoinEstimation {
             Statistics innerJoinStats = estimateInnerJoin(leftStats, 
rightStats, join);
             double rowCount = Math.max(leftStats.getRowCount(), 
innerJoinStats.getRowCount());
             rowCount = Math.max(leftStats.getRowCount(), rowCount);
-            return innerJoinStats.withRowCountAndEnforceValid(rowCount);
+            return crossJoinStats.withRowCountAndEnforceValid(rowCount);
         } else if (joinType == JoinType.RIGHT_OUTER_JOIN) {
             Statistics innerJoinStats = estimateInnerJoin(leftStats, 
rightStats, join);
             double rowCount = Math.max(rightStats.getRowCount(), 
innerJoinStats.getRowCount());
             rowCount = Math.max(rowCount, rightStats.getRowCount());
-            return innerJoinStats.withRowCountAndEnforceValid(rowCount);
+            return crossJoinStats.withRowCountAndEnforceValid(rowCount);
         } else if (joinType == JoinType.FULL_OUTER_JOIN) {
             Statistics innerJoinStats = estimateInnerJoin(leftStats, 
rightStats, join);
-            return 
innerJoinStats.withRowCountAndEnforceValid(leftStats.getRowCount()
+            return 
crossJoinStats.withRowCountAndEnforceValid(leftStats.getRowCount()
                     + rightStats.getRowCount() + innerJoinStats.getRowCount());
         } else if (joinType == JoinType.CROSS_JOIN) {
             return new StatisticsBuilder()
diff --git 
a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/JoinEstimateTest.java 
b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/JoinEstimateTest.java
index f49bf94e2d3..2735e26da46 100644
--- 
a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/JoinEstimateTest.java
+++ 
b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/JoinEstimateTest.java
@@ -91,4 +91,58 @@ public class JoinEstimateTest {
         Assertions.assertNotNull(outAStats);
         Assertions.assertEquals(5, outBStats.ndv);
     }
+
+    @Test
+    public void testOuterJoinStats() {
+        SlotReference a = new SlotReference("a", IntegerType.INSTANCE);
+        SlotReference b = new SlotReference("b", IntegerType.INSTANCE);
+        SlotReference c = new SlotReference("c", IntegerType.INSTANCE);
+        EqualTo eq = new EqualTo(a, b);
+        Statistics leftStats = new 
StatisticsBuilder().setRowCount(100).build();
+        leftStats.addColumnStats(a,
+                new ColumnStatisticBuilder()
+                        .setCount(100)
+                        .setNdv(10)
+                        .build()
+        );
+        Statistics rightStats = new 
StatisticsBuilder().setRowCount(80).build();
+        rightStats.addColumnStats(b,
+                new ColumnStatisticBuilder()
+                        .setCount(80)
+                        .setNdv(0)
+                        .build()
+        ).addColumnStats(c,
+                new ColumnStatisticBuilder()
+                        .setCount(80)
+                        .setNdv(20)
+                        .build()
+        );
+        IdGenerator<GroupId> idGenerator = GroupId.createGenerator();
+        GroupPlan left = new GroupPlan(new Group(idGenerator.getNextId(), new 
LogicalProperties(
+                new Supplier<List<Slot>>() {
+                    @Override
+                    public List<Slot> get() {
+                        return Lists.newArrayList(a);
+                    }
+                })));
+        GroupPlan right = new GroupPlan(new Group(idGenerator.getNextId(), new 
LogicalProperties(
+                new Supplier<List<Slot>>() {
+                    @Override
+                    public List<Slot> get() {
+                        return Lists.newArrayList(b, c);
+                    }
+                })));
+        LogicalJoin join = new LogicalJoin(JoinType.LEFT_OUTER_JOIN, 
Lists.newArrayList(eq),
+                left, right);
+        Statistics outputStats = JoinEstimation.estimate(leftStats, 
rightStats, join);
+        ColumnStatistic outAStats = outputStats.findColumnStatistics(a);
+        Assertions.assertNotNull(outAStats);
+        Assertions.assertEquals(10, outAStats.ndv);
+        ColumnStatistic outBStats = outputStats.findColumnStatistics(b);
+        Assertions.assertNotNull(outAStats);
+        Assertions.assertEquals(0, outBStats.ndv);
+        ColumnStatistic outCStats = outputStats.findColumnStatistics(c);
+        Assertions.assertNotNull(outAStats);
+        Assertions.assertEquals(20.0, outCStats.ndv);
+    }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to