This is an automated email from the ASF dual-hosted git repository. kxiao pushed a commit to branch branch-2.0 in repository https://gitbox.apache.org/repos/asf/doris.git
commit 2eb30166c0ab6dbcbacf18f956e3871c522009bd Author: LiBinfeng <[email protected]> AuthorDate: Tue Oct 24 22:25:20 2023 +0800 [Fix](Nereids) fix column statistic derive in outer join estimation (#25586) Problem: During outer join estimation, the ndv statistic of the output slots seen by the upper join could go wrong. Example: we have two tables: tableA (a1[ndv = 10.0]) tableB(b1[ndv = 0.0], b2[ndv = 10.0]) tableA left join tableB on A.a1 = B.b1, where B.b1 has ndv zero. The problem is that after join estimation, the ndv of B.b2 is changed to 1.0. Reason: When estimating an outer join, we can assume its row count behaves like an inner join's, but we also derived its output column statistics the way an inner join does. Solution: When estimating an outer join, the output slot statistics are updated separately. --- .../apache/doris/nereids/stats/JoinEstimation.java | 11 +++-- .../doris/nereids/stats/JoinEstimateTest.java | 54 ++++++++++++++++++++++ 2 files changed, 62 insertions(+), 3 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java index ef4575e3308..800886c177f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java @@ -279,6 +279,11 @@ public class JoinEstimation { */ public static Statistics estimate(Statistics leftStats, Statistics rightStats, Join join) { JoinType joinType = join.getJoinType(); + Statistics crossJoinStats = new StatisticsBuilder() + .setRowCount(Math.max(1, leftStats.getRowCount()) * Math.max(1, rightStats.getRowCount())) + .putColumnStatistics(leftStats.columnStatistics()) + .putColumnStatistics(rightStats.columnStatistics()) + .build(); if (joinType.isSemiOrAntiJoin()) { return estimateSemiOrAnti(leftStats, rightStats, join); } else if (joinType == JoinType.INNER_JOIN) { @@ -289,15 +294,15 @@ public class JoinEstimation { Statistics innerJoinStats = estimateInnerJoin(leftStats, rightStats, join); double rowCount = 
Math.max(leftStats.getRowCount(), innerJoinStats.getRowCount()); rowCount = Math.max(leftStats.getRowCount(), rowCount); - return innerJoinStats.withRowCountAndEnforceValid(rowCount); + return crossJoinStats.withRowCountAndEnforceValid(rowCount); } else if (joinType == JoinType.RIGHT_OUTER_JOIN) { Statistics innerJoinStats = estimateInnerJoin(leftStats, rightStats, join); double rowCount = Math.max(rightStats.getRowCount(), innerJoinStats.getRowCount()); rowCount = Math.max(rowCount, rightStats.getRowCount()); - return innerJoinStats.withRowCountAndEnforceValid(rowCount); + return crossJoinStats.withRowCountAndEnforceValid(rowCount); } else if (joinType == JoinType.FULL_OUTER_JOIN) { Statistics innerJoinStats = estimateInnerJoin(leftStats, rightStats, join); - return innerJoinStats.withRowCountAndEnforceValid(leftStats.getRowCount() + return crossJoinStats.withRowCountAndEnforceValid(leftStats.getRowCount() + rightStats.getRowCount() + innerJoinStats.getRowCount()); } else if (joinType == JoinType.CROSS_JOIN) { return new StatisticsBuilder() diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/JoinEstimateTest.java b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/JoinEstimateTest.java index f49bf94e2d3..2735e26da46 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/JoinEstimateTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/JoinEstimateTest.java @@ -91,4 +91,58 @@ public class JoinEstimateTest { Assertions.assertNotNull(outAStats); Assertions.assertEquals(5, outBStats.ndv); } + + @Test + public void testOuterJoinStats() { + SlotReference a = new SlotReference("a", IntegerType.INSTANCE); + SlotReference b = new SlotReference("b", IntegerType.INSTANCE); + SlotReference c = new SlotReference("c", IntegerType.INSTANCE); + EqualTo eq = new EqualTo(a, b); + Statistics leftStats = new StatisticsBuilder().setRowCount(100).build(); + leftStats.addColumnStats(a, + new ColumnStatisticBuilder() + .setCount(100) 
+ .setNdv(10) + .build() + ); + Statistics rightStats = new StatisticsBuilder().setRowCount(80).build(); + rightStats.addColumnStats(b, + new ColumnStatisticBuilder() + .setCount(80) + .setNdv(0) + .build() + ).addColumnStats(c, + new ColumnStatisticBuilder() + .setCount(80) + .setNdv(20) + .build() + ); + IdGenerator<GroupId> idGenerator = GroupId.createGenerator(); + GroupPlan left = new GroupPlan(new Group(idGenerator.getNextId(), new LogicalProperties( + new Supplier<List<Slot>>() { + @Override + public List<Slot> get() { + return Lists.newArrayList(a); + } + }))); + GroupPlan right = new GroupPlan(new Group(idGenerator.getNextId(), new LogicalProperties( + new Supplier<List<Slot>>() { + @Override + public List<Slot> get() { + return Lists.newArrayList(b, c); + } + }))); + LogicalJoin join = new LogicalJoin(JoinType.LEFT_OUTER_JOIN, Lists.newArrayList(eq), + left, right); + Statistics outputStats = JoinEstimation.estimate(leftStats, rightStats, join); + ColumnStatistic outAStats = outputStats.findColumnStatistics(a); + Assertions.assertNotNull(outAStats); + Assertions.assertEquals(10, outAStats.ndv); + ColumnStatistic outBStats = outputStats.findColumnStatistics(b); + Assertions.assertNotNull(outAStats); + Assertions.assertEquals(0, outBStats.ndv); + ColumnStatistic outCStats = outputStats.findColumnStatistics(c); + Assertions.assertNotNull(outAStats); + Assertions.assertEquals(20.0, outCStats.ndv); + } } --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
