From d144852e78aae6cff4dde85ac5e21cf92c5f8c2d Mon Sep 17 00:00:00 2001
From: Richard Guo <guofenglinux@gmail.com>
Date: Wed, 25 Dec 2024 12:34:18 +0900
Subject: [PATCH v1] Do not account for nullingrels when estimating number of
 groups

When estimating number of groups, we do not need to concern ourselves
with the outer joins that could null the Vars/PHVs contained in the
grouping expressions.  Accounting for nullingrels in the grouping
expressions could cause estimate_num_groups to count the same Var
multiple times if it's marked with different nullingrels.  This is
incorrect, and could lead to "ERROR:  corrupt MVNDistinct entry" when
searching for multivariate n-distinct.

Furthermore, accounting for nullingrels in the grouping expressions
could prevent us from matching a grouping expression to expressional
index columns or to the expressions in extended statistics, leading to
inaccurate estimates.

To fix, strip out all the nullingrels from the grouping expressions
before we estimate number of groups.
---
 src/backend/utils/adt/selfuncs.c   | 12 ++++++++++++
 src/test/regress/expected/join.out | 21 +++++++++++++++++++++
 src/test/regress/sql/join.sql      | 13 +++++++++++++
 3 files changed, 46 insertions(+)

diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 08fa6774d9..57c61963d2 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -121,6 +121,7 @@
 #include "parser/parse_clause.h"
 #include "parser/parse_relation.h"
 #include "parser/parsetree.h"
+#include "rewrite/rewriteManip.h"
 #include "statistics/statistics.h"
 #include "storage/bufmgr.h"
 #include "utils/acl.h"
@@ -3446,6 +3447,17 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows,
 	if (groupExprs == NIL || (pgset && *pgset == NIL))
 		return 1.0;
 
+	/*
+	 * Strip out all the nullingrels bits from the grouping expressions.
+	 * These nullingrels bits could cause the same Var to be counted multiple
+	 * times if it's marked with different nullingrels.  They could also
+	 * prevent us from matching a grouping expression to expressional index
+	 * columns or to the expressions in extended statistics.
+	 */
+	groupExprs = (List *) remove_nulling_relids((Node *) copyObject(groupExprs),
+												root->outer_join_rels,
+												NULL);
+
 	/*
 	 * Count groups derived from boolean grouping expressions.  For other
 	 * expressions, find the unique Vars used, treating an expression as a Var
diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out
index 0c9b312eaf..ef737a4946 100644
--- a/src/test/regress/expected/join.out
+++ b/src/test/regress/expected/join.out
@@ -8217,3 +8217,24 @@ SELECT * FROM rescan_bhs t1 LEFT JOIN rescan_bhs t2 ON t1.a IN
 
 RESET enable_seqscan;
 RESET enable_indexscan;
+-- Test that we do not account for nullingrels when estimating number of groups
+CREATE TABLE group_tbl (a INT, b INT);
+INSERT INTO group_tbl SELECT 1, 1;
+CREATE STATISTICS group_tbl_stat (ndistinct) ON a, b FROM group_tbl;
+ANALYZE group_tbl;
+EXPLAIN (COSTS OFF)
+SELECT 1 FROM group_tbl t1
+    LEFT JOIN (SELECT a c1, COALESCE(a) c2 FROM group_tbl t2) s ON TRUE
+GROUP BY s.c1, s.c2;
+                 QUERY PLAN                 
+--------------------------------------------
+ Group
+   Group Key: t2.a, (COALESCE(t2.a))
+   ->  Sort
+         Sort Key: t2.a, (COALESCE(t2.a))
+         ->  Nested Loop Left Join
+               ->  Seq Scan on group_tbl t1
+               ->  Seq Scan on group_tbl t2
+(7 rows)
+
+DROP TABLE group_tbl;
diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql
index 8cfc1053cb..ff6ca94aa2 100644
--- a/src/test/regress/sql/join.sql
+++ b/src/test/regress/sql/join.sql
@@ -3033,3 +3033,16 @@ SELECT * FROM rescan_bhs t1 LEFT JOIN rescan_bhs t2 ON t1.a IN
 
 RESET enable_seqscan;
 RESET enable_indexscan;
+
+-- Test that we do not account for nullingrels when estimating number of groups
+CREATE TABLE group_tbl (a INT, b INT);
+INSERT INTO group_tbl SELECT 1, 1;
+CREATE STATISTICS group_tbl_stat (ndistinct) ON a, b FROM group_tbl;
+ANALYZE group_tbl;
+
+EXPLAIN (COSTS OFF)
+SELECT 1 FROM group_tbl t1
+    LEFT JOIN (SELECT a c1, COALESCE(a) c2 FROM group_tbl t2) s ON TRUE
+GROUP BY s.c1, s.c2;
+
+DROP TABLE group_tbl;
-- 
2.43.0

