From 1a8786a75666094d015b33abca4dbb95a0ad72a8 Mon Sep 17 00:00:00 2001
From: Richard Guo <guofenglinux@gmail.com>
Date: Tue, 31 Dec 2024 12:29:36 +0900
Subject: [PATCH v2] Ignore nullingrels when looking up statistics

When looking up statistical data about an expression, we do not need
to concern ourselves with the outer joins that could null the
Vars/PHVs contained in the expression.  Accounting for nullingrels in
the expression could cause estimate_num_groups to count the same Var
multiple times if it's marked with different nullingrels.  This is
incorrect, and could lead to "ERROR:  corrupt MVNDistinct entry" when
searching for multivariate n-distinct.

Furthermore, the nullingrels could prevent us from matching an
expression to expressional index columns or to the expressions in
extended statistics, leading to inaccurate estimates.

To fix, strip out all the nullingrels from the expression before we
look up statistical data about it.
---
 src/backend/utils/adt/selfuncs.c   | 30 +++++++++++++++++++++++++++---
 src/test/regress/expected/join.out | 26 ++++++++++++++++++++++++--
 src/test/regress/sql/join.sql      | 13 +++++++++++++
 3 files changed, 64 insertions(+), 5 deletions(-)

diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 08fa6774d9..aa38b60da7 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -121,6 +121,7 @@
 #include "parser/parse_clause.h"
 #include "parser/parse_relation.h"
 #include "parser/parsetree.h"
+#include "rewrite/rewriteManip.h"
 #include "statistics/statistics.h"
 #include "storage/bufmgr.h"
 #include "utils/acl.h"
@@ -3306,6 +3307,15 @@ add_unique_group_var(PlannerInfo *root, List *varinfos,
 
 	ndistinct = get_variable_numdistinct(vardata, &isdefault);
 
+	/*
+	 * The nullingrels bits within the var could cause the same var to be
+	 * counted multiple times if it is marked with different nullingrels.
+	 * They could also prevent us from matching the var to the expressions in
+	 * extended statistics (see estimate_multivariate_ndistinct).  So strip
+	 * them out first.
+	 */
+	var = remove_nulling_relids(var, root->outer_join_rels, NULL);
+
 	foreach(lc, varinfos)
 	{
 		varinfo = (GroupVarInfo *) lfirst(lc);
@@ -5077,7 +5087,13 @@ examine_variable(PlannerInfo *root, Node *node, int varRelid,
 	{
 		int			relid;
 
-		if (bms_get_singleton_member(varnos, &relid))
+		/*
+		 * Check if the expression is in vars of a single base relation.  We
+		 * need to exclude the OJs that might be present in the expression.
+		 */
+		if (bms_get_singleton_member(bms_difference(varnos,
+													root->outer_join_rels),
+									 &relid))
 		{
 			if (varRelid == 0 || varRelid == relid)
 			{
@@ -5107,8 +5123,6 @@ examine_variable(PlannerInfo *root, Node *node, int varRelid,
 		}
 	}
 
-	bms_free(varnos);
-
 	vardata->var = node;
 	vardata->atttype = exprType(node);
 	vardata->atttypmod = exprTypmod(node);
@@ -5132,6 +5146,14 @@ examine_variable(PlannerInfo *root, Node *node, int varRelid,
 		ListCell   *slist;
 		Oid			userid;
 
+		/*
+		 * The nullingrels bits within the expression could prevent us from
+		 * matching it to expressional index columns or to the expressions in
+		 * extended statistics.  So strip them out first.
+		 */
+		if (bms_overlap(varnos, root->outer_join_rels))
+			node = remove_nulling_relids(node, root->outer_join_rels, NULL);
+
 		/*
 		 * Determine the user ID to use for privilege checks: either
 		 * onerel->userid if it's set (e.g., in case we're accessing the table
@@ -5402,6 +5424,8 @@ examine_variable(PlannerInfo *root, Node *node, int varRelid,
 			}
 		}
 	}
+
+	bms_free(varnos);
 }
 
 /*
diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out
index 0c9b312eaf..079fcf46f0 100644
--- a/src/test/regress/expected/join.out
+++ b/src/test/regress/expected/join.out
@@ -2573,10 +2573,11 @@ where t1.f1 = coalesce(t2.f1, 1);
                ->  Materialize
                      ->  Seq Scan on int4_tbl t2
                            Filter: (f1 > 1)
-         ->  Seq Scan on int4_tbl t3
+         ->  Materialize
+               ->  Seq Scan on int4_tbl t3
    ->  Materialize
          ->  Seq Scan on int4_tbl t4
-(13 rows)
+(14 rows)
 
 explain (costs off)
 select * from int4_tbl t1
@@ -8217,3 +8218,24 @@ SELECT * FROM rescan_bhs t1 LEFT JOIN rescan_bhs t2 ON t1.a IN
 
 RESET enable_seqscan;
 RESET enable_indexscan;
+-- Test that we do not account for nullingrels when looking up statistics
+CREATE TABLE group_tbl (a INT, b INT);
+INSERT INTO group_tbl SELECT 1, 1;
+CREATE STATISTICS group_tbl_stat (ndistinct) ON a, b FROM group_tbl;
+ANALYZE group_tbl;
+EXPLAIN (COSTS OFF)
+SELECT 1 FROM group_tbl t1
+    LEFT JOIN (SELECT a c1, COALESCE(a) c2 FROM group_tbl t2) s ON TRUE
+GROUP BY s.c1, s.c2;
+                 QUERY PLAN                 
+--------------------------------------------
+ Group
+   Group Key: t2.a, (COALESCE(t2.a))
+   ->  Sort
+         Sort Key: t2.a, (COALESCE(t2.a))
+         ->  Nested Loop Left Join
+               ->  Seq Scan on group_tbl t1
+               ->  Seq Scan on group_tbl t2
+(7 rows)
+
+DROP TABLE group_tbl;
diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql
index 8cfc1053cb..779d56cb30 100644
--- a/src/test/regress/sql/join.sql
+++ b/src/test/regress/sql/join.sql
@@ -3033,3 +3033,16 @@ SELECT * FROM rescan_bhs t1 LEFT JOIN rescan_bhs t2 ON t1.a IN
 
 RESET enable_seqscan;
 RESET enable_indexscan;
+
+-- Test that we do not account for nullingrels when looking up statistics
+CREATE TABLE group_tbl (a INT, b INT);
+INSERT INTO group_tbl SELECT 1, 1;
+CREATE STATISTICS group_tbl_stat (ndistinct) ON a, b FROM group_tbl;
+ANALYZE group_tbl;
+
+EXPLAIN (COSTS OFF)
+SELECT 1 FROM group_tbl t1
+    LEFT JOIN (SELECT a c1, COALESCE(a) c2 FROM group_tbl t2) s ON TRUE
+GROUP BY s.c1, s.c2;
+
+DROP TABLE group_tbl;
-- 
2.43.0