From aa87ba523683cca702129fb68eb2519c89ca152a Mon Sep 17 00:00:00 2001
From: Dean Rasheed <dean.a.rasheed@gmail.com>
Date: Mon, 7 Dec 2020 14:42:33 +0000
Subject: [PATCH 1/2] Improve estimation of OR clauses using multiple extended
 statistics.

When estimating an OR clause using multiple extended statistics
objects, treat the estimate for the set of clauses covered by each
statistics object as independent of the estimates for the others, and
combine them using P(A OR B) = P(A) + P(B) - P(A) * P(B). The overlap
estimates produced for each statistics object apply only to the clauses
it covers, not to clauses covered by other statistics objects.
---
 src/backend/statistics/extended_stats.c | 25 +++++++++++++++++--------
 src/test/regress/expected/stats_ext.out |  2 +-
 2 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c
index 8d3cd091ad..b6bd12c229 100644
--- a/src/backend/statistics/extended_stats.c
+++ b/src/backend/statistics/extended_stats.c
@@ -1356,17 +1356,19 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
 		if (is_or)
 		{
 			bool	   *or_matches = NULL;
-			Selectivity simple_or_sel = 0.0;
+			Selectivity simple_or_sel = 0.0,
+						stat_sel = 0.0;
 			MCVList    *mcv_list;
 
 			/* Load the MCV list stored in the statistics object */
 			mcv_list = statext_mcv_load(stat->statOid);
 
 			/*
-			 * Compute the selectivity of the ORed list of clauses by
-			 * estimating each in turn and combining them using the formula
-			 * P(A OR B) = P(A) + P(B) - P(A AND B).  This allows us to use
-			 * the multivariate MCV stats to better estimate each term.
+			 * Compute the selectivity of the ORed list of clauses covered by
+			 * this statistics object by estimating each in turn and combining
+			 * them using the formula P(A OR B) = P(A) + P(B) - P(A AND B).
+			 * This allows us to use the multivariate MCV stats to better
+			 * estimate the individual terms and their overlap.
 			 *
 			 * Each time we iterate this formula, the clause "A" above is
 			 * equal to all the clauses processed so far, combined with "OR".
@@ -1437,12 +1439,19 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
 														overlap_basesel,
 														mcv_totalsel);
 
-				/* Factor these into the overall result */
-				sel += clause_sel - overlap_sel;
-				CLAMP_PROBABILITY(sel);
+				/* Factor these into the result for this statistics object */
+				stat_sel += clause_sel - overlap_sel;
+				CLAMP_PROBABILITY(stat_sel);
 
 				listidx++;
 			}
+
+			/*
+			 * Factor the result for this statistics object into the overall
+			 * result. We just assume that the results from each separate
+			 * statistics object are independent of one another.
+			 */
+			sel = sel + stat_sel - sel * stat_sel;
 		}
 		else					/* Implicitly-ANDed list of clauses */
 		{
diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out
index dbbe9844b2..6e1c4f3edd 100644
--- a/src/test/regress/expected/stats_ext.out
+++ b/src/test/regress/expected/stats_ext.out
@@ -1706,7 +1706,7 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE (a = 0 A
 SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 OR b = 0 OR c = 0 OR d = 0');
  estimated | actual 
 -----------+--------
-      1714 |   1572
+      1571 |   1572
 (1 row)
 
 DROP TABLE mcv_lists_multi;
-- 
2.26.2

