Hi, thank you for reviewing it. Tue, 02 Jun 2026 at 0:18, Zsolt Parragi (<[email protected]>) wrote:
> > The strongest > > case is a top-level AND condition with equality clauses for all columns > > I see one issue related to that with the current patch, having the > same amount of conditions as the table doesn't mean that we have > conditions for all columns. I've fixed this in the v2 patch attached above (also attached here for completeness). It focuses on the basic full-dimensional top-level AND equality miss Now we construct a Bitmapset with the columns the clause covers and compare it to `stat->keys` that contains the columns covered by the MCV statistic. Best regards, Enrique.
From aa07d8660ff8a0a3e33d3ada65c73b8761990260 Mon Sep 17 00:00:00 2001 From: Enrique Sanchez Cardoso <[email protected]> Date: Wed, 3 Jun 2026 23:12:16 +0200 Subject: [PATCH v2 1/1] Cap selectivity when values are not in multi-column mcv Selectivity can't be > last MCV item (least common) selectivity when they are AND clauses and cover all the MCV dimensions. --- src/backend/statistics/extended_stats.c | 79 ++++++++++++++++++- src/backend/statistics/mcv.c | 9 ++- .../statistics/extended_stats_internal.h | 3 +- src/test/regress/expected/stats_ext.out | 26 ++++++ src/test/regress/sql/stats_ext.sql | 24 ++++++ 5 files changed, 138 insertions(+), 3 deletions(-) diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c index 2b83355d26e..748d3e884c4 100644 --- a/src/backend/statistics/extended_stats.c +++ b/src/backend/statistics/extended_stats.c @@ -1705,6 +1705,67 @@ statext_is_compatible_clause(PlannerInfo *root, Node *clause, Index relid, return true; } +/* + * mcv_can_cap + * Determines whether the MCV selectivity estimate can be capped at the + * frequency of the least common item in the MCV list. + * + * When a combination of values does not appear in the MCV list, its true + * selectivity must be lower than the frequency of the least common tracked + * combination. We can exploit this to cap the combined selectivity estimate, + * but only when the following conditions are both satisfied: + * + * 1. The clauses cover all dimensions of the statistics object, i.e. + * covered_attnums equals stat->keys exactly. If any dimension is + * unconstrained, the absence of a match in the MCV list does not bound + * the selectivity of the full combination. + * + * 2. Every clause is an equality-like condition: either an equality operator, + * an IS NULL test, or a bare boolean Var. Range or inequality predicates + * can match many values, so the per-combination argument no longer applies. + * + * Returns true if both conditions hold and capping is valid. + */ +static bool +mcv_can_cap(StatisticExtInfo *stat, Bitmapset *covered_attnums, List *stat_clauses) +{ + ListCell *lc; + + /* + * Expressions are not supported, they can match multiple rows. + * Also, the clauses must cover all dimensions of the MCV list. + */ + if (stat->exprs != NULL || !bms_equal(covered_attnums, stat->keys)) + { + return false; + } + + foreach(lc, stat_clauses) + { + RestrictInfo *rinfo = (RestrictInfo *) lfirst(lc); + Node *clause = (Node *) rinfo->clause; + + /* = */ + if (is_opclause(clause) && get_oprrest(((const OpExpr *) clause)->opno) == F_EQSEL) + continue; + + /* IS NULL */ + if (IsA(clause, NullTest) && ((const NullTest *) clause)->nulltesttype == IS_NULL) + continue; + + /* = TRUE */ + if (IsA(clause, Var)) + continue; + + /* = FALSE */ + if (IsA(clause, BoolExpr) && ((const BoolExpr *) clause)->boolop == NOT_EXPR && IsA(linitial(((const BoolExpr *) clause)->args), Var)) + continue; + + return false; + } + return true; +} + /* * statext_mcv_clauselist_selectivity * Estimate clauses using the best multi-column statistics. @@ -1800,6 +1861,8 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli StatisticExtInfo *stat; List *stat_clauses; Bitmapset *simple_clauses; + Bitmapset *covered_attnums; + bool can_cap; /* find the best suited statistics object for these attnums */ stat = choose_best_statistics(rel->statlist, STATS_EXT_MCV, rte->inh, @@ -1822,6 +1885,9 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli /* record which clauses are simple (single column or expression) */ simple_clauses = NULL; + /* record all attnums to check if MCV covers all of them */ + covered_attnums = NULL; + listidx = -1; foreach(l, clauses) { @@ -1872,6 +1938,8 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli stat_clauses = lappend(stat_clauses, (Node *) lfirst(l)); *estimatedclauses = bms_add_member(*estimatedclauses, listidx); + covered_attnums = bms_add_members(covered_attnums, list_attnums[listidx]); + /* * Reset the pointers, so that choose_best_statistics knows this * clause was estimated and does not consider it again. @@ -1883,6 +1951,9 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli list_exprs[listidx] = NULL; } + can_cap = mcv_can_cap(stat, covered_attnums, stat_clauses); + bms_free(covered_attnums); + if (is_or) { bool *or_matches = NULL; @@ -1989,6 +2060,7 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli mcv_sel, mcv_basesel, mcv_totalsel, + mcv_cap, stat_sel; /* @@ -2006,7 +2078,8 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli mcv_sel = mcv_clauselist_selectivity(root, stat, stat_clauses, varRelid, jointype, sjinfo, rel, &mcv_basesel, - &mcv_totalsel); + &mcv_totalsel, + &mcv_cap); /* Combine the simple and multi-column estimates. */ stat_sel = mcv_combine_selectivities(simple_sel, @@ -2014,6 +2087,10 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli mcv_basesel, mcv_totalsel); + /* Cap to the least common MCV item when no MCV items matched. */ + if (can_cap && stat_sel > mcv_cap) + stat_sel = mcv_cap; + /* Factor this into the overall result */ sel *= stat_sel; } diff --git a/src/backend/statistics/mcv.c b/src/backend/statistics/mcv.c index 0b7da605a4c..6617c297eab 100644 --- a/src/backend/statistics/mcv.c +++ b/src/backend/statistics/mcv.c @@ -2047,7 +2047,8 @@ mcv_clauselist_selectivity(PlannerInfo *root, StatisticExtInfo *stat, List *clauses, int varRelid, JoinType jointype, SpecialJoinInfo *sjinfo, RelOptInfo *rel, - Selectivity *basesel, Selectivity *totalsel) + Selectivity *basesel, Selectivity *totalsel, + Selectivity *cap) { int i; MCVList *mcv; @@ -2057,6 +2058,9 @@ mcv_clauselist_selectivity(PlannerInfo *root, StatisticExtInfo *stat, /* match/mismatch bitmap for each MCV item */ bool *matches = NULL; + /* default: no cap on selectivity */ + *cap = 1.0; + /* load the MCV list stored in the statistics object */ mcv = statext_mcv_load(stat->statOid, rte->inh); @@ -2078,6 +2082,9 @@ mcv_clauselist_selectivity(PlannerInfo *root, StatisticExtInfo *stat, } } + if (s == 0.0 && mcv->nitems > 0) + *cap = mcv->items[mcv->nitems - 1].frequency; + return s; } diff --git a/src/include/statistics/extended_stats_internal.h b/src/include/statistics/extended_stats_internal.h index c775442f2ee..01b5f67b843 100644 --- a/src/include/statistics/extended_stats_internal.h +++ b/src/include/statistics/extended_stats_internal.h @@ -129,7 +129,8 @@ extern Selectivity mcv_clauselist_selectivity(PlannerInfo *root, SpecialJoinInfo *sjinfo, RelOptInfo *rel, Selectivity *basesel, - Selectivity *totalsel); + Selectivity *totalsel, + Selectivity *cap); extern Selectivity mcv_clause_selectivity_or(PlannerInfo *root, StatisticExtInfo *stat, diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out index 37070c1a896..a15fcd92d1a 100644 --- a/src/test/regress/expected/stats_ext.out +++ b/src/test/regress/expected/stats_ext.out @@ -2928,6 +2928,32 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE (a = 0 (1 row) DROP TABLE mcv_lists_partial; +-- P(a=0)=0.5, P(b=0)=0.5, P(c=TRUE)=0.5, P(d='{1,2}')=0.5 so the independence estimate is 0.0625 * N. +-- After building MCV statistics the cap limits the combined estimate to the +-- least-common MCV frequency, eliminating most of the over-estimation. +CREATE TABLE mcv_cap (a INT, b INT, c BOOL, d INTEGER[]) WITH (autovacuum_enabled = off); +INSERT INTO mcv_cap + SELECT 0, b, TRUE, '{}' FROM generate_series(1, 99) b, generate_series(1, 100) r; +INSERT INTO mcv_cap + SELECT a, 0, NULL, '{1, 2}' FROM generate_series(1, 99) a, generate_series(1, 100) r; +ANALYZE mcv_cap; +-- without MCV statistics: independence gives 0.5 * 0.5 * 0.5 * 0.5 * 19800 = 1238 rows +SELECT * FROM check_estimated_rows($$SELECT * FROM mcv_cap WHERE a = 0 AND b = 0 AND c = TRUE AND d = '{1, 2}'$$); + estimated | actual +-----------+-------- + 1238 | 0 +(1 row) + +CREATE STATISTICS mcv_cap_stats (mcv) ON a, b, c, d FROM mcv_cap; +ANALYZE mcv_cap; +-- with MCV statistics: bounded by least MCV frequency +SELECT * FROM check_estimated_rows($$SELECT * FROM mcv_cap WHERE a = 0 AND b = 0 AND c = TRUE AND d = '{1, 2}'$$); + estimated | actual +-----------+-------- + 100 | 0 +(1 row) + +DROP TABLE mcv_cap; -- check the ability to use multiple MCV lists CREATE TABLE mcv_lists_multi ( a INTEGER, diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql index 3cc6012b822..dc77670706d 100644 --- a/src/test/regress/sql/stats_ext.sql +++ b/src/test/regress/sql/stats_ext.sql @@ -1465,6 +1465,30 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE (a = 0 DROP TABLE mcv_lists_partial; +-- P(a=0)=0.5, P(b=0)=0.5, P(c=TRUE)=0.5, P(d='{1,2}')=0.5 so the independence estimate is 0.0625 * N. +-- After building MCV statistics the cap limits the combined estimate to the +-- least-common MCV frequency, eliminating most of the over-estimation. +CREATE TABLE mcv_cap (a INT, b INT, c BOOL, d INTEGER[]) WITH (autovacuum_enabled = off); + +INSERT INTO mcv_cap + SELECT 0, b, TRUE, '{}' FROM generate_series(1, 99) b, generate_series(1, 100) r; + +INSERT INTO mcv_cap + SELECT a, 0, NULL, '{1, 2}' FROM generate_series(1, 99) a, generate_series(1, 100) r; + +ANALYZE mcv_cap; + +-- without MCV statistics: independence gives 0.5 * 0.5 * 0.5 * 0.5 * 19800 = 1238 rows +SELECT * FROM check_estimated_rows($$SELECT * FROM mcv_cap WHERE a = 0 AND b = 0 AND c = TRUE AND d = '{1, 2}'$$); + +CREATE STATISTICS mcv_cap_stats (mcv) ON a, b, c, d FROM mcv_cap; +ANALYZE mcv_cap; + +-- with MCV statistics: bounded by least MCV frequency +SELECT * FROM check_estimated_rows($$SELECT * FROM mcv_cap WHERE a = 0 AND b = 0 AND c = TRUE AND d = '{1, 2}'$$); + +DROP TABLE mcv_cap; + -- check the ability to use multiple MCV lists CREATE TABLE mcv_lists_multi ( a INTEGER, -- 2.43.0
