Re: Hash-based MCV matching for large IN-lists

Ilia Evdokimov Wed, 25 Feb 2026 14:46:28 -0800

I've addressed the review comments mentioned above.

David made a very good observation: for unique columns, where eachiteration effectively returns the same per-element selectivity, there isno need to iterate at all. In such cases we can reduce the computationto a closed-form expression, i.e. O(1) instead of running the loop O(N).

I applied this idea to unique columns and cases falling back toDEFAULT_EQ_SEL. In both cases the loop can be replaced with aclosed-from formula implemented in calculate_combined_selectivity(). Theformula mirros the existing independent/disjoint probability model: ANY(sel = 1 - (1 - s) ^ length or length * s ), ALL (sel = s ^ length or 1- length*(1 - s)). It would be good to carefully review that this isfully equivalent to the current accumulation logic.

I also exprimented with applying the same idea to elements that are notfound in MCV, are not Const, and effectively found in MCV with more thanone count. Those cases can still be accumulated usingaccum_scalararray_prob(), but potentially grouped to reduce repeated work.


Overall, the optimization work can be logically split into three parts:

1. Degenerate NULL case O(N) -> O(1) [0]
2. Identical non-NULL per-element selectivity O(N) -> O(1) (can be
   split into a separate thread if prederred)
3. MCV matching via hashing O(M*N) -> O(M+N) (current thread)

Feedback on how to best structure or split this work would be appreciated.

About op_is_reserved. It seems we should assign op_is_reserved = true,because we don't reverse types like eqjoinsel_semi(). If IN-list smallerthan MCV-list we reverse it by fmgr_info(hash_mcv ? hashLeft :hashRight, &hash_proc). Thanks for this remark.


Thoughts?

--
Best regards,
Ilia Evdokimov,
Tantor Labs LLC,
https://tantorlabs.com/

From 606d29dbb3e316796df3a581811ca6c98ac5b3a6 Mon Sep 17 00:00:00 2001
From: Evdokimov Ilia <[email protected]>
Date: Wed, 25 Feb 2026 23:34:37 +0300
Subject: [PATCH v6 3/3] Use hash-based MCV matching for ScalarArrayOpExpr 
 selectivity

When estimating selectivity for ScalarArrayOpExpr (IN / ANY / ALL) with
available MCV statistics, the planner currently matches IN-list elements
against the MCV array using nested loops. For large IN-lists and/or large
MCV lists this leads to O(N*M) planning-time behavior.

This patch adds a hash-based matching strategy, similar to the one used
in join selectivity estimation. When MCV statistics are available and the
operator supports hashing, the smaller of the two inputs (MCV list or
IN-list constant elements) is chosen as the hash table build side, and
the other side is scanned once, reducing complexity to O(N+M).

The hash-based path is restricted to equality and inequality operators
that use eqsel()/neqsel(), and is applied only when suitable hash
functions and MCV statistics are available.
---
 src/backend/utils/adt/selfuncs.c | 450 ++++++++++++++++++++++++++++++-
 1 file changed, 440 insertions(+), 10 deletions(-)

diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index f6091a576d8..b6a9cf7844f 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -146,23 +146,27 @@
 /*
  * In production builds, switch to hash-based MCV matching when the lists are
  * large enough to amortize hash setup cost.  (This threshold is compared to
- * the sum of the lengths of the two MCV lists.  This is simplistic but seems
+ * the sum of the lengths of the two lists.  This is simplistic but seems
  * to work well enough.)  In debug builds, we use a smaller threshold so that
  * the regression tests cover both paths well.
  */
 #ifndef USE_ASSERT_CHECKING
-#define EQJOINSEL_MCV_HASH_THRESHOLD 200
+#define MCV_HASH_THRESHOLD 200
 #else
-#define EQJOINSEL_MCV_HASH_THRESHOLD 20
+#define MCV_HASH_THRESHOLD 20
 #endif
 
-/* Entries in the simplehash hash table used by eqjoinsel_find_matches */
+/*
+ * Entries in the simplehash hash table used by
+ * eqjoinsel_find_matches and scalararray_mcv_hash_match
+ */
 typedef struct MCVHashEntry
 {
 	Datum		value;			/* the value represented by this entry */
 	int			index;			/* its index in the relevant AttStatsSlot */
 	uint32		hash;			/* hash code for the Datum */
 	char		status;			/* status code used by simplehash.h */
+	int			count;			/* number of occurrences of current value in */
 } MCVHashEntry;
 
 /* private_data for the simplehash hash table */
@@ -184,6 +188,15 @@ get_relation_stats_hook_type get_relation_stats_hook = NULL;
 get_index_stats_hook_type get_index_stats_hook = NULL;
 
 static double eqsel_internal(PG_FUNCTION_ARGS, bool negate);
+static double scalararray_mcv_hash_match(VariableStatData *vardata, Oid operator, Oid collation,
+										 Selectivity s2, Datum *elem_values,
+										 bool *elem_nulls, int num_elems, bool *elem_const,
+										 Oid nominal_element_type, bool useOr, bool isEquality,
+										 bool isInequality);
+static void accum_scalararray_prob(double individual_s, int count,
+									bool useOr, bool isEquality,
+									bool isInequality, double nullfrac,
+									double *selec, double *s1disjoint);
 static Selectivity calculate_combined_selectivity(Selectivity s2, int num_elems,
 							  bool useOr,
 							  bool isEquality, bool isInequality);
@@ -1951,6 +1964,36 @@ calculate_combined_selectivity(Selectivity s2, int num_elems, bool useOr, bool i
 	return s1;
 }
 
+/*
+ * Accumulate the selectivity contribution of a single array element
+ * into the running ScalarArrayOpExpr selectivity estimate.
+ */
+static void
+accum_scalararray_prob(double individual_s, int count, bool useOr, bool isEquality,
+						bool isInequality, double nullfrac, double *selec, double *s1disjoint)
+{
+	if (count <= 0)
+		return;
+
+	if (isInequality)
+		individual_s = 1.0 - individual_s - nullfrac;
+
+	CLAMP_PROBABILITY(individual_s);
+
+	if (useOr)
+	{
+		*selec = 1.0 - (1.0 - *selec) * pow(1.0 - individual_s, count);
+		if (isEquality)
+			*s1disjoint += individual_s * count;
+	}
+	else
+	{
+		*selec = (*selec) * pow(individual_s, count);
+		if (isInequality)
+			*s1disjoint += count * (individual_s - 1.0);
+	}
+}
+
 /*
  *		scalararraysel		- Selectivity of ScalarArrayOpExpr Node.
  */
@@ -2102,6 +2145,7 @@ scalararraysel(PlannerInfo *root,
 			Selectivity s2 = -1.0;
 			Node       *other_op = NULL;
 			bool        var_on_left;
+			bool	   *elem_const = NULL;
 
 			/*
 			 * If the clause is of the form "var OP something" or
@@ -2134,16 +2178,25 @@ scalararraysel(PlannerInfo *root,
 						s2 = 1.0 - DEFAULT_EQ_SEL;
 				}
 
-				ReleaseVariableStats(vardata);
-
 				if (s2 >= 0.0)
 				{
 					CLAMP_PROBABILITY(s2);
 
 					s1 = calculate_combined_selectivity(s2, num_elems, useOr, isEquality, isInequality);
 
+					ReleaseVariableStats(vardata);
+
 					return s1;
 				}
+
+				s1 = scalararray_mcv_hash_match(&vardata, operator, clause->inputcollid, s2,
+												elem_values, elem_nulls, num_elems, elem_const,
+												nominal_element_type, useOr, isEquality, isInequality);
+
+				ReleaseVariableStats(vardata);
+
+				if (s1 >= 0.0)
+					return s1;
 			}
 			else
 			{
@@ -2250,16 +2303,78 @@ scalararraysel(PlannerInfo *root,
 			 * variable, then fall back to default code path to compute
 			 * default selectivity.
 			 */
-			if (!get_restriction_variable(root, clause->args, varRelid,
+			if (get_restriction_variable(root, clause->args, varRelid,
 										 &vardata, &other_op, &var_on_left))
+			{
+				Datum	   *elem_values;
+				bool	   *elem_nulls;
+				bool	   *elem_const;
+				ListCell   *lc;
+
+				elem_values = palloc_array(Datum, num_elems);
+				elem_nulls = palloc0_array(bool, num_elems);
+				elem_const = palloc0_array(bool, num_elems);
+
+				/*
+				 * Build arrays describing ARRAY[] elements:
+				 * - elem_values: Datum value for Const elements
+				 * - elem_nulls: whether element is NULL
+				 * - elem_const: whether element is a Const node
+				 */
+				foreach(lc, arrayexpr->elements)
+				{
+					Node	*elem_value = (Node *) lfirst(lc);
+					int		i = foreach_current_index(lc);
+
+					if (IsA(elem_value, Const))
+					{
+						elem_values[i] = ((Const *) elem_value)->constvalue;
+						elem_nulls[i] = ((Const *) elem_value)->constisnull;
+						elem_const[i] = true;
+					}
+					else
+					{
+						elem_nulls[i] = false;
+						elem_const[i] = false;
+					}
+
+					/* Selectivity of "WHERE x NOT IN (NULL, ... )" is always 0 */
+					if (!useOr && elem_nulls[i])
+					{
+						pfree(elem_values);
+						pfree(elem_nulls);
+						pfree(elem_const);
+
+						ReleaseVariableStats(vardata);
+
+						return (Selectivity) 0.0;
+					}
+				}
+
+				/* Compute per-element selectivity via eqsel()/neqsel semantics. */
+				s2 = var_eq_non_const(&vardata, operator, clause->inputcollid,
+											  other_op, var_on_left, isInequality);
+
+				s1 = scalararray_mcv_hash_match(&vardata, operator, clause->inputcollid, s2,
+												elem_values, elem_nulls, num_elems, elem_const,
+												nominal_element_type, useOr, isEquality, isInequality);
+
+				pfree(elem_values);
+				pfree(elem_nulls);
+				pfree(elem_const);
+
+				ReleaseVariableStats(vardata);
+
+				if (s1 >= 0.0)
+					return s1;
+			}
+			else
 			{
 				s2 = (isInequality) ? (1.0 - DEFAULT_EQ_SEL) : DEFAULT_EQ_SEL;
 				s1 = calculate_combined_selectivity(s2, num_elems, useOr, isEquality, isInequality);
 
 				return s1;
 			}
-			else
-				ReleaseVariableStats(vardata);
 		}
 
 		/*
@@ -2376,6 +2491,321 @@ scalararraysel(PlannerInfo *root,
 	return s1;
 }
 
+/*
+ * Estimate selectivity of a ScalarArrayOpExpr (ANY/ALL) using MCV statistics
+ * with hash-based matching.
+ *
+ * This function follows the same probability model as the generic
+ * ScalarArrayOpExpr selectivity code (independent or disjoint probabilities
+ * for OR/AND combinations), but attempts to speed up matching between
+ * IN-list elements and the column's most-common-values (MCV) statistics by
+ * using hashing instead of nested loops.
+ *
+ * MCV statistics are used only to obtain per-value selectivities for
+ * constants that match MCV entries.  All probabilities are combined using
+ * the standard ANY/ALL formulas, exactly as in the generic estimator.
+ *
+ * The function may return -1.0 to indicate that hash-based MCV estimation
+ * is not applicable (for example, missing statistics, unsupported operator,
+ * or unavailable hash functions), in which case the caller should fall back
+ * to the generic ScalarArrayOpExpr selectivity estimation.
+ *
+ * Inputs:
+ *	vardata: statistics and metadata for the variable being estimated
+ *	operator: equality or inequality operator to apply
+ *	collation: OID of collation to use
+ *  nonconst_sel: selectivity of non-const element
+ *	elem_values: array of IN-list element values
+ *	elem_nulls: array indicating which IN-list elements are NULL
+ *	elem_const: array indicating which IN-list elements are Const nodes.
+ *              array is NULL if all elemnets are const.
+ *	num_elems: number of IN-list elements
+ *	nominal_element_type: type of IN-list elements
+ *	useOr: true if elements are combined using OR semantics, false for AND
+ *	isEquality: true if the operator behaves like equality
+ *	isInequality: true if the operator behaves like inequality
+ *
+ * Result:
+ *	Selectivity estimate in the range [0.0, 1.0], or -1.0 if no estimate
+ *	could be produced by this function.
+ *
+ * Note:
+ *	This function assumes that the operator’s selectivity behavior matches
+ *	eqsel()/neqsel semantics.  It must not be used for operators with custom
+ *	or non-standard selectivity behavior.
+ */
+static double
+scalararray_mcv_hash_match(VariableStatData *vardata, Oid operator, Oid collation, Selectivity nonconst_sel,
+						   Datum *elem_values, bool *elem_nulls, int num_elems, bool *elem_const,
+						   Oid nominal_element_type, bool useOr, bool isEquality,
+						   bool isInequality)
+{
+	Form_pg_statistic stats;
+	AttStatsSlot sslot;
+	FmgrInfo	eqproc;
+	double		selec = -1.0,
+				s1disjoint,
+				nullfrac = 0.0;
+	Oid			hashLeft = InvalidOid,
+				hashRight = InvalidOid,
+				opfuncoid;
+	bool		have_mcvs = false;
+
+	opfuncoid = get_opcode(operator);
+	memset(&sslot, 0, sizeof(sslot));
+
+	if (HeapTupleIsValid(vardata->statsTuple))
+	{
+		if (statistic_proc_security_check(vardata, opfuncoid))
+			have_mcvs = get_attstatsslot(&sslot, vardata->statsTuple,
+										 STATISTIC_KIND_MCV, InvalidOid,
+										 ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS);
+	}
+
+	if (have_mcvs)
+	{
+		/*
+		 * If the MCV list and IN-list are large enough, and the operator
+		 * supports hashing, attempt to use hash functions so that MCV–IN
+		 * matching can be done in O(N+M) instead of O(N×M).
+		 */
+		if (sslot.nvalues + num_elems >= MCV_HASH_THRESHOLD)
+		{
+			fmgr_info(opfuncoid, &eqproc);
+			(void) get_op_hash_functions(operator, &hashLeft, &hashRight);
+		}
+	}
+
+	if (have_mcvs && OidIsValid(hashLeft) && OidIsValid(hashRight))
+	{
+		/* Use a hash table to speed up the matching */
+		LOCAL_FCINFO(fcinfo, 2);
+		LOCAL_FCINFO(hash_fcinfo, 1);
+		MCVHashTable_hash *hashTable;
+		FmgrInfo	hash_proc;
+		MCVHashContext hashContext;
+		double		sumallcommon = 0.0,
+					nonmcv_selec = 0.0;
+		bool		isdefault;
+		bool		hash_mcv;
+		double		otherdistinct;
+		Datum	   *arrayHash;
+		Datum	   *arrayProbe;
+		int			nvaluesHash;
+		int			nvaluesProbe;
+		int			nonmcv_cnt = num_elems;
+		int			nonconst_cnt = 0;
+
+		/* Grab the nullfrac for use below. */
+		stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
+		nullfrac = stats->stanullfrac;
+
+		selec = s1disjoint = (useOr ? 0.0 : 1.0);
+
+		InitFunctionCallInfoData(*fcinfo, &eqproc, 2, collation,
+								 NULL, NULL);
+		fcinfo->args[0].isnull = false;
+		fcinfo->args[1].isnull = false;
+
+		for (int i = 0; i < sslot.nvalues; i++)
+			sumallcommon += sslot.numbers[i];
+
+		/*
+		 * Compute the total probability mass of all non-MCV values. This is
+		 * the part of the column distribution not covered by MCVs.
+		 */
+		nonmcv_selec = 1.0 - sumallcommon - nullfrac;
+		CLAMP_PROBABILITY(nonmcv_selec);
+
+		/*
+		 * Approximate the per-value probability of a non-MCV constant by
+		 * dividing the remaining probability mass by the number of other
+		 * distinct values.
+		 */
+		otherdistinct = get_variable_numdistinct(vardata, &isdefault) - sslot.nnumbers;
+		if (otherdistinct > 1)
+			nonmcv_selec /= otherdistinct;
+
+		if (sslot.nnumbers > 0 && nonmcv_selec > sslot.numbers[sslot.nnumbers - 1])
+			nonmcv_selec = sslot.numbers[sslot.nnumbers - 1];
+
+		/* Make sure we build the hash table on the smaller array. */
+		if (sslot.nvalues <= num_elems)
+		{
+			hash_mcv = true;
+			nvaluesHash = sslot.nvalues;
+			nvaluesProbe = num_elems;
+			arrayHash = sslot.values;
+			arrayProbe = elem_values;
+		}
+		else
+		{
+			hash_mcv = false;
+			nvaluesHash = num_elems;
+			nvaluesProbe = sslot.nvalues;
+			arrayHash = elem_values;
+			arrayProbe = sslot.values;
+		}
+
+		fmgr_info(hash_mcv ? hashLeft : hashRight, &hash_proc);
+		InitFunctionCallInfoData(*hash_fcinfo, &hash_proc, 1, collation,
+								 NULL, NULL);
+		hash_fcinfo->args[0].isnull = false;
+
+		hashContext.equal_fcinfo = fcinfo;
+		hashContext.hash_fcinfo = hash_fcinfo;
+		hashContext.op_is_reversed = true;
+		hashContext.insert_mode = true;
+
+		get_typlenbyval(hash_mcv ? sslot.valuetype : nominal_element_type,
+						&hashContext.hash_typlen,
+						&hashContext.hash_typbyval);
+
+		hashTable = MCVHashTable_create(CurrentMemoryContext,
+										nvaluesHash,
+										&hashContext);
+
+		/* Build a hash table over the smaller input side. */
+		for (int i = 0; i < nvaluesHash; i++)
+		{
+			bool		found = false;
+			MCVHashEntry *entry;
+
+			/*
+			 * When hashing IN-list values (hash_mcv == false), we only insert
+			 * constant, non-NULL elements.  NULL and non-Const elements are
+			 * counted separately, because they cannot participate in MCV
+			 * matching and must be handled later using generic selectivity
+			 * estimation.
+			 */
+			if (!hash_mcv)
+			{
+				if (elem_nulls[i])
+				{
+					nonmcv_cnt--;
+					continue;
+				}
+
+				if (elem_const != NULL && !elem_const[i])
+				{
+					nonmcv_cnt--;
+					nonconst_cnt++;
+					continue;
+				}
+			}
+
+			entry = MCVHashTable_insert(hashTable, arrayHash[i], &found);
+
+			/*
+			 * entry->count tracks how many times the same value appears, so
+			 * that duplicate IN-list elements can be folded into the
+			 * probability calculation.
+			 */
+			if (likely(!found))
+			{
+				entry->index = i;
+				entry->count = 1;
+			}
+			else
+				entry->count++;
+		}
+
+		hashContext.insert_mode = false;
+		if (hashLeft != hashRight)
+		{
+			fmgr_info(hash_mcv ? hashRight : hashLeft, &hash_proc);
+			/* Resetting hash_fcinfo is probably unnecessary, but be safe */
+			InitFunctionCallInfoData(*hash_fcinfo, &hash_proc, 1, collation,
+									 NULL, NULL);
+			hash_fcinfo->args[0].isnull = false;
+		}
+
+		for (int i = 0; i < nvaluesProbe; i++)
+		{
+			MCVHashEntry *entry;
+			Selectivity s1;
+			int			nvaluesmcv;
+
+			/*
+			 * When probing with IN-list elements, ignore NULLs and non-Const
+			 * expressions: they cannot be matched against MCVs and will be
+			 * accounted for later by generic estimation.
+			 */
+			if (hash_mcv)
+			{
+				if (elem_nulls[i])
+				{
+					nonmcv_cnt--;
+					continue;
+				}
+
+				if (elem_const != NULL && !elem_const[i])
+				{
+					nonmcv_cnt--;
+					nonconst_cnt++;
+					continue;
+				}
+			}
+
+			entry = MCVHashTable_lookup(hashTable, arrayProbe[i]);
+
+			/*
+			 * If found, obtain its MCV frequency and remember how many values
+			 * on the hashed side map to this entry.
+			 */
+			if (entry != NULL)
+			{
+				s1 = hash_mcv ? sslot.numbers[entry->index]
+					: sslot.numbers[i];
+
+				nvaluesmcv = entry->count;
+
+				accum_scalararray_prob(s1, nvaluesmcv, useOr, isEquality, isInequality,
+										nullfrac, &selec, &s1disjoint);
+
+				/* Matched values are no longer considered non-MCV */
+				nonmcv_cnt -= nvaluesmcv;
+			}
+		}
+
+		/*
+		 * Account for constant IN-list values that did not match any MCV.
+		 *
+		 * Each such value is assumed to have probability = nonmcv_selec,
+		 * derived from the remaining (non-MCV) probability mass.
+		 */
+		accum_scalararray_prob(nonmcv_selec, nonmcv_cnt, useOr, isEquality, isInequality,
+								nullfrac, &selec, &s1disjoint);
+
+		/*
+		 * Account for non-Const IN-list elements.
+		 *
+		 * These values cannot be matched against MCVs, so we rely on the
+		 * operator's generic selectivity estimator for each of them.
+		 */
+		accum_scalararray_prob(nonconst_sel, nonconst_cnt, useOr, isEquality, isInequality,
+								nullfrac, &selec, &s1disjoint);
+
+		/*
+		 * For = ANY or <> ALL, if the IN-list elements are assumed distinct,
+		 * the events are disjoint and the total probability is the sum of
+		 * individual probabilities.  Use that estimate if it lies in [0,1].
+		 */
+		if ((useOr ? isEquality : isInequality) &&
+			s1disjoint >= 0.0 && s1disjoint <= 1.0)
+			selec = s1disjoint;
+
+		CLAMP_PROBABILITY(selec);
+
+		MCVHashTable_destroy(hashTable);
+	}
+
+	if (have_mcvs)
+		free_attstatsslot(&sslot);
+
+	return selec;
+}
+
 /*
  * Estimate number of elements in the array yielded by an expression.
  *
@@ -2612,7 +3042,7 @@ eqjoinsel(PG_FUNCTION_ARGS)
 		 * If the MCV lists are long enough to justify hashing, try to look up
 		 * hash functions for the join operator.
 		 */
-		if ((sslot1.nvalues + sslot2.nvalues) >= EQJOINSEL_MCV_HASH_THRESHOLD)
+		if ((sslot1.nvalues + sslot2.nvalues) >= MCV_HASH_THRESHOLD)
 			(void) get_op_hash_functions(operator, &hashLeft, &hashRight);
 	}
 	else
-- 
2.34.1

From 7d668752ceb49b901571a96d156e0219da4e7c1f Mon Sep 17 00:00:00 2001
From: Evdokimov Ilia <[email protected]>
Date: Wed, 25 Feb 2026 23:00:32 +0300
Subject: [PATCH v6 2/3] Use O(1) selectivity formula for eqsel/neqsel IN/ALL

Replace per-element iteration in ScalarArrayOpExpr selectivity
estimation with a closed-form probability formula when all elements
share the same eqsel()/neqsel() semantics.

Preserves existing independence/disjoint models while reducing
planning cost for large IN/ALL lists from O(N) to O(1).

Special handling added for unique columns using 1/reltuples.
---
 src/backend/utils/adt/selfuncs.c | 157 +++++++++++++++++++++++++++++++
 1 file changed, 157 insertions(+)

diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index eef3f0375a5..f6091a576d8 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -184,6 +184,9 @@ get_relation_stats_hook_type get_relation_stats_hook = NULL;
 get_index_stats_hook_type get_index_stats_hook = NULL;
 
 static double eqsel_internal(PG_FUNCTION_ARGS, bool negate);
+static Selectivity calculate_combined_selectivity(Selectivity s2, int num_elems,
+							  bool useOr,
+							  bool isEquality, bool isInequality);
 static double eqjoinsel_inner(FmgrInfo *eqproc, Oid collation,
 							  Oid hashLeft, Oid hashRight,
 							  VariableStatData *vardata1, VariableStatData *vardata2,
@@ -1893,6 +1896,61 @@ strip_array_coercion(Node *node)
 	return node;
 }
 
+/*
+ * calculate_combined_selectivity
+ *
+ * Combine selectivities of N identical ScalarArrayOpExpr elements.
+ *
+ * This function assumes that all elements of the IN/ANY or ALL list
+ * have the same per-element selectivity s2, and computes the overall
+ * selectivity without iterating over the elements.
+ *
+ * For OR semantics (x = ANY (...)):
+ *   main model      : 1 - (1 - s2)^N
+ *   disjoint model  : N * s2
+ *
+ * For AND semantics (x <> ALL (...)):
+ *   main model      : s2^N
+ *   disjoint model  : 1 - N * (1 - s2)
+ *
+ * If the disjoint estimate is within [0,1], it is preferred.
+ * Otherwise, we fall back to the main (independence) model.
+ */
+static Selectivity
+calculate_combined_selectivity(Selectivity s2, int num_elems, bool useOr, bool isEquality, bool isInequality)
+{
+	bool		use_disjoint = false;
+	Selectivity	s1;
+	Selectivity	s1disjoint;
+
+	s1 = s1disjoint = (useOr ? 0.0 : 1.0);
+
+	if (useOr)
+	{
+		if (isEquality)
+		{
+			s1disjoint = s2 * num_elems;
+			if (s1disjoint >= 0.0 && s1disjoint <= 1.0)
+				use_disjoint = true;
+		}
+		s1 = use_disjoint ? s1disjoint : (1.0 - pow(1.0 - s2, num_elems));
+	}
+	else
+	{
+		if (isInequality)
+		{
+			s1disjoint = 1.0 + num_elems * (s2 - 1.0);
+			if (s1disjoint >= 0.0 && s1disjoint <= 1.0)
+				use_disjoint = true;
+		}
+		s1 = use_disjoint ? s1disjoint : pow(s2, num_elems);
+	}
+
+	CLAMP_PROBABILITY(s1);
+
+	return s1;
+}
+
 /*
  *		scalararraysel		- Selectivity of ScalarArrayOpExpr Node.
  */
@@ -2030,6 +2088,72 @@ scalararraysel(PlannerInfo *root,
 						  elmlen, elmbyval, elmalign,
 						  &elem_values, &elem_nulls, &num_elems);
 
+		/*
+		 * Try to avoid O(N^2) selectivity calculation for ScalarArrayOpExpr.
+		 *
+		 * For equality/inequality operators in restriction clauses,
+		 * attempt to derive a single per-element selectivity (s2) and
+		 * combine it in O(1) time using a closed-form formula instead
+		 * of iterating over all elements.
+		 */
+		if ((isEquality || isInequality) && !is_join_clause)
+		{
+			VariableStatData vardata;
+			Selectivity s2 = -1.0;
+			Node       *other_op = NULL;
+			bool        var_on_left;
+
+			/*
+			 * If the clause is of the form "var OP something" or
+			 * "something OP var", extract statistics for the variable.
+			 * Otherwise, fall back to a default per-element estimate.
+			 */
+			if (get_restriction_variable(root, clause->args, varRelid, &vardata, &other_op, &var_on_left))
+			{
+				/*
+				 * Fast path for unique columns.
+				 *
+				 * If the variable is known to be unique and the relation
+				 * has at least one tuple, equality selectivity is exactly
+				 * 1 / reltuples.
+				 */
+				if (vardata.isunique && vardata.rel && vardata.rel->tuples >= 1.0)
+				{
+					s2 = 1.0 / vardata.rel->tuples;
+					if (HeapTupleIsValid(vardata.statsTuple))
+					{
+						Form_pg_statistic stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
+						if (isInequality)
+							s2 = 1.0 - s2 - stats->stanullfrac;
+					}
+				}
+				else if (isInequality)
+				{
+					Oid negator = get_negator(operator);
+					if (!OidIsValid(negator))
+						s2 = 1.0 - DEFAULT_EQ_SEL;
+				}
+
+				ReleaseVariableStats(vardata);
+
+				if (s2 >= 0.0)
+				{
+					CLAMP_PROBABILITY(s2);
+
+					s1 = calculate_combined_selectivity(s2, num_elems, useOr, isEquality, isInequality);
+
+					return s1;
+				}
+			}
+			else
+			{
+				s2 = (isInequality) ? (1.0 - DEFAULT_EQ_SEL) : DEFAULT_EQ_SEL;
+				s1 = calculate_combined_selectivity(s2, num_elems, useOr, isEquality, isInequality);
+
+				return s1;
+			}
+		}
+
 		/*
 		 * For generic operators, we assume the probability of success is
 		 * independent for each array element.  But for "= ANY" or "<> ALL",
@@ -2105,6 +2229,39 @@ scalararraysel(PlannerInfo *root,
 		get_typlenbyval(arrayexpr->element_typeid,
 						&elmlen, &elmbyval);
 
+		/*
+		 * Try to avoid O(N^2) selectivity calculation for ScalarArrayOpExpr.
+		 *
+		 * For equality/inequality operators in restriction clauses,
+		 * attempt to derive a single per-element selectivity (s2) and
+		 * combine it in O(1) time using a closed-form formula instead
+		 * of iterating over all elements.
+		 */
+		if ((isEquality || isInequality) && !is_join_clause)
+		{
+			VariableStatData vardata;
+			Selectivity s2 = -1.0;
+			Node	*other_op = NULL;
+			bool	var_on_left;
+			int	num_elems = list_length(arrayexpr->elements);
+
+			/*
+			 * If expression is not variable = something or something =
+			 * variable, then fall back to default code path to compute
+			 * default selectivity.
+			 */
+			if (!get_restriction_variable(root, clause->args, varRelid,
+										 &vardata, &other_op, &var_on_left))
+			{
+				s2 = (isInequality) ? (1.0 - DEFAULT_EQ_SEL) : DEFAULT_EQ_SEL;
+				s1 = calculate_combined_selectivity(s2, num_elems, useOr, isEquality, isInequality);
+
+				return s1;
+			}
+			else
+				ReleaseVariableStats(vardata);
+		}
+
 		/*
 		 * We use the assumption of disjoint probabilities here too, although
 		 * the odds of equal array elements are rather higher if the elements
-- 
2.34.1

From c6cc307e6d7a131ba5fc3c59fc86ba0df6768a43 Mon Sep 17 00:00:00 2001
From: Evdokimov Ilia <[email protected]>
Date: Wed, 25 Feb 2026 22:58:32 +0300
Subject: [PATCH v6 1/3] Reduce planning time for large NOT IN lists containing
   NULL

For x <> ALL (...) / x NOT IN (...), the presence of a NULL element
makes the selectivity 0.0.

The planner currently still iterates over all elements and computes
per-element selectivity, even though the final result is known.

Add an early NULL check for constant arrays and immediately return
0.0 under ALL semantics.

This reduces planning time for large NOT IN / <> ALL lists without
changing semantics.
---
 src/backend/utils/adt/selfuncs.c          |  9 +++++
 src/test/regress/expected/expressions.out | 44 +++++++++++++++++++++++
 src/test/regress/sql/expressions.sql      | 41 +++++++++++++++++++++
 3 files changed, 94 insertions(+)

diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 29fec655593..eef3f0375a5 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -2018,6 +2018,11 @@ scalararraysel(PlannerInfo *root,
 		if (arrayisnull)		/* qual can't succeed if null array */
 			return (Selectivity) 0.0;
 		arrayval = DatumGetArrayTypeP(arraydatum);
+
+		/* Selectivity of "WHERE x NOT IN (NULL, ... )" is always 0 */
+		if (!useOr && array_contains_nulls(arrayval))
+			return (Selectivity) 0.0;
+
 		get_typlenbyvalalign(ARR_ELEMTYPE(arrayval),
 							 &elmlen, &elmbyval, &elmalign);
 		deconstruct_array(arrayval,
@@ -2115,6 +2120,10 @@ scalararraysel(PlannerInfo *root,
 			List	   *args;
 			Selectivity s2;
 
+			/* Selectivity of "WHERE x NOT IN (NULL, ... )" is always 0 */
+			if (!useOr && IsA(elem, Const) && ((Const *) elem)->constisnull)
+				return (Selectivity) 0.0;
+
 			/*
 			 * Theoretically, if elem isn't of nominal_element_type we should
 			 * insert a RelabelType, but it seems unlikely that any operator
diff --git a/src/test/regress/expected/expressions.out b/src/test/regress/expected/expressions.out
index 9a3c97b15a3..34f14a5775a 100644
--- a/src/test/regress/expected/expressions.out
+++ b/src/test/regress/expected/expressions.out
@@ -426,3 +426,47 @@ select * from inttest where a not in (0::myint,2::myint,3::myint,4::myint,5::myi
 (0 rows)
 
 rollback;
+-- Test <> ALL when array initially contained NULL but no longer does
+begin;
+create function check_estimated_rows(text) returns table (estimated int)
+language plpgsql as
+$$
+declare
+    ln text;
+    tmp text[];
+    first_row bool := true;
+begin
+    for ln in
+        execute format('explain %s', $1)
+    loop
+        if first_row then
+            first_row := false;
+            tmp := regexp_match(ln, 'rows=(\d*)');
+            return query select tmp[1]::int;
+        end if;
+    end loop;
+end;
+$$;
+create function replace_elem(arr int[], idx int, val int)
+returns int[] AS $$
+begin
+      arr[idx] := val;
+      return arr;
+end;
+$$ language plpgsql immutable;
+create table notin_test as select generate_series(1, 1000) as x;
+analyze notin_test;
+select * from check_estimated_rows('select * from notin_test where x <> all(array[1,99,3])');
+ estimated 
+-----------
+       997
+(1 row)
+
+-- same array, constructed from an array with a NULL
+select * from check_estimated_rows('select * from notin_test where x <> all(replace_elem(array[1,null,3], 2, 99))');
+ estimated 
+-----------
+       997
+(1 row)
+
+rollback;
diff --git a/src/test/regress/sql/expressions.sql b/src/test/regress/sql/expressions.sql
index e02c21f3368..ca94859bbf8 100644
--- a/src/test/regress/sql/expressions.sql
+++ b/src/test/regress/sql/expressions.sql
@@ -209,3 +209,44 @@ select * from inttest where a not in (1::myint,2::myint,3::myint,4::myint,5::myi
 select * from inttest where a not in (0::myint,2::myint,3::myint,4::myint,5::myint, null);
 
 rollback;
+
+-- Test <> ALL when array initially contained NULL but no longer does
+
+begin;
+
+create function check_estimated_rows(text) returns table (estimated int)
+language plpgsql as
+$$
+declare
+    ln text;
+    tmp text[];
+    first_row bool := true;
+begin
+    for ln in
+        execute format('explain %s', $1)
+    loop
+        if first_row then
+            first_row := false;
+            tmp := regexp_match(ln, 'rows=(\d*)');
+            return query select tmp[1]::int;
+        end if;
+    end loop;
+end;
+$$;
+
+create function replace_elem(arr int[], idx int, val int)
+returns int[] AS $$
+begin
+      arr[idx] := val;
+      return arr;
+end;
+$$ language plpgsql immutable;
+
+create table notin_test as select generate_series(1, 1000) as x;
+analyze notin_test;
+
+select * from check_estimated_rows('select * from notin_test where x <> all(array[1,99,3])');
+-- same array, constructed from an array with a NULL
+select * from check_estimated_rows('select * from notin_test where x <> all(replace_elem(array[1,null,3], 2, 99))');
+
+rollback;
\ No newline at end of file
-- 
2.34.1

Re: Hash-based MCV matching for large IN-lists

Reply via email to