Hi Ilia!

On 05.09.2025 16:03, David Geier wrote:
>>> I propose an optimization: when the column datatype supports
>>> ordering(i.e., has < and >), we can sort both MCV lists and apply
>>> mege-style algorithm to detect matches. This reduces runtime from
>>> O(N^2) to O(NlogN), where N is the number of MCV entries. The patch
>>> also applies the same optimization to semi-join clauses, which show
>>> similar performance behavior.
> 
> Why do you sort both lists and then merge instead of putting the smaller
> list into a hash map and then doing hash lookups (if the type is hashable)?

I've tested your and my code with the following script:

CREATE TABLE foo(col TEXT);
CREATE TABLE bar(col TEXT);
SET default_statistics_target = 10000;

-- Generate MCV values. PostgreSQL doesn't store MCVs if the table has
-- only a single value or every value has exactly the same cardinality.
DO $$
BEGIN
  FOR i IN 1..10000 LOOP
    FOR j IN 1..least(i, 50) LOOP
      INSERT INTO foo VALUES ('aaaaaaaaaaaaaaaaaaaa' || i::TEXT);
      INSERT INTO bar VALUES ('aaaaaaaaaaaaaaaaaaaa' || (i + 100000)::TEXT);
    END LOOP;
  END LOOP;
END;
$$;

ANALYZE foo, bar;
\timing on
EXPLAIN SELECT * FROM foo f, bar b WHERE f.col = b.col;

Results are:

- master: 433 ms
- Order+Merge: 11 ms
- Hash map: 4 ms

I've attached my draft patch.

--
David Geier
From 7e2cf9602511ada0284503eb929e202ce0d7d354 Mon Sep 17 00:00:00 2001
From: David Geier <[email protected]>
Date: Mon, 8 Sep 2025 12:06:44 +0200
Subject: [PATCH] Optimize eqoinsel_inner with hash table

---
 src/backend/utils/adt/selfuncs.c | 180 +++++++++++++++++++++++++++----
 1 file changed, 159 insertions(+), 21 deletions(-)

diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 1c480cfaaf7..c171d70eb47 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -143,6 +143,8 @@
 
 #define DEFAULT_PAGE_CPU_MULTIPLIER 50.0
 
+struct McvHashTable_hash;
+
 /* Hooks for plugins to get control when we ask for stats */
 get_relation_stats_hook_type get_relation_stats_hook = NULL;
 get_index_stats_hook_type get_index_stats_hook = NULL;
@@ -217,7 +219,9 @@ static bool get_actual_variable_endpoint(Relation heapRel,
 static RelOptInfo *find_join_input_rel(PlannerInfo *root, Relids relids);
 static double btcost_correlation(IndexOptInfo *index,
 								 VariableStatData *vardata);
-
+static uint32 hash_msv(struct McvHashTable_hash *hashTable, Datum key);
+static bool are_mcvs_equal(struct McvHashTable_hash *hashTable, Datum value1, Datum value2);
+static Oid get_hash_func_oid(Oid operatorOid);
 
 /*
  *		eqsel			- Selectivity of "=" for any data types.
@@ -2269,6 +2273,132 @@ rowcomparesel(PlannerInfo *root,
 	return s1;
 }
 
+typedef struct McvHashEntry
+{
+	Datum  value;
+	uint32 index;
+	uint32 hash;
+	char   status; // NOLINT
+} McvHashEntry;
+
+typedef struct McvHashContext
+{
+	FmgrInfo equal_proc;
+	FmgrInfo hash_proc;
+	Oid      collation;
+} McvHashContext;
+
+#define SH_PREFIX                  McvHashTable
+#define SH_ELEMENT_TYPE            McvHashEntry
+#define SH_KEY_TYPE                Datum
+#define SH_KEY                     value
+#define SH_HASH_KEY(mcvs, key)     hash_msv(mcvs, key)
+#define SH_EQUAL(mcvs, key0, key1) are_mcvs_equal(mcvs, key0, key1)
+#define SH_SCOPE                   static inline
+#define SH_STORE_HASH
+#define SH_GET_HASH(mcvs, key)     key->hash
+#define SH_DEFINE
+#define SH_DECLARE
+#include "lib/simplehash.h"
+  
+static uint32
+hash_msv(struct McvHashTable_hash *hashTable, Datum key)
+{	
+	McvHashContext *context = (McvHashContext *)hashTable->private_data;
+	return DatumGetUInt32(FunctionCall1Coll(&context->hash_proc, context->collation, key));
+}
+
+static bool
+are_mcvs_equal(struct McvHashTable_hash *hashTable, Datum value1, Datum value2)
+{
+	
+	/*
+	 * We can safely use FunctionCall2Coll() which requires the result to never be NULL,
+     * because MCV arrays from 'pg_statistic' don't contain 'NULL' values
+	 */
+	McvHashContext *context = (McvHashContext *)hashTable->private_data;
+	return DatumGetBool(FunctionCall2Coll(&context->equal_proc, context->collation, value1, value2));
+}
+
+static Oid
+get_hash_func_oid(Oid operatorOid)
+{
+	Oid hashLeft  = InvalidOid;
+	Oid hashRight = InvalidOid;
+	get_op_hash_functions(operatorOid, &hashLeft, &hashRight);
+	return OidIsValid(hashLeft) && hashLeft == hashRight ? hashLeft : InvalidOid;
+}
+  
+/*
+ * eqjoinsel_inner_with_hashtable
+ *
+ * Optimizes inner equality join selectivity estimation by using an O(n) algorithm based on hashing.
+ * Returns whether or not all prerequisites are met and the operation was successful.
+ * The result is used to know if to fallback to the default implementation.
+ */
+static bool
+eqjoinsel_inner_with_hashtable(Oid operatorOid, Oid collation,
+								AttStatsSlot *statsSlot1, AttStatsSlot *statsSlot2,
+								FunctionCallInfo equalFunctionCallInfo,
+								/* Output parameters: */
+								double *matchProductFrequencies, int *nMatches, bool *hasMatch1, bool *hasMatch2)
+{
+	Oid hashFuncOid;
+	AttStatsSlot *statsInner = statsSlot2;
+	AttStatsSlot *statsOuter = statsSlot1;
+	bool *hasMatchInner = hasMatch2;
+	bool *hasMatchOuter = hasMatch1;
+	struct McvHashContext hashContext;
+	McvHashTable_hash *hashTable;
+
+	if (Min(statsSlot1->nvalues, statsSlot2->nvalues) == 1)
+		return false;
+
+	hashFuncOid = get_hash_func_oid(operatorOid);
+	if (!OidIsValid(hashFuncOid))
+		return false;
+
+	/* Make sure we build the hash table on the smaller array. */
+	if (statsSlot1->nvalues < statsSlot2->nvalues)
+	{
+		statsInner    = statsSlot1;
+		statsOuter    = statsSlot2;
+		hasMatchInner = hasMatch1;
+		hasMatchOuter = hasMatch2;
+	}
+
+	/* 1. Create hash table of smaller 'pg_statistic' array. That's O(n). */
+	fmgr_info(get_opcode(operatorOid), &hashContext.equal_proc);
+	fmgr_info(hashFuncOid, &hashContext.hash_proc);
+	hashContext.collation = collation;
+
+	hashTable = McvHashTable_create(CurrentMemoryContext, statsInner->nvalues, &hashContext);
+
+	for (int i = 0; i < statsInner->nvalues; i++)
+	{
+		bool found = false;
+		McvHashEntry *entry = McvHashTable_insert(hashTable, statsInner->values[i], &found);
+		Assert(!found);
+		entry->index = i;
+	}
+
+	/* 2. Look-up values from other 'pg_statistic' array against hash map to find matches. */
+	for (int i = 0; i < statsOuter->nvalues; i++)
+	{
+		McvHashEntry *entry = McvHashTable_lookup(hashTable, statsOuter->values[i]);
+		if (entry != NULL)
+		{
+			hasMatchInner[entry->index] = true;
+			hasMatchOuter[i]            = true;
+			*matchProductFrequencies += statsInner->numbers[entry->index] * statsOuter->numbers[i];
+			(*nMatches)++;
+		}
+	}
+
+	McvHashTable_destroy(hashTable);
+	return true;
+}
+
 /*
  *		eqjoinsel		- Join selectivity of "="
  */
@@ -2350,7 +2480,7 @@ eqjoinsel(PG_FUNCTION_ARGS)
 	}
 
 	/* We need to compute the inner-join selectivity in all cases */
-	selec_inner = eqjoinsel_inner(opfuncoid, collation,
+	selec_inner = eqjoinsel_inner(operator, collation,
 								  &vardata1, &vardata2,
 								  nd1, nd2,
 								  isdefault1, isdefault2,
@@ -2438,7 +2568,7 @@ eqjoinsel(PG_FUNCTION_ARGS)
  * that it's worth trying to distinguish them here.
  */
 static double
-eqjoinsel_inner(Oid opfuncoid, Oid collation,
+eqjoinsel_inner(Oid operator, Oid collation,
 				VariableStatData *vardata1, VariableStatData *vardata2,
 				double nd1, double nd2,
 				bool isdefault1, bool isdefault2,
@@ -2480,7 +2610,7 @@ eqjoinsel_inner(Oid opfuncoid, Oid collation,
 		int			i,
 					nmatches;
 
-		fmgr_info(opfuncoid, &eqproc);
+		fmgr_info(get_opcode(operator), &eqproc);
 
 		/*
 		 * Save a few cycles by setting up the fcinfo struct just once. Using
@@ -2504,30 +2634,38 @@ eqjoinsel_inner(Oid opfuncoid, Oid collation,
 		 */
 		matchprodfreq = 0.0;
 		nmatches = 0;
-		for (i = 0; i < sslot1->nvalues; i++)
-		{
-			int			j;
 
-			fcinfo->args[0].value = sslot1->values[i];
-
-			for (j = 0; j < sslot2->nvalues; j++)
+		if (!eqjoinsel_inner_with_hashtable(operator, collation, sslot1, sslot2,
+												 fcinfo, &matchprodfreq, &nmatches,
+												 hasmatch1, hasmatch2))
+		{
+			/* Fallback to O(N^2) algorithm if hash based variant didn't succeed. */
+			for (i = 0; i < sslot1->nvalues; i++)
 			{
-				Datum		fresult;
+				int			j;
 
-				if (hasmatch2[j])
-					continue;
-				fcinfo->args[1].value = sslot2->values[j];
-				fcinfo->isnull = false;
-				fresult = FunctionCallInvoke(fcinfo);
-				if (!fcinfo->isnull && DatumGetBool(fresult))
+				fcinfo->args[0].value = sslot1->values[i];
+
+				for (j = 0; j < sslot2->nvalues; j++)
 				{
-					hasmatch1[i] = hasmatch2[j] = true;
-					matchprodfreq += sslot1->numbers[i] * sslot2->numbers[j];
-					nmatches++;
-					break;
+					Datum		fresult;
+
+					if (hasmatch2[j])
+						continue;
+					fcinfo->args[1].value = sslot2->values[j];
+					fcinfo->isnull = false;
+					fresult = FunctionCallInvoke(fcinfo);
+					if (!fcinfo->isnull && DatumGetBool(fresult))
+					{
+						hasmatch1[i] = hasmatch2[j] = true;
+						matchprodfreq += sslot1->numbers[i] * sslot2->numbers[j];
+						nmatches++;
+						break;
+					}
 				}
 			}
 		}
+
 		CLAMP_PROBABILITY(matchprodfreq);
 		/* Sum up frequencies of matched and unmatched MCVs */
 		matchfreq1 = unmatchfreq1 = 0.0;
-- 
2.43.0

Reply via email to