From 47593262e8f6dd37aa29ab6dfda82ec00c57fa09 Mon Sep 17 00:00:00 2001
From: Amit Langote <amitlan@postgresql.org>
Date: Tue, 1 Apr 2025 16:06:48 +0900
Subject: [PATCH v6 3/3] Canonicalize keys for derived clause hash table

Derived clauses are now stored in the ec_derives_hash using canonicalized
keys: the lower-addressed EquivalenceMember is always placed in em1, and
the higher-addressed one in em2. This avoids storing or searching for
both permutations of the same clause and eliminates the need for
redundant lookups.

For clauses involving a constant EM, only the non-constant EM is included
in the key; em1 is set to NULL and em2 to the non-constant EM.

The initial hash table size is set to twice the length of ec_derives_list
to avoid immediate resizing.

This design follows suggestions by David Rowley, who proposed both the
key canonicalization to reduce entry count and guidance on sizing the
hash table to balance memory usage and CPU efficiency.
---
 src/backend/optimizer/path/equivclass.c | 146 ++++++++++++++----------
 1 file changed, 84 insertions(+), 62 deletions(-)

diff --git a/src/backend/optimizer/path/equivclass.c b/src/backend/optimizer/path/equivclass.c
index b988db4d7b3..a1bccbdd38f 100644
--- a/src/backend/optimizer/path/equivclass.c
+++ b/src/backend/optimizer/path/equivclass.c
@@ -88,7 +88,9 @@ static RestrictInfo *ec_search_derived_clause_for_ems(PlannerInfo *root,
 													  EquivalenceClass *parent_ec);
 
 /*
- * Hash key identifying a derived clause by EquivalenceMembers and parent EC.
+ * Hash key identifying a derived clause.
+ *
+ * See fill_ec_derives_key() for details on canonical key construction.
  */
 typedef struct
 {
@@ -3450,15 +3452,22 @@ get_common_eclass_indexes(PlannerInfo *root, Relids relids1, Relids relids2)
 
 /*
  * ec_build_derives_hash
- *	  Construct the auxiliary hash table for derived clauses.
+ *	  Construct the auxiliary hash table for derived clause lookups.
  */
 static void
 ec_build_derives_hash(PlannerInfo *root, EquivalenceClass *ec)
 {
 	Assert(!ec->ec_derives_hash);
 
-	/* Create the hash table */
-	ec->ec_derives_hash = derives_create(root->planner_cxt, 256, NULL);
+	/*
+	 * Create the hash table.
+	 *
+	 * We set the initial size to twice the number of clauses in
+	 * ec_derives_list to avoid immediate resizing during insertion.
+	 */
+	ec->ec_derives_hash = derives_create(root->planner_cxt,
+										 list_length(ec->ec_derives_list) * 2,
+										 NULL);
 
 	foreach_node(RestrictInfo, rinfo, ec->ec_derives_list)
 		ec_add_clause_to_derives_hash(ec, rinfo);
@@ -3495,10 +3504,56 @@ ec_add_derived_clauses(EquivalenceClass *ec, List *clauses)
 			ec_add_clause_to_derives_hash(ec, rinfo);
 }
 
+/*
+ * fill_ec_derives_key
+ *		Compute a canonical key for ec_derives_hash lookup or insertion.
+ *
+ * Derived clauses are looked up using a pair of EquivalenceMembers and a
+ * parent EquivalenceClass. To avoid storing or searching for both EM orderings,
+ * we canonicalize the key:
+ *
+ * - For clauses involving two non-constant EMs, we order the EMs by address
+ *   and place the lower one first.
+ * - For clauses involving a constant EM, the caller must pass the non-constant
+ *   EM as leftem; we then set em1 = NULL and em2 = leftem.
+ */
+static inline void
+fill_ec_derives_key(ECDerivesKey *key,
+					EquivalenceMember *leftem,
+					EquivalenceMember *rightem,
+					EquivalenceClass *parent_ec)
+{
+	Assert(leftem);				/* Always required for lookup or insertion */
+
+	/*
+	 * Clauses with a constant EM are always stored and looked up using only
+	 * the non-constant EM, with the other key slot set to NULL.
+	 */
+	if (rightem == NULL)
+	{
+		key->em1 = NULL;
+		key->em2 = leftem;
+	}
+	else if (leftem < rightem)
+	{
+		key->em1 = leftem;
+		key->em2 = rightem;
+	}
+	else
+	{
+		key->em1 = rightem;
+		key->em2 = leftem;
+	}
+	key->parent_ec = parent_ec;
+}
+
 /*
  * ec_add_clause_to_derives_hash
- *      Add a derived clause to the ec_derives_hash of the given
- *      EquivalenceClass.
+ *		Add a derived clause to ec_derives_hash for the given EquivalenceClass.
+ *
+ * Each clause is inserted under a canonicalized key. For constant-containing
+ * clauses, only the non-constant EM is used for lookup; see comments in
+ * fill_ec_derives_key().
  */
 static void
 ec_add_clause_to_derives_hash(EquivalenceClass *ec, RestrictInfo *rinfo)
@@ -3513,49 +3568,24 @@ ec_add_clause_to_derives_hash(EquivalenceClass *ec, RestrictInfo *rinfo)
 	 */
 	Assert(!rinfo->left_em->em_is_const);
 
-	if (rinfo->right_em->em_is_const)
-	{
-		/*
-		 * Clauses containing a constant are never considered redundant, so
-		 * parent_ec is not set.
-		 */
-		Assert(!rinfo->parent_ec);
-
-		/*
-		 * find_derived_clause_for_ec_member() performs lookup of a clause
-		 * involving a constant using only the non-constant EM and NULL for
-		 * the RHS. Since there's only one constant EM per EC, we don't need
-		 * to store or match it during lookup.  We set key.em2 = NULL to
-		 * reflect this.
-		 */
-		key.em1 = rinfo->left_em;
-		key.em2 = NULL;
-		key.parent_ec = rinfo->parent_ec;
-		entry = derives_insert(ec->ec_derives_hash, key, &found);
-		Assert(!found);
-		entry->rinfo = rinfo;
-	}
-	else
-	{
-		key.em1 = rinfo->left_em;
-		key.em2 = rinfo->right_em;
-		key.parent_ec = rinfo->parent_ec;
-		entry = derives_insert(ec->ec_derives_hash, key, &found);
-		Assert(!found);
-		entry->rinfo = rinfo;
+	/*
+	 * Clauses containing a constant are never considered redundant, so
+	 * parent_ec is not set.
+	 */
+	Assert(!rinfo->parent_ec || !rinfo->right_em->em_is_const);
 
-		/*
-		 * Insert the clause under the given EM pair key, and also under the
-		 * reverse order. This ensures we can find the clause regardless of
-		 * the order in which EMs are passed to the lookup function.
-		 */
-		key.em1 = rinfo->right_em;
-		key.em2 = rinfo->left_em;
-		key.parent_ec = rinfo->parent_ec;
-		entry = derives_insert(ec->ec_derives_hash, key, &found);
-		Assert(!found);
-		entry->rinfo = rinfo;
-	}
+	/*
+	 * See fill_ec_derives_key() for details: we use a canonicalized key
+	 * to avoid storing both EM orderings. For constant EMs, only the
+	 * non-constant EM is included in the key.
+	 */
+	fill_ec_derives_key(&key,
+						rinfo->left_em,
+						rinfo->right_em->em_is_const ? NULL : rinfo->right_em,
+						rinfo->parent_ec);
+	entry = derives_insert(ec->ec_derives_hash, key, &found);
+	Assert(!found);
+	entry->rinfo = rinfo;
 }
 
 /*
@@ -3623,13 +3653,12 @@ ec_search_clause_for_ems(PlannerInfo *root, EquivalenceClass *ec,
  * If the number of derived clauses exceeds a threshold, switch to hash table
  * lookup; otherwise, scan ec_derives_list linearly.
  *
- * Clauses involving constants are looked up using only the non-const EM
- * (leftem) and a NULL rightem. In that case, we expect to find a clause with
- * a constant on the RHS.
+ * Clauses involving constants are looked up using only the non-constant EM
+ * (leftem), with rightem set to NULL. In that case, we expect to find a
+ * clause with a constant on the RHS.
  *
- * We do not attempt a second lookup with EMs swapped when using the hash
- * table; such clauses are inserted under both orderings at the time of
- * insertion.
+ * We do not attempt a second lookup with EMs swapped, because the key is
+ * canonicalized during both insertion and lookup.
  */
 static RestrictInfo *
 ec_search_derived_clause_for_ems(PlannerInfo *root, EquivalenceClass *ec,
@@ -3651,14 +3680,7 @@ ec_search_derived_clause_for_ems(PlannerInfo *root, EquivalenceClass *ec,
 		RestrictInfo *rinfo;
 		ECDerivesEntry *entry;
 
-		/*
-		 * See ec_add_clause_to_derives_hash() for rationale: derived clauses
-		 * are inserted into the hash table under both (em1, em2) and (em2,
-		 * em1), so a single lookup with the original order is sufficient.
-		 */
-		key.em1 = leftem;
-		key.em2 = rightem;
-		key.parent_ec = parent_ec;
+		fill_ec_derives_key(&key, leftem, rightem, parent_ec);
 		entry = derives_lookup(ec->ec_derives_hash, key);
 		if (entry)
 		{
-- 
2.43.0

