From 4423d726583a9adcfacce477cb98265f900e72fd Mon Sep 17 00:00:00 2001
From: David Rowley <dgrowley@gmail.com>
Date: Tue, 25 Mar 2025 18:49:15 +1300
Subject: [PATCH v7] Fix query jumbling to account for NULL nodes

Previously NULL nodes were ignored.  This could cause issues where the
computed query ID could match for queries where fields that are next to
each other in their Node struct where one field was NULL and the other
non-NULL.  For example, the Query struct has distinctClause and sortClause
next to each other.  If someone wrote;

SELECT DISTINCT c1 FROM t;

and then;

SELECT c1 FROM t ORDER BY c1;

these would produce the same query ID since in the first query we ignore
the NULL sortClause and append the jumble bytes for the distictClause.
In the latter query, since we do nothing for the NULL distinctClause
then jumble the non-NULL sortClause, and since the node representation
stored is the same in both cases, the query IDs are identical.

Here we fix this by always jumbling NULL nodes.  This produces different
jumble bytes in the above queries since the order that we jumble the
NULL changes between each of the above two queries.

Fixing this without regressing the performance of jumbling was quite
tricky, so this commit also contains a few optimizations to improve the
jumble performance.

Discussion: https://postgr.es/m/aafce7966e234372b2ba876c0193f1e9%40localhost.localdomain
---
 src/backend/nodes/queryjumblefuncs.c | 159 +++++++++++++++++++++++++--
 src/include/nodes/queryjumble.h      |   6 +
 2 files changed, 153 insertions(+), 12 deletions(-)

diff --git a/src/backend/nodes/queryjumblefuncs.c b/src/backend/nodes/queryjumblefuncs.c
index f8b0f91704b..8d3b1348634 100644
--- a/src/backend/nodes/queryjumblefuncs.c
+++ b/src/backend/nodes/queryjumblefuncs.c
@@ -58,6 +58,8 @@ bool		query_id_squash_values = false;
  */
 bool		query_id_enabled = false;
 
+static void FlushPendingNulls(JumbleState *jstate);
+
 static void AppendJumble(JumbleState *jstate,
 						 const unsigned char *item, Size size);
 static void RecordConstLocation(JumbleState *jstate,
@@ -134,9 +136,13 @@ JumbleQuery(Query *query)
 		palloc(jstate->clocations_buf_size * sizeof(LocationLen));
 	jstate->clocations_count = 0;
 	jstate->highest_extern_param_id = 0;
+	jstate->pending_nulls = 0;
 
 	/* Compute query ID and mark the Query node with it */
 	_jumbleNode(jstate, (Node *) query);
+	if (jstate->pending_nulls > 0)
+		FlushPendingNulls(jstate);
+
 	query->queryId = DatumGetUInt64(hash_any_extended(jstate->jumble,
 													  jstate->jumble_len,
 													  0));
@@ -170,25 +176,40 @@ EnableQueryId(void)
 }
 
 /*
- * AppendJumble: Append a value that is substantive in a given query to
- * the current jumble.
+ * AppendJumbleInternal: Append a value that is substantive in a given query
+ * to the current jumble.
+ *
+ * Note: Callers must ensure that jstate->pending_nulls is zero first by
+ * calling FlushPendingNulls() when required.  Callers must also ensure that
+ * size > 0.
  */
-static void
-AppendJumble(JumbleState *jstate, const unsigned char *item, Size size)
+static pg_attribute_always_inline void
+AppendJumbleInternal(JumbleState *jstate, const unsigned char *item,
+					 Size size)
 {
 	unsigned char *jumble = jstate->jumble;
 	Size		jumble_len = jstate->jumble_len;
 
+	/* Ensure the caller didn't mess up */
+	Assert(size > 0);
+
+	if (likely(size <= JUMBLE_SIZE - jumble_len))
+	{
+		memcpy(jumble + jumble_len, item, size);
+		jstate->jumble_len += size;
+		return;
+	}
+
 	/*
 	 * Whenever the jumble buffer is full, we hash the current contents and
 	 * reset the buffer to contain just that hash value, thus relying on the
 	 * hash to summarize everything so far.
 	 */
-	while (size > 0)
+	do
 	{
 		Size		part_size;
 
-		if (jumble_len >= JUMBLE_SIZE)
+		if (unlikely(jumble_len >= JUMBLE_SIZE))
 		{
 			uint64		start_hash;
 
@@ -202,10 +223,106 @@ AppendJumble(JumbleState *jstate, const unsigned char *item, Size size)
 		jumble_len += part_size;
 		item += part_size;
 		size -= part_size;
-	}
+	} while (size > 0);
+
 	jstate->jumble_len = jumble_len;
 }
 
+/*
+ * AppendJumbleNull
+ *		For jumbling NULL pointers
+ */
+static pg_attribute_always_inline void
+AppendJumbleNull(JumbleState *jstate)
+{
+	jstate->pending_nulls++;
+}
+
+/*
+ * AppendJumble
+ *		Add 'size' bytes of the given jumble 'value' to the jumble state
+ */
+static pg_noinline void
+AppendJumble(JumbleState *jstate, const unsigned char *value, Size size)
+{
+	if (jstate->pending_nulls > 0)
+		FlushPendingNulls(jstate);
+
+	AppendJumbleInternal(jstate, value, size);
+}
+
+/*
+ * AppendJumble8
+ *		Add the first byte from the given 'value' pointer to the jumble state
+ */
+static pg_noinline void
+AppendJumble8(JumbleState *jstate, const unsigned char *value)
+{
+	if (jstate->pending_nulls > 0)
+		FlushPendingNulls(jstate);
+
+	AppendJumbleInternal(jstate, value, 1);
+}
+
+/*
+ * AppendJumble16
+ *		Add the first 2 bytes from the given 'value' pointer to the jumble
+ *		state.
+ */
+static pg_noinline void
+AppendJumble16(JumbleState *jstate, const unsigned char *value)
+{
+	if (jstate->pending_nulls > 0)
+		FlushPendingNulls(jstate);
+
+	AppendJumbleInternal(jstate, value, 2);
+}
+
+/*
+ * AppendJumble32
+ *		Add the first 4 bytes from the given 'value' pointer to the jumble
+ *		state.
+ */
+static pg_noinline void
+AppendJumble32(JumbleState *jstate, const unsigned char *value)
+{
+	if (jstate->pending_nulls > 0)
+		FlushPendingNulls(jstate);
+
+	AppendJumbleInternal(jstate, value, 4);
+}
+
+/*
+ * AppendJumble64
+ *		Add the first 8 bytes from the given 'value' pointer to the jumble
+ *		state.
+ */
+static pg_noinline void
+AppendJumble64(JumbleState *jstate, const unsigned char *value)
+{
+	if (jstate->pending_nulls > 0)
+		FlushPendingNulls(jstate);
+
+	AppendJumbleInternal(jstate, value, 8);
+}
+
+/*
+ * FlushPendingNulls
+ *		Incorporate the pending_null values into the jumble buffer.
+ *
+ * Note: Callers must ensure that there's at least 1 pending NULL.
+ */
+static pg_attribute_always_inline void
+FlushPendingNulls(JumbleState *jstate)
+{
+	Assert(jstate->pending_nulls > 0);
+
+	AppendJumbleInternal(jstate,
+						 (const unsigned char *) &jstate->pending_nulls, 4);
+	jstate->pending_nulls = 0;
+}
+
+
 /*
  * Record location of constant within query string of query tree that is
  * currently being walked.
@@ -319,19 +436,34 @@ IsSquashableConstList(List *elements, Node **firstExpr, Node **lastExpr)
 }
 
 #define JUMBLE_NODE(item) \
-	_jumbleNode(jstate, (Node *) expr->item)
+	_jumbleNode(jstate, (Node *) expr->item);
 #define JUMBLE_ELEMENTS(list) \
 	_jumbleElements(jstate, (List *) expr->list)
 #define JUMBLE_LOCATION(location) \
 	RecordConstLocation(jstate, expr->location, false)
 #define JUMBLE_FIELD(item) \
-	AppendJumble(jstate, (const unsigned char *) &(expr->item), sizeof(expr->item))
+do { \
+	if (sizeof(expr->item) == 8) \
+		AppendJumble64(jstate, (const unsigned char *) &(expr->item)); \
+	else if (sizeof(expr->item) == 4) \
+		AppendJumble32(jstate, (const unsigned char *) &(expr->item)); \
+	else if (sizeof(expr->item) == 2) \
+		AppendJumble16(jstate, (const unsigned char *) &(expr->item)); \
+	else if (sizeof(expr->item) == 1) \
+		AppendJumble8(jstate, (const unsigned char *) &(expr->item)); \
+	else \
+		AppendJumble(jstate, (const unsigned char *) &(expr->item), sizeof(expr->item)); \
+} while (0)
+
+/* XXX what's this used for? */
 #define JUMBLE_FIELD_SINGLE(item) \
 	AppendJumble(jstate, (const unsigned char *) &(item), sizeof(item))
 #define JUMBLE_STRING(str) \
 do { \
 	if (expr->str) \
 		AppendJumble(jstate, (const unsigned char *) (expr->str), strlen(expr->str) + 1); \
+	else \
+		AppendJumbleNull(jstate); \
 } while(0)
 /* Function name used for the node field attribute custom_query_jumble. */
 #define JUMBLE_CUSTOM(nodetype, item) \
@@ -384,7 +516,10 @@ _jumbleNode(JumbleState *jstate, Node *node)
 	Node	   *expr = node;
 
 	if (expr == NULL)
+	{
+		AppendJumbleNull(jstate);
 		return;
+	}
 
 	/* Guard against stack overflow due to overly complex expressions */
 	check_stack_depth();
@@ -448,15 +583,15 @@ _jumbleList(JumbleState *jstate, Node *node)
 			break;
 		case T_IntList:
 			foreach(l, expr)
-				JUMBLE_FIELD_SINGLE(lfirst_int(l));
+				AppendJumble32(jstate, (const unsigned char *) &lfirst_int(l));
 			break;
 		case T_OidList:
 			foreach(l, expr)
-				JUMBLE_FIELD_SINGLE(lfirst_oid(l));
+				AppendJumble32(jstate, (const unsigned char *) &lfirst_oid(l));
 			break;
 		case T_XidList:
 			foreach(l, expr)
-				JUMBLE_FIELD_SINGLE(lfirst_xid(l));
+				AppendJumble32(jstate, (const unsigned char *) &lfirst_xid(l));
 			break;
 		default:
 			elog(ERROR, "unrecognized list node type: %d",
diff --git a/src/include/nodes/queryjumble.h b/src/include/nodes/queryjumble.h
index 905f66bc0bd..ab49b2b8b93 100644
--- a/src/include/nodes/queryjumble.h
+++ b/src/include/nodes/queryjumble.h
@@ -54,6 +54,12 @@ typedef struct JumbleState
 
 	/* highest Param id we've seen, in order to start normalization correctly */
 	int			highest_extern_param_id;
+
+	/*
+	 * Count of the number of NULL nodes seen since last appending a value.
+	 * These get flushed out to the jumble buffer before subsequent appends
+	 */
+	unsigned int pending_nulls;
 } JumbleState;
 
 /* Values for the compute_query_id GUC */
-- 
2.43.0

