From 1820a25ee972859c049d30a959b6e5f6ae29a3ba Mon Sep 17 00:00:00 2001
From: David Rowley <dgrowley@gmail.com>
Date: Tue, 25 Mar 2025 18:49:15 +1300
Subject: [PATCH v9 1/2] Fix query jumbling to account for NULL nodes

Previously NULL nodes were ignored.  This could cause issues where the
computed query ID could match for queries where fields that are next to
each other in their Node struct where one field was NULL and the other
non-NULL.  For example, the Query struct had distinctClause and sortClause
next to each other.  If someone wrote;

SELECT DISTINCT c1 FROM t;

and then;

SELECT c1 FROM t ORDER BY c1;

these would produce the same query ID since, in the first query, we
ignored the NULL sortClause and appended the jumble bytes for the
distictClause.  In the latter query, since we did nothing for the NULL
distinctClause then jumble the non-NULL sortClause, and since the node
representation stored is the same in both cases, the query IDs were
identical.

Here we fix this by always accounting for NULL nodes by recording that
we saw a NULL in the jumble buffer.  This fixes the issue as the order that
the NULL is recorded isn't the same in the above two queries.

Author: Bykov Ivan <i.bykov@modernsys.ru>
Author: Michael Paquier <michael@paquier.xyz>
Author: David Rowley <dgrowleyml@gmail.com>
Discussion: https://postgr.es/m/aafce7966e234372b2ba876c0193f1e9%40localhost.localdomain
---
 .../pg_stat_statements/expected/select.out    |  87 ++++++++++++-
 contrib/pg_stat_statements/sql/select.sql     |  20 +++
 src/backend/nodes/queryjumblefuncs.c          | 116 +++++++++++++++---
 src/include/nodes/queryjumble.h               |  10 ++
 4 files changed, 216 insertions(+), 17 deletions(-)

diff --git a/contrib/pg_stat_statements/expected/select.out b/contrib/pg_stat_statements/expected/select.out
index 1eebc2898ab..09476a7b699 100644
--- a/contrib/pg_stat_statements/expected/select.out
+++ b/contrib/pg_stat_statements/expected/select.out
@@ -19,6 +19,86 @@ SELECT 1 AS "int";
    1
 (1 row)
 
+-- LIMIT and OFFSET patterns
+-- Try some query permutations which once produced identical query IDs
+SELECT 1 AS "int" LIMIT 1;
+ int 
+-----
+   1
+(1 row)
+
+SELECT 1 AS "int" LIMIT 2;
+ int 
+-----
+   1
+(1 row)
+
+SELECT 1 AS "int" OFFSET 1;
+ int 
+-----
+(0 rows)
+
+SELECT 1 AS "int" OFFSET 2;
+ int 
+-----
+(0 rows)
+
+SELECT 1 AS "int" OFFSET 1 LIMIT 1;
+ int 
+-----
+(0 rows)
+
+SELECT 1 AS "int" OFFSET 2 LIMIT 2;
+ int 
+-----
+(0 rows)
+
+SELECT 1 AS "int" LIMIT 1 OFFSET 1;
+ int 
+-----
+(0 rows)
+
+SELECT 1 AS "int" LIMIT 3 OFFSET 3;
+ int 
+-----
+(0 rows)
+
+SELECT 1 AS "int" OFFSET 1 FETCH FIRST 2 ROW ONLY;
+ int 
+-----
+(0 rows)
+
+SELECT 1 AS "int" OFFSET 2 FETCH FIRST 3 ROW ONLY;
+ int 
+-----
+(0 rows)
+
+-- DISTINCT and ORDER BY patterns
+-- Try some query permutations which once produced identical query IDs
+SELECT DISTINCT 1 AS "int";
+ int 
+-----
+   1
+(1 row)
+
+SELECT DISTINCT 2 AS "int";
+ int 
+-----
+   2
+(1 row)
+
+SELECT 1 AS "int" ORDER BY 1;
+ int 
+-----
+   1
+(1 row)
+
+SELECT 2 AS "int" ORDER BY 1;
+ int 
+-----
+   2
+(1 row)
+
 /* this comment should not appear in the output */
 SELECT 'hello'
   -- but this one will appear
@@ -135,9 +215,14 @@ SELECT calls, rows, query FROM pg_stat_statements ORDER BY query COLLATE "C";
      3 |    3 | SELECT $1 + $2 + $3 AS "add"
      1 |    1 | SELECT $1 AS "float"
      2 |    2 | SELECT $1 AS "int"
+     2 |    2 | SELECT $1 AS "int" LIMIT $2
+     2 |    0 | SELECT $1 AS "int" OFFSET $2
+     6 |    0 | SELECT $1 AS "int" OFFSET $2 LIMIT $3
+     2 |    2 | SELECT $1 AS "int" ORDER BY 1
      1 |    2 | SELECT $1 AS i UNION SELECT $2 ORDER BY i
      1 |    1 | SELECT $1 || $2
      1 |    1 | SELECT $1, $2 LIMIT $3
+     2 |    2 | SELECT DISTINCT $1 AS "int"
      0 |    0 | SELECT calls, rows, query FROM pg_stat_statements ORDER BY query COLLATE "C"
      1 |    1 | SELECT pg_stat_statements_reset() IS NOT NULL AS t
      1 |    2 | WITH t(f) AS (                                                              +
@@ -145,7 +230,7 @@ SELECT calls, rows, query FROM pg_stat_statements ORDER BY query COLLATE "C";
        |      | )                                                                           +
        |      |   SELECT f FROM t ORDER BY f
      1 |    1 | select $1::jsonb ? $2
-(12 rows)
+(17 rows)
 
 SELECT pg_stat_statements_reset() IS NOT NULL AS t;
  t 
diff --git a/contrib/pg_stat_statements/sql/select.sql b/contrib/pg_stat_statements/sql/select.sql
index 9a35471b271..c5e0b84ee5b 100644
--- a/contrib/pg_stat_statements/sql/select.sql
+++ b/contrib/pg_stat_statements/sql/select.sql
@@ -12,6 +12,26 @@ SELECT pg_stat_statements_reset() IS NOT NULL AS t;
 --
 SELECT 1 AS "int";
 
+-- LIMIT and OFFSET patterns
+-- Try some query permutations which once produced identical query IDs
+SELECT 1 AS "int" LIMIT 1;
+SELECT 1 AS "int" LIMIT 2;
+SELECT 1 AS "int" OFFSET 1;
+SELECT 1 AS "int" OFFSET 2;
+SELECT 1 AS "int" OFFSET 1 LIMIT 1;
+SELECT 1 AS "int" OFFSET 2 LIMIT 2;
+SELECT 1 AS "int" LIMIT 1 OFFSET 1;
+SELECT 1 AS "int" LIMIT 3 OFFSET 3;
+SELECT 1 AS "int" OFFSET 1 FETCH FIRST 2 ROW ONLY;
+SELECT 1 AS "int" OFFSET 2 FETCH FIRST 3 ROW ONLY;
+
+-- DISTINCT and ORDER BY patterns
+-- Try some query permutations which once produced identical query IDs
+SELECT DISTINCT 1 AS "int";
+SELECT DISTINCT 2 AS "int";
+SELECT 1 AS "int" ORDER BY 1;
+SELECT 2 AS "int" ORDER BY 1;
+
 /* this comment should not appear in the output */
 SELECT 'hello'
   -- but this one will appear
diff --git a/src/backend/nodes/queryjumblefuncs.c b/src/backend/nodes/queryjumblefuncs.c
index 62d6cfb7ac1..9d206afef15 100644
--- a/src/backend/nodes/queryjumblefuncs.c
+++ b/src/backend/nodes/queryjumblefuncs.c
@@ -58,6 +58,9 @@ bool		query_id_squash_values = false;
  */
 bool		query_id_enabled = false;
 
+static JumbleState *InitJumble(void);
+static uint64 DoJumble(JumbleState *jstate, Node *node);
+static void FlushPendingNulls(JumbleState *jstate);
 static void AppendJumble(JumbleState *jstate,
 						 const unsigned char *item, Size size);
 static void RecordConstLocation(JumbleState *jstate,
@@ -120,29 +123,22 @@ CleanQuerytext(const char *query, int *location, int *len)
 	return query;
 }
 
+/*
+ * JumbleQuery
+ *		Recursively process the given Query producing a 64-bit hash value by
+ *		hashing the relevant fields and record that value in the Query's queryId
+ *		field.  Return the JumbleState object used for jumbling the query.
+ */
 JumbleState *
 JumbleQuery(Query *query)
 {
-	JumbleState *jstate = NULL;
+	JumbleState *jstate;
 
 	Assert(IsQueryIdEnabled());
 
-	jstate = (JumbleState *) palloc(sizeof(JumbleState));
-
-	/* Set up workspace for query jumbling */
-	jstate->jumble = (unsigned char *) palloc(JUMBLE_SIZE);
-	jstate->jumble_len = 0;
-	jstate->clocations_buf_size = 32;
-	jstate->clocations = (LocationLen *)
-		palloc(jstate->clocations_buf_size * sizeof(LocationLen));
-	jstate->clocations_count = 0;
-	jstate->highest_extern_param_id = 0;
+	jstate = InitJumble();
 
-	/* Compute query ID and mark the Query node with it */
-	_jumbleNode(jstate, (Node *) query);
-	query->queryId = DatumGetUInt64(hash_any_extended(jstate->jumble,
-													  jstate->jumble_len,
-													  0));
+	query->queryId = DoJumble(jstate, (Node *) query);
 
 	/*
 	 * If we are unlucky enough to get a hash of zero, use 1 instead for
@@ -172,6 +168,51 @@ EnableQueryId(void)
 		query_id_enabled = true;
 }
 
+/*
+ * InitJumble
+ *		Allocate a JumbleState object and make it ready to jumble.
+ */
+static JumbleState *
+InitJumble(void)
+{
+	JumbleState *jstate;
+
+	jstate = (JumbleState *) palloc(sizeof(JumbleState));
+
+	/* Set up workspace for query jumbling */
+	jstate->jumble = (unsigned char *) palloc(JUMBLE_SIZE);
+	jstate->jumble_len = 0;
+	jstate->clocations_buf_size = 32;
+	jstate->clocations = (LocationLen *) palloc(jstate->clocations_buf_size *
+												sizeof(LocationLen));
+	jstate->clocations_count = 0;
+	jstate->highest_extern_param_id = 0;
+	jstate->pending_nulls = 0;
+	jstate->total_jumble_len = 0;
+
+	return jstate;
+}
+
+/*
+ * DoJumble
+ *		Jumble the given Node using the given JumbleState and return the resulting
+ *		jumble hash.
+ */
+static uint64
+DoJumble(JumbleState *jstate, Node *node)
+{
+	/* Compute query ID and mark the Query node with it */
+	_jumbleNode(jstate, node);
+
+	/* Flush any pending NULLs before doing the final hash */
+	if (jstate->pending_nulls > 0)
+		FlushPendingNulls(jstate);
+
+	return DatumGetUInt64(hash_any_extended(jstate->jumble,
+											jstate->jumble_len,
+											0));
+}
+
 /*
  * AppendJumble: Append a value that is substantive in a given query to
  * the current jumble.
@@ -205,10 +246,42 @@ AppendJumble(JumbleState *jstate, const unsigned char *item, Size size)
 		jumble_len += part_size;
 		item += part_size;
 		size -= part_size;
+
+#ifdef USE_ASSERT_CHECKING
+		jstate->total_jumble_len += part_size;
+#endif
 	}
+
 	jstate->jumble_len = jumble_len;
 }
 
+/*
+ * AppendJumbleNull
+ *		For jumbling NULL pointers
+ */
+static pg_attribute_always_inline void
+AppendJumbleNull(JumbleState *jstate)
+{
+	jstate->pending_nulls++;
+}
+
+/*
+ * FlushPendingNulls
+ *		Incorporate the pending_null value into the jumble buffer.
+ *
+ * Note: Callers must ensure that there's at least 1 pending NULL.
+ */
+static pg_attribute_always_inline void
+FlushPendingNulls(JumbleState *jstate)
+{
+	Assert(jstate->pending_nulls > 0);
+
+	AppendJumble(jstate,
+				 (const unsigned char *) &jstate->pending_nulls, 4);
+	jstate->pending_nulls = 0;
+}
+
+
 /*
  * Record location of constant within query string of query tree that is
  * currently being walked.
@@ -335,6 +408,8 @@ IsSquashableConstList(List *elements, Node **firstExpr, Node **lastExpr)
 do { \
 	if (expr->str) \
 		AppendJumble(jstate, (const unsigned char *) (expr->str), strlen(expr->str) + 1); \
+	else \
+		AppendJumbleNull(jstate); \
 } while(0)
 /* Function name used for the node field attribute custom_query_jumble. */
 #define JUMBLE_CUSTOM(nodetype, item) \
@@ -385,9 +460,15 @@ static void
 _jumbleNode(JumbleState *jstate, Node *node)
 {
 	Node	   *expr = node;
+#ifdef USE_ASSERT_CHECKING
+	Size		prev_jumble_len = jstate->total_jumble_len;
+#endif
 
 	if (expr == NULL)
+	{
+		AppendJumbleNull(jstate);
 		return;
+	}
 
 	/* Guard against stack overflow due to overly complex expressions */
 	check_stack_depth();
@@ -435,6 +516,9 @@ _jumbleNode(JumbleState *jstate, Node *node)
 		default:
 			break;
 	}
+
+	/* Ensure we added something to the jumble buffer */
+	Assert(jstate->total_jumble_len > prev_jumble_len);
 }
 
 static void
diff --git a/src/include/nodes/queryjumble.h b/src/include/nodes/queryjumble.h
index 905f66bc0bd..9414fb151ee 100644
--- a/src/include/nodes/queryjumble.h
+++ b/src/include/nodes/queryjumble.h
@@ -54,6 +54,16 @@ typedef struct JumbleState
 
 	/* highest Param id we've seen, in order to start normalization correctly */
 	int			highest_extern_param_id;
+
+	/*
+	 * Count of the number of NULL nodes seen since last appending a value.
+	 * These are flushed out to the jumble buffer before subsequent appends
+	 * and before performing the final jumble hash.
+	 */
+	unsigned int pending_nulls;
+
+	/* The total number of bytes added to the jumble buffer */
+	Size		total_jumble_len;
 } JumbleState;
 
 /* Values for the compute_query_id GUC */
-- 
2.43.0

