From 39532fd798c15dca9e3cd0474b32934b01739449 Mon Sep 17 00:00:00 2001
From: Bykov Ivan <i.bykov@modernsys.ru>
Date: Mon, 10 Mar 2025 15:11:06 +0500
Subject: [PATCH] Query ID Calculation Fix for DISTINCT / ORDER BY and LIMIT /
 OFFSET

In some cases, we may encounter different query trees with the same IDs.
For two structurally similar query subnodes, the query trees may
look like this:

QueryA->subNodeOne = Value X;
QueryA->subNodeTwo = NULL;
QueryB->subNodeOne = NULL;
QueryB->subNodeTwo = Value X;

When the query jumble walker calculates the query ID, it traverses
the Query members from top to bottom and generates the same IDs
for these two queries because it does not change the hash value
when visiting an empty node (= NULL).

There are two pairs of subnodes that can trigger this error:
- distinctClause and sortClause (both contain a list of SortGroupClauses);
- limitOffset and limitCount (both contain an int8 expression).

To fix this problem, the definition of the Query node has been
changed so that a scalar field is placed between similar subnodes.
---
 .../pg_stat_statements/expected/select.out    | 49 +++++++++++++++++++
 contrib/pg_stat_statements/sql/select.sql     | 18 +++++++
 src/include/nodes/parsenodes.h                | 16 ++++--
 3 files changed, 80 insertions(+), 3 deletions(-)

diff --git a/contrib/pg_stat_statements/expected/select.out b/contrib/pg_stat_statements/expected/select.out
index dd6c756f67d..f1b71f56b7b 100644
--- a/contrib/pg_stat_statements/expected/select.out
+++ b/contrib/pg_stat_statements/expected/select.out
@@ -406,9 +406,58 @@ SELECT COUNT(*) FROM pg_stat_statements WHERE query LIKE '%SELECT GROUPING%';
      2
 (1 row)
 
+CREATE TABLE pgss_c AS
+  SELECT id FROM generate_series(1,2) AS id;
 SELECT pg_stat_statements_reset() IS NOT NULL AS t;
  t 
 ---
  t
 (1 row)
 
+-- These two queries trigger a Query ID collision in PostgreSQL versions prior to 18
+SELECT DISTINCT id FROM pgss_c;
+ id 
+----
+  2
+  1
+(2 rows)
+
+SELECT id FROM pgss_c ORDER BY id;
+ id 
+----
+  1
+  2
+(2 rows)
+
+-- These two queries trigger a Query ID collision in PostgreSQL versions prior to 18
+SELECT id FROM pgss_c LIMIT 1;
+ id 
+----
+  1
+(1 row)
+
+SELECT id FROM pgss_c OFFSET 1;
+ id 
+----
+  2
+(1 row)
+
+-- Expected four rows (Query ID collision has been fixed).
+SELECT query FROM pg_stat_statements
+  WHERE query LIKE '%pgss_c%'
+  ORDER BY query COLLATE "C";
+               query               
+-----------------------------------
+ SELECT DISTINCT id FROM pgss_c
+ SELECT id FROM pgss_c LIMIT $1
+ SELECT id FROM pgss_c OFFSET $1
+ SELECT id FROM pgss_c ORDER BY id
+(4 rows)
+
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+ t 
+---
+ t
+(1 row)
+
+DROP TABLE pgss_c;
diff --git a/contrib/pg_stat_statements/sql/select.sql b/contrib/pg_stat_statements/sql/select.sql
index eb45cb81ad2..bbc37fa0005 100644
--- a/contrib/pg_stat_statements/sql/select.sql
+++ b/contrib/pg_stat_statements/sql/select.sql
@@ -146,4 +146,22 @@ SELECT (
 ) FROM (VALUES(6,7)) v3(e,f) GROUP BY ROLLUP(e,f);
 
 SELECT COUNT(*) FROM pg_stat_statements WHERE query LIKE '%SELECT GROUPING%';
+
+CREATE TABLE pgss_c AS
+  SELECT id FROM generate_series(1,2) AS id;
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+
+-- These two queries trigger a Query ID collision in PostgreSQL versions prior to 18
+SELECT DISTINCT id FROM pgss_c;
+SELECT id FROM pgss_c ORDER BY id;
+-- These two queries trigger a Query ID collision in PostgreSQL versions prior to 18
+SELECT id FROM pgss_c LIMIT 1;
+SELECT id FROM pgss_c OFFSET 1;
+
+-- Expected four rows (Query ID collision has been fixed).
+SELECT query FROM pg_stat_statements
+  WHERE query LIKE '%pgss_c%'
+  ORDER BY query COLLATE "C";
 SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+
+DROP TABLE pgss_c;
diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h
index 67c90a2bd32..1a8ea81c690 100644
--- a/src/include/nodes/parsenodes.h
+++ b/src/include/nodes/parsenodes.h
@@ -113,6 +113,16 @@ typedef uint64 AclMode;			/* a bitmask of privilege bits */
  *	  significant (such as alias names), as is ignored anything that can
  *	  be deduced from child nodes (else we'd just be double-hashing that
  *	  piece of information).
+ *
+ *	  The query jumbling process may trigger a Query ID collision when
+ *	  two Query fields located sequentially contain the same type of nodes
+ *	  (a list of nodes), and both of them may be NULL.
+ *	  List of such groups of fields:
+ *	    - limitOffset and limitCount (which may be an int8 expression or NULL);
+ *	    - distinctClause, and sortClause (which may be a list of
+ *	      SortGroupClause entries or NULL);
+ *	  To prevent this collision, we should insert at least one scalar field
+ *	  without the attribute query_jumble_ignore between such fields.
  */
 typedef struct Query
 {
@@ -208,11 +218,11 @@ typedef struct Query
 
 	List	   *distinctClause; /* a list of SortGroupClause's */
 
-	List	   *sortClause;		/* a list of SortGroupClause's */
-
 	Node	   *limitOffset;	/* # of result tuples to skip (int8 expr) */
-	Node	   *limitCount;		/* # of result tuples to return (int8 expr) */
 	LimitOption limitOption;	/* limit type */
+	Node	   *limitCount;		/* # of result tuples to return (int8 expr) */
+
+	List	   *sortClause;		/* a list of SortGroupClause's */
 
 	List	   *rowMarks;		/* a list of RowMarkClause's */
 
-- 
2.39.5