From 15c4e81cfd8d3fa32615694c4462026d0bac1a96 Mon Sep 17 00:00:00 2001
From: Sami Imseih <simseih@amazon.com>
Date: Fri, 30 May 2025 11:42:56 -0500
Subject: [PATCH v8 3/3] Support Squashing of External Parameters

62d712ec introduced the concept of element squashing for
quwry normalization purposes. However, it did not account for
external parameters passed to a list of elements. This adds
support to these types of values and simplifies the squashing
logic further.
---
 .../pg_stat_statements/expected/extended.out  |  36 +++--
 .../pg_stat_statements/expected/squashing.out |  24 +--
 .../pg_stat_statements/pg_stat_statements.c   |   4 +
 contrib/pg_stat_statements/sql/extended.sql   |   5 +-
 contrib/pg_stat_statements/sql/squashing.sql  |   4 +-
 src/backend/nodes/queryjumblefuncs.c          | 147 +++++++++++-------
 src/include/nodes/primnodes.h                 |   6 +-
 src/include/nodes/queryjumble.h               |  16 +-
 8 files changed, 158 insertions(+), 84 deletions(-)

diff --git a/contrib/pg_stat_statements/expected/extended.out b/contrib/pg_stat_statements/expected/extended.out
index 7da308ba84f4..6f2c231bf2ae 100644
--- a/contrib/pg_stat_statements/expected/extended.out
+++ b/contrib/pg_stat_statements/expected/extended.out
@@ -69,13 +69,13 @@ SELECT calls, rows, query FROM pg_stat_statements ORDER BY query COLLATE "C";
 (4 rows)
 
 -- Various parameter numbering patterns
+-- Unique query IDs with parameter numbers switched.
 SELECT pg_stat_statements_reset() IS NOT NULL AS t;
  t 
 ---
  t
 (1 row)
 
--- Unique query IDs with parameter numbers switched.
 SELECT WHERE ($1::int, 7) IN ((8, $2::int), ($3::int, 9)) \bind '1' '2' '3' \g
 --
 (0 rows)
@@ -96,7 +96,24 @@ SELECT WHERE $3::int IN ($1::int, $2::int) \bind '1' '2' '3' \g
 --
 (0 rows)
 
+SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
+                            query                             | calls 
+--------------------------------------------------------------+-------
+ SELECT WHERE $1::int IN ($2 /*, ... */)                      |     1
+ SELECT WHERE $1::int IN ($2 /*, ... */)                      |     1
+ SELECT WHERE $1::int IN ($2 /*, ... */)                      |     1
+ SELECT WHERE ($1::int, $4) IN (($5, $2::int), ($3::int, $6)) |     1
+ SELECT WHERE ($2::int, $4) IN (($5, $3::int), ($1::int, $6)) |     1
+ SELECT pg_stat_statements_reset() IS NOT NULL AS t           |     1
+(6 rows)
+
 -- Two groups of two queries with the same query ID.
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+ t 
+---
+ t
+(1 row)
+
 SELECT WHERE '1'::int IN ($1::int, '2'::int) \bind '1' \g
 --
 (1 row)
@@ -114,15 +131,10 @@ SELECT WHERE $2::int IN ($1::int, '2'::int) \bind '3' '4' \g
 (0 rows)
 
 SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
-                            query                             | calls 
---------------------------------------------------------------+-------
- SELECT WHERE $1::int IN ($2::int, $3::int)                   |     1
- SELECT WHERE $2::int IN ($1::int, $3::int)                   |     2
- SELECT WHERE $2::int IN ($1::int, $3::int)                   |     2
- SELECT WHERE $2::int IN ($3::int, $1::int)                   |     1
- SELECT WHERE $3::int IN ($1::int, $2::int)                   |     1
- SELECT WHERE ($1::int, $4) IN (($5, $2::int), ($3::int, $6)) |     1
- SELECT WHERE ($2::int, $4) IN (($5, $3::int), ($1::int, $6)) |     1
- SELECT pg_stat_statements_reset() IS NOT NULL AS t           |     1
-(8 rows)
+                       query                        | calls 
+----------------------------------------------------+-------
+ SELECT WHERE $1::int IN ($2 /*, ... */)            |     2
+ SELECT WHERE $1::int IN ($2 /*, ... */)            |     2
+ SELECT pg_stat_statements_reset() IS NOT NULL AS t |     1
+(3 rows)
 
diff --git a/contrib/pg_stat_statements/expected/squashing.out b/contrib/pg_stat_statements/expected/squashing.out
index 5376700fef86..ad282ac2b834 100644
--- a/contrib/pg_stat_statements/expected/squashing.out
+++ b/contrib/pg_stat_statements/expected/squashing.out
@@ -103,7 +103,7 @@ SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
  SELECT pg_stat_statements_reset() IS NOT NULL AS t |     1
 (2 rows)
 
--- external parameters will not be squashed
+-- external parameters will be squashed
 SELECT pg_stat_statements_reset() IS NOT NULL AS t;
  t 
 ---
@@ -123,14 +123,14 @@ SELECT * FROM test_squash WHERE id::text = ANY(ARRAY[$1, $2, $3, $4, $5]) \bind
 (0 rows)
 
 SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
-                                   query                                   | calls 
----------------------------------------------------------------------------+-------
- SELECT * FROM test_squash WHERE id IN ($1, $2, $3, $4, $5)                |     1
- SELECT * FROM test_squash WHERE id::text = ANY(ARRAY[$1, $2, $3, $4, $5]) |     1
- SELECT pg_stat_statements_reset() IS NOT NULL AS t                        |     1
+                                query                                 | calls 
+----------------------------------------------------------------------+-------
+ SELECT * FROM test_squash WHERE id IN ($1 /*, ... */)                |     1
+ SELECT * FROM test_squash WHERE id::text = ANY(ARRAY[$1 /*, ... */]) |     1
+ SELECT pg_stat_statements_reset() IS NOT NULL AS t                   |     1
 (3 rows)
 
--- neither are prepared statements
+-- prepared statements will also be squashed
 -- the IN and ARRAY forms of this statement will have the same queryId
 SELECT pg_stat_statements_reset() IS NOT NULL AS t;
  t 
@@ -155,11 +155,11 @@ EXECUTE p1(1, 2, 3, 4, 5);
 
 DEALLOCATE p1;
 SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
-                           query                            | calls 
-------------------------------------------------------------+-------
- DEALLOCATE $1                                              |     2
- SELECT * FROM test_squash WHERE id IN ($1, $2, $3, $4, $5) |     2
- SELECT pg_stat_statements_reset() IS NOT NULL AS t         |     1
+                         query                         | calls 
+-------------------------------------------------------+-------
+ DEALLOCATE $1                                         |     2
+ SELECT * FROM test_squash WHERE id IN ($1 /*, ... */) |     2
+ SELECT pg_stat_statements_reset() IS NOT NULL AS t    |     1
 (3 rows)
 
 -- More conditions in the query
diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c
index b08985c0051d..47f74b1e9ac4 100644
--- a/contrib/pg_stat_statements/pg_stat_statements.c
+++ b/contrib/pg_stat_statements/pg_stat_statements.c
@@ -2842,6 +2842,10 @@ generate_normalized_query(JumbleState *jstate, const char *query,
 		int			off,		/* Offset from start for cur tok */
 					tok_len;	/* Length (in bytes) of that tok */
 
+		if (jstate->clocations[i].extern_param &&
+			!jstate->has_squashed_lists)
+			continue;
+
 		off = jstate->clocations[i].location;
 
 		/* Adjust recorded location if we're dealing with partial string */
diff --git a/contrib/pg_stat_statements/sql/extended.sql b/contrib/pg_stat_statements/sql/extended.sql
index a366658a53a7..ffb5b1628190 100644
--- a/contrib/pg_stat_statements/sql/extended.sql
+++ b/contrib/pg_stat_statements/sql/extended.sql
@@ -21,17 +21,18 @@ SELECT $1 \bind 'unnamed_val1' \g
 SELECT calls, rows, query FROM pg_stat_statements ORDER BY query COLLATE "C";
 
 -- Various parameter numbering patterns
-SELECT pg_stat_statements_reset() IS NOT NULL AS t;
 -- Unique query IDs with parameter numbers switched.
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
 SELECT WHERE ($1::int, 7) IN ((8, $2::int), ($3::int, 9)) \bind '1' '2' '3' \g
 SELECT WHERE ($2::int, 10) IN ((11, $3::int), ($1::int, 12)) \bind '1' '2' '3' \g
 SELECT WHERE $1::int IN ($2::int, $3::int) \bind '1' '2' '3' \g
 SELECT WHERE $2::int IN ($3::int, $1::int) \bind '1' '2' '3' \g
 SELECT WHERE $3::int IN ($1::int, $2::int) \bind '1' '2' '3' \g
+SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
 -- Two groups of two queries with the same query ID.
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
 SELECT WHERE '1'::int IN ($1::int, '2'::int) \bind '1' \g
 SELECT WHERE '4'::int IN ($1::int, '5'::int) \bind '2' \g
 SELECT WHERE $2::int IN ($1::int, '1'::int) \bind '1' '2' \g
 SELECT WHERE $2::int IN ($1::int, '2'::int) \bind '3' '4' \g
-
 SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
diff --git a/contrib/pg_stat_statements/sql/squashing.sql b/contrib/pg_stat_statements/sql/squashing.sql
index 85aae152da8e..4efd412be9bd 100644
--- a/contrib/pg_stat_statements/sql/squashing.sql
+++ b/contrib/pg_stat_statements/sql/squashing.sql
@@ -32,7 +32,7 @@ SELECT WHERE 1 IN (1, int4(1), int4(2), 2);
 SELECT WHERE 1 = ANY (ARRAY[1, int4(1), int4(2), 2]);
 SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
 
--- external parameters will not be squashed
+-- external parameters will be squashed
 SELECT pg_stat_statements_reset() IS NOT NULL AS t;
 SELECT * FROM test_squash WHERE id IN ($1, $2, $3, $4, $5)  \bind 1 2 3 4 5
 ;
@@ -40,7 +40,7 @@ SELECT * FROM test_squash WHERE id::text = ANY(ARRAY[$1, $2, $3, $4, $5]) \bind
 ;
 SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
 
--- neither are prepared statements
+-- prepared statements will also be squashed
 -- the IN and ARRAY forms of this statement will have the same queryId
 SELECT pg_stat_statements_reset() IS NOT NULL AS t;
 PREPARE p1(int, int, int, int, int) AS
diff --git a/src/backend/nodes/queryjumblefuncs.c b/src/backend/nodes/queryjumblefuncs.c
index 35552731bf69..d41e3eff2406 100644
--- a/src/backend/nodes/queryjumblefuncs.c
+++ b/src/backend/nodes/queryjumblefuncs.c
@@ -61,7 +61,7 @@ static void AppendJumble(JumbleState *jstate,
 						 const unsigned char *value, Size size);
 static void FlushPendingNulls(JumbleState *jstate);
 static void RecordExpressionLocation(JumbleState *jstate,
-									 int location, int len);
+									 int location, int len, bool extern_param);
 static void _jumbleNode(JumbleState *jstate, Node *node);
 static void _jumbleElements(JumbleState *jstate, List *elements, Node *node);
 static void _jumbleA_Const(JumbleState *jstate, Node *node);
@@ -70,6 +70,7 @@ static void _jumbleVariableSetStmt(JumbleState *jstate, Node *node);
 static void _jumbleRangeTblEntry_eref(JumbleState *jstate,
 									  RangeTblEntry *rte,
 									  Alias *expr);
+static void _jumbleParam(JumbleState *jstate, Node *node);
 
 /*
  * Given a possibly multi-statement source string, confine our attention to the
@@ -185,6 +186,7 @@ InitJumble(void)
 	jstate->clocations_count = 0;
 	jstate->highest_extern_param_id = 0;
 	jstate->pending_nulls = 0;
+	jstate->has_squashed_lists = false;
 #ifdef USE_ASSERT_CHECKING
 	jstate->total_jumble_len = 0;
 #endif
@@ -207,6 +209,10 @@ DoJumble(JumbleState *jstate, Node *node)
 	if (jstate->pending_nulls > 0)
 		FlushPendingNulls(jstate);
 
+	/* Squashed list found, reset highest_extern_param_id */
+	if (jstate->has_squashed_lists)
+		jstate->highest_extern_param_id = 0;
+
 	/* Process the jumble buffer and produce the hash value */
 	return DatumGetInt64(hash_any_extended(jstate->jumble,
 										   jstate->jumble_len,
@@ -376,13 +382,13 @@ FlushPendingNulls(JumbleState *jstate)
  * Record location of an expression within query string of query tree
  * that is currently being walked.
  *
- * Recorded locations can either be constants or the starting points of
- * lists of elements to be squashed. In the latter case, a length is
- * provided to determine the end of the squashed list and to mark the
- * location accordingly.
+ * Recorded locations can either be constants, external parameters or
+ * the starting points of lists of elements to be squashed. In the latter
+ * case, a length is provided to determine the end of the squashed list
+ * and to mark the location accordingly.
  */
 static void
-RecordExpressionLocation(JumbleState *jstate, int location, int len)
+RecordExpressionLocation(JumbleState *jstate, int location, int len, bool extern_param)
 {
 	/* -1 indicates unknown or undefined location */
 	if (location >= 0)
@@ -406,10 +412,29 @@ RecordExpressionLocation(JumbleState *jstate, int location, int len)
 		 */
 		jstate->clocations[jstate->clocations_count].length = (len > -1) ? len : -1;
 		jstate->clocations[jstate->clocations_count].squashed = (len > -1) ? true : false;
+		jstate->clocations[jstate->clocations_count].extern_param = extern_param;
 		jstate->clocations_count++;
 	}
 }
 
+/*
+ * Subroutine for IsSquashableExpression to check if a node is a
+ * constant or External Parameter.
+ */
+static bool
+IsConstOrExternalParam(Node *node)
+{
+	switch (nodeTag(node))
+	{
+		case T_Const:
+			return true;
+		case T_Param:
+			return ((Param *) node)->paramkind == PARAM_EXTERN;
+		default:
+			return false;
+	}
+}
+
 /*
  * Subroutine for _jumbleElements: Verify a few simple cases where we can
  * deduce that the expression is a constant:
@@ -422,42 +447,48 @@ RecordExpressionLocation(JumbleState *jstate, int location, int len)
 static bool
 IsSquashableExpression(Node *element)
 {
+	ListCell   *temp;
+
+	/* Unwrap RelabelType and CoerceViaIO layers */
 	if (IsA(element, RelabelType))
 		element = (Node *) ((RelabelType *) element)->arg;
 
 	if (IsA(element, CoerceViaIO))
 		element = (Node *) ((CoerceViaIO *) element)->arg;
 
-	if (IsA(element, FuncExpr))
+	switch (nodeTag(element))
 	{
-		FuncExpr   *func = (FuncExpr *) element;
-		ListCell   *temp;
+		case T_FuncExpr:
+			{
+				FuncExpr   *func = (FuncExpr *) element;
 
-		if (func->funcformat != COERCE_IMPLICIT_CAST &&
-			func->funcformat != COERCE_EXPLICIT_CAST)
-			return false;
+				/*
+				 * Only implicit/explicit casts on built-in functions are
+				 * squashable
+				 */
+				if (func->funcformat != COERCE_IMPLICIT_CAST &&
+					func->funcformat != COERCE_EXPLICIT_CAST)
+					return false;
 
-		if (func->funcid > FirstGenbkiObjectId)
-			return false;
+				if (func->funcid > FirstGenbkiObjectId)
+					return false;
 
-		foreach(temp, func->args)
-		{
-			Node	   *arg = lfirst(temp);
+				/* All arguments must be constants or external parameters */
+				foreach(temp, func->args)
+				{
+					Node	   *arg = lfirst(temp);
 
-			if (!IsA(arg, Const))	/* XXX we could recurse here instead */
-				return false;
-		}
+					if (IsConstOrExternalParam(arg))
+						return true;
+				}
 
-		return true;
+				return false;
+			}
+		default:
+			return IsConstOrExternalParam(element);
 	}
-
-	if (!IsA(element, Const))
-		return false;
-
-	return true;
 }
 
-
 /*
  * Subroutine for _jumbleElements: Verify whether the provided list
  * can be squashed, meaning it contains only constant expressions.
@@ -473,10 +504,7 @@ IsSquashableExpressionList(List *elements)
 {
 	ListCell   *temp;
 
-	/*
-	 * If squashing is disabled, or the list is too short, we don't try to
-	 * squash it.
-	 */
+	/* If the list only has 1 element, don't squash it */
 	if (list_length(elements) < 2)
 		return false;
 
@@ -494,7 +522,7 @@ IsSquashableExpressionList(List *elements)
 #define JUMBLE_ELEMENTS(list, node) \
 	_jumbleElements(jstate, (List *) expr->list, node)
 #define JUMBLE_LOCATION(location) \
-	RecordExpressionLocation(jstate, expr->location, -1)
+	RecordExpressionLocation(jstate, expr->location, -1, false)
 #define JUMBLE_FIELD(item) \
 do { \
 	if (sizeof(expr->item) == 8) \
@@ -545,8 +573,9 @@ _jumbleElements(JumbleState *jstate, List *elements, Node *node)
 			{
 				RecordExpressionLocation(jstate,
 										 aexpr->list_start + 1,
-										 (aexpr->list_end - aexpr->list_start) - 1);
+										 (aexpr->list_end - aexpr->list_start) - 1, false);
 				normalize_list = true;
+				jstate->has_squashed_lists = true;
 			}
 		}
 	}
@@ -598,26 +627,6 @@ _jumbleNode(JumbleState *jstate, Node *node)
 			break;
 	}
 
-	/* Special cases to handle outside the automated code */
-	switch (nodeTag(expr))
-	{
-		case T_Param:
-			{
-				Param	   *p = (Param *) node;
-
-				/*
-				 * Update the highest Param id seen, in order to start
-				 * normalization correctly.
-				 */
-				if (p->paramkind == PARAM_EXTERN &&
-					p->paramid > jstate->highest_extern_param_id)
-					jstate->highest_extern_param_id = p->paramid;
-			}
-			break;
-		default:
-			break;
-	}
-
 	/* Ensure we added something to the jumble buffer */
 	Assert(jstate->total_jumble_len > prev_jumble_len);
 }
@@ -720,3 +729,35 @@ _jumbleRangeTblEntry_eref(JumbleState *jstate,
 	 */
 	JUMBLE_STRING(aliasname);
 }
+
+/*
+ * Custom query jumble function for _jumbleParam.
+ */
+static void
+_jumbleParam(JumbleState *jstate, Node *node)
+{
+	Param	   *expr = (Param *) node;
+
+	JUMBLE_FIELD(paramkind);
+	JUMBLE_FIELD(paramid);
+	JUMBLE_FIELD(paramtype);
+
+	if (expr->paramkind == PARAM_EXTERN)
+	{
+		/*
+		 * At this point, Only external parameter locations outside of
+		 * squashable lists will be recorded.
+		 */
+		RecordExpressionLocation(jstate, expr->location, -1, true);
+
+		/*
+		 * Update the highest Param id seen, in order to start normalization
+		 * correctly.
+		 *
+		 * Note: This value is reset at the end of jumbling if there exists a
+		 * squashable list. See the comment in the definition of JumbleState.
+		 */
+		if (expr->paramid > jstate->highest_extern_param_id)
+			jstate->highest_extern_param_id = expr->paramid;
+	}
+}
diff --git a/src/include/nodes/primnodes.h b/src/include/nodes/primnodes.h
index 773cdd880aa8..99d2c019c4bd 100644
--- a/src/include/nodes/primnodes.h
+++ b/src/include/nodes/primnodes.h
@@ -389,14 +389,16 @@ typedef enum ParamKind
 
 typedef struct Param
 {
+	pg_node_attr(custom_query_jumble)
+
 	Expr		xpr;
 	ParamKind	paramkind;		/* kind of parameter. See above */
 	int			paramid;		/* numeric ID for parameter */
 	Oid			paramtype;		/* pg_type OID of parameter's datatype */
 	/* typmod value, if known */
-	int32		paramtypmod pg_node_attr(query_jumble_ignore);
+	int32		paramtypmod;
 	/* OID of collation, or InvalidOid if none */
-	Oid			paramcollid pg_node_attr(query_jumble_ignore);
+	Oid			paramcollid;
 	/* token location, or -1 if unknown */
 	ParseLoc	location;
 } Param;
diff --git a/src/include/nodes/queryjumble.h b/src/include/nodes/queryjumble.h
index da7c7abed2e6..bab971162dc7 100644
--- a/src/include/nodes/queryjumble.h
+++ b/src/include/nodes/queryjumble.h
@@ -29,6 +29,13 @@ typedef struct LocationLen
 	 * of squashed constants.
 	 */
 	bool		squashed;
+
+	/*
+	 * Indicates whether a location is that of an external parameter, so it
+	 * can be decided during normalization whether the parameter number should
+	 * be replaced or kept as is.
+	 */
+	bool		extern_param;
 } LocationLen;
 
 /*
@@ -52,8 +59,15 @@ typedef struct JumbleState
 	/* Current number of valid entries in clocations array */
 	int			clocations_count;
 
-	/* highest Param id we've seen, in order to start normalization correctly */
+	/*
+	 * Highest Param id we've seen, in order to start normalization correctly.
+	 * However, if the jumble contains at least one squashed list, we
+	 * disregard the highest_extern_param_id value because parameters can
+	 * exist within the squashed list and are no longer considered for
+	 * normalization.
+	 */
 	int			highest_extern_param_id;
+	bool		has_squashed_lists;
 
 	/*
 	 * Count of the number of NULL nodes seen since last appending a value.
-- 
2.39.5 (Apple Git-154)

