> On Thu, Feb 09, 2023 at 08:43:29PM +0100, Dmitry Dolgov wrote:
> > On Thu, Feb 09, 2023 at 06:26:51PM +0100, Alvaro Herrera wrote:
> > On 2023-Feb-09, Dmitry Dolgov wrote:
> >
> > > > On Thu, Feb 09, 2023 at 02:30:34PM +0100, Peter Eisentraut wrote:
> >
> > > > What is the point of making this a numeric setting?  Either you want
> > > > to merge all values or you don't want to merge any values.
> > >
> > > At least in theory the definition of "too many constants" is different
> > > for different use cases and I see allowing to configure it as a way of
> > > reducing the level of surprise here.
> >
> > I was thinking about this a few days ago and I agree that we don't
> > necessarily want to make it just a boolean thing; we may want to make it
> > more complex.  One trivial idea is to make it group entries in powers of
> > 10: for 0-9 elements, you get one entry, and 10-99 you get a different
> > one, and so on:
> >
> > # group everything in a single bucket
> > const_merge_threshold = true / yes / on
> >
> > # group 0-9, 10-99, 100-999, 1000-9999
> > const_merge_treshold = powers
> >
> > Ideally the value would be represented somehow in the query text. For
> > example
> >
> >                          query                            | calls
> > ----------------------------------------------------------+-------
> >  select * from test where i in ({... 0-9 entries ...})    |     2
> >  select * from test where i in ({... 10-99 entries ...})  |     1
> >
> > What do you think?  The jumble would have to know how to reduce all
> > values within each power-of-ten group to one specific value, but I don't
> > think that should be particularly difficult.
>
> Yeah, it sounds appealing and conveniently addresses the question of
> losing the information about how many constants originally were there.
> Not sure if the example above would be the most natural way to represent
> it in the query text, but otherwise I'm going to try implementing this.
> Stay tuned.

It took me couple of evenings, here is what I've got:

* The representation is not that far away from your proposal, I've
  settled on:

    SELECT * FROM test_merge WHERE id IN (... [10-99 entries])

* To not reinvent the wheel, I've reused decimalLenght function from
  numutils, hence one more patch to make it available to reuse.

* This approach resolves my concerns about letting people tuning
  the behaviour of merging, as now it's possible to distinguish between
  calls with different number of constants up to the power of 10. So
  I've decided to simplify the configuration and make the guc boolean to
  turn it off or on.

* To separate queries with constants falling into different ranges
  (10-99, 100-999, etc), the order of magnitude is added into the jumble
  hash.

* I've incorporated feedback from Sergei and David, as well as tried to
  make comments and documentation more clear.

Any feedback is welcomed, thanks!
>From dbb4eab9f3efbcee2326278be6f70ff52685b2b0 Mon Sep 17 00:00:00 2001
From: Dmitrii Dolgov <9erthali...@gmail.com>
Date: Fri, 17 Feb 2023 10:17:55 +0100
Subject: [PATCH v13 1/2] Reusable decimalLength functions

Move out decimalLength functions to reuse in the following patch.
---
 src/backend/utils/adt/numutils.c | 50 +-----------------------
 src/include/utils/numutils.h     | 67 ++++++++++++++++++++++++++++++++
 2 files changed, 68 insertions(+), 49 deletions(-)
 create mode 100644 src/include/utils/numutils.h

diff --git a/src/backend/utils/adt/numutils.c b/src/backend/utils/adt/numutils.c
index 471fbb7ee6..df7418cce7 100644
--- a/src/backend/utils/adt/numutils.c
+++ b/src/backend/utils/adt/numutils.c
@@ -18,9 +18,8 @@
 #include <limits.h>
 #include <ctype.h>
 
-#include "common/int.h"
 #include "utils/builtins.h"
-#include "port/pg_bitutils.h"
+#include "utils/numutils.h"
 
 /*
  * A table of all two-digit numbers. This is used to speed up decimal digit
@@ -38,53 +37,6 @@ static const char DIGIT_TABLE[200] =
 "80" "81" "82" "83" "84" "85" "86" "87" "88" "89"
 "90" "91" "92" "93" "94" "95" "96" "97" "98" "99";
 
-/*
- * Adapted from http://graphics.stanford.edu/~seander/bithacks.html#IntegerLog10
- */
-static inline int
-decimalLength32(const uint32 v)
-{
-	int			t;
-	static const uint32 PowersOfTen[] = {
-		1, 10, 100,
-		1000, 10000, 100000,
-		1000000, 10000000, 100000000,
-		1000000000
-	};
-
-	/*
-	 * Compute base-10 logarithm by dividing the base-2 logarithm by a
-	 * good-enough approximation of the base-2 logarithm of 10
-	 */
-	t = (pg_leftmost_one_pos32(v) + 1) * 1233 / 4096;
-	return t + (v >= PowersOfTen[t]);
-}
-
-static inline int
-decimalLength64(const uint64 v)
-{
-	int			t;
-	static const uint64 PowersOfTen[] = {
-		UINT64CONST(1), UINT64CONST(10),
-		UINT64CONST(100), UINT64CONST(1000),
-		UINT64CONST(10000), UINT64CONST(100000),
-		UINT64CONST(1000000), UINT64CONST(10000000),
-		UINT64CONST(100000000), UINT64CONST(1000000000),
-		UINT64CONST(10000000000), UINT64CONST(100000000000),
-		UINT64CONST(1000000000000), UINT64CONST(10000000000000),
-		UINT64CONST(100000000000000), UINT64CONST(1000000000000000),
-		UINT64CONST(10000000000000000), UINT64CONST(100000000000000000),
-		UINT64CONST(1000000000000000000), UINT64CONST(10000000000000000000)
-	};
-
-	/*
-	 * Compute base-10 logarithm by dividing the base-2 logarithm by a
-	 * good-enough approximation of the base-2 logarithm of 10
-	 */
-	t = (pg_leftmost_one_pos64(v) + 1) * 1233 / 4096;
-	return t + (v >= PowersOfTen[t]);
-}
-
 static const int8 hexlookup[128] = {
 	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
diff --git a/src/include/utils/numutils.h b/src/include/utils/numutils.h
new file mode 100644
index 0000000000..876e64f2df
--- /dev/null
+++ b/src/include/utils/numutils.h
@@ -0,0 +1,67 @@
+/*-------------------------------------------------------------------------
+ *
+ * numutils.h
+ *	  Decimal length functions for numutils.c
+ *
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/utils/numutils.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef NUMUTILS_H
+#define NUMUTILS_H
+
+#include "common/int.h"
+#include "port/pg_bitutils.h"
+
+/*
+ * Adapted from http://graphics.stanford.edu/~seander/bithacks.html#IntegerLog10
+ */
+static inline int
+decimalLength32(const uint32 v)
+{
+	int			t;
+	static const uint32 PowersOfTen[] = {
+		1, 10, 100,
+		1000, 10000, 100000,
+		1000000, 10000000, 100000000,
+		1000000000
+	};
+
+	/*
+	 * Compute base-10 logarithm by dividing the base-2 logarithm by a
+	 * good-enough approximation of the base-2 logarithm of 10
+	 */
+	t = (pg_leftmost_one_pos32(v) + 1) * 1233 / 4096;
+	return t + (v >= PowersOfTen[t]);
+}
+
+static inline int
+decimalLength64(const uint64 v)
+{
+	int			t;
+	static const uint64 PowersOfTen[] = {
+		UINT64CONST(1), UINT64CONST(10),
+		UINT64CONST(100), UINT64CONST(1000),
+		UINT64CONST(10000), UINT64CONST(100000),
+		UINT64CONST(1000000), UINT64CONST(10000000),
+		UINT64CONST(100000000), UINT64CONST(1000000000),
+		UINT64CONST(10000000000), UINT64CONST(100000000000),
+		UINT64CONST(1000000000000), UINT64CONST(10000000000000),
+		UINT64CONST(100000000000000), UINT64CONST(1000000000000000),
+		UINT64CONST(10000000000000000), UINT64CONST(100000000000000000),
+		UINT64CONST(1000000000000000000), UINT64CONST(10000000000000000000)
+	};
+
+	/*
+	 * Compute base-10 logarithm by dividing the base-2 logarithm by a
+	 * good-enough approximation of the base-2 logarithm of 10
+	 */
+	t = (pg_leftmost_one_pos64(v) + 1) * 1233 / 4096;
+	return t + (v >= PowersOfTen[t]);
+}
+
+#endif							/* NUMUTILS_H */
-- 
2.32.0

>From 717595218d9668b5ea5bd5d88052959786148d7d Mon Sep 17 00:00:00 2001
From: Dmitrii Dolgov <9erthali...@gmail.com>
Date: Sun, 24 Jul 2022 11:43:25 +0200
Subject: [PATCH v13 2/2] Prevent jumbling of every element in ArrayExpr

pg_stat_statements produces multiple entries for queries like

    SELECT something FROM table WHERE col IN (1, 2, 3, ...)

depending on the number of parameters, because every element of
ArrayExpr is jumbled. In certain situations it's undesirable, especially
if the list becomes too large.

Make an array of Const expressions contribute only the number of
elements (up to power of 10) to the jumble hash. Allow to enable this
behavior via the new GUC query_id_const_merge with the default value off.

Reviewed-by: Zhihong Yu, Sergey Dudoladov, Robert Haas, Tom Lane,
Michael Paquier, Sergei Kornilov, Alvaro Herrera, David Geier
Tested-by: Chengxi Sun
---
 .../expected/pg_stat_statements.out           | 227 ++++++++++++++++++
 .../pg_stat_statements/pg_stat_statements.c   |  46 +++-
 .../sql/pg_stat_statements.sql                |  68 ++++++
 doc/src/sgml/config.sgml                      |  29 +++
 doc/src/sgml/pgstatstatements.sgml            |  27 ++-
 src/backend/nodes/gen_node_support.pl         |   2 +-
 src/backend/nodes/queryjumblefuncs.c          | 111 ++++++++-
 src/backend/utils/misc/guc_tables.c           |  10 +
 src/backend/utils/misc/postgresql.conf.sample |   2 +-
 src/include/nodes/primnodes.h                 |   2 +
 src/include/nodes/queryjumble.h               |  12 +-
 11 files changed, 517 insertions(+), 19 deletions(-)

diff --git a/contrib/pg_stat_statements/expected/pg_stat_statements.out b/contrib/pg_stat_statements/expected/pg_stat_statements.out
index 8c0b2235e8..f32e8cc5f0 100644
--- a/contrib/pg_stat_statements/expected/pg_stat_statements.out
+++ b/contrib/pg_stat_statements/expected/pg_stat_statements.out
@@ -1156,4 +1156,231 @@ SELECT COUNT(*) FROM pg_stat_statements WHERE query LIKE '%SELECT GROUPING%';
      2
 (1 row)
 
+--
+-- Const merging functionality
+--
+CREATE TABLE test_merge (id int, data int);
+-- IN queries
+-- No merging is performed, as a baseline result
+SELECT pg_stat_statements_reset();
+ pg_stat_statements_reset 
+--------------------------
+ 
+(1 row)
+
+SELECT * FROM test_merge WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9);
+ id | data 
+----+------
+(0 rows)
+
+SELECT * FROM test_merge WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
+ id | data 
+----+------
+(0 rows)
+
+SELECT * FROM test_merge WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11);
+ id | data 
+----+------
+(0 rows)
+
+SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
+                                        query                                        | calls 
+-------------------------------------------------------------------------------------+-------
+ SELECT * FROM test_merge WHERE id IN ($1, $2, $3, $4, $5, $6, $7, $8, $9)           |     1
+ SELECT * FROM test_merge WHERE id IN ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)      |     1
+ SELECT * FROM test_merge WHERE id IN ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11) |     1
+ SELECT pg_stat_statements_reset()                                                   |     1
+ SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C"              |     0
+(5 rows)
+
+-- Normal scenario, too many simple constants for an IN query
+SET query_id_const_merge = on;
+SELECT pg_stat_statements_reset();
+ pg_stat_statements_reset 
+--------------------------
+ 
+(1 row)
+
+SELECT * FROM test_merge WHERE id IN (1);
+ id | data 
+----+------
+(0 rows)
+
+SELECT * FROM test_merge WHERE id IN (1, 2, 3);
+ id | data 
+----+------
+(0 rows)
+
+SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
+                                 query                                  | calls 
+------------------------------------------------------------------------+-------
+ SELECT * FROM test_merge WHERE id IN ($1)                              |     1
+ SELECT * FROM test_merge WHERE id IN ($1, $2, $3)                      |     1
+ SELECT pg_stat_statements_reset()                                      |     1
+ SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C" |     0
+(4 rows)
+
+SELECT * FROM test_merge WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9);
+ id | data 
+----+------
+(0 rows)
+
+SELECT * FROM test_merge WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
+ id | data 
+----+------
+(0 rows)
+
+SELECT * FROM test_merge WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11);
+ id | data 
+----+------
+(0 rows)
+
+SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
+                                   query                                   | calls 
+---------------------------------------------------------------------------+-------
+ SELECT * FROM test_merge WHERE id IN ($1)                                 |     1
+ SELECT * FROM test_merge WHERE id IN ($1, $2, $3)                         |     1
+ SELECT * FROM test_merge WHERE id IN ($1, $2, $3, $4, $5, $6, $7, $8, $9) |     1
+ SELECT * FROM test_merge WHERE id IN (... [10-99 entries])                |     2
+ SELECT pg_stat_statements_reset()                                         |     1
+ SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C"    |     1
+(6 rows)
+
+-- Second order of magnitude, brace yourself
+SELECT pg_stat_statements_reset();
+ pg_stat_statements_reset 
+--------------------------
+ 
+(1 row)
+
+SELECT * FROM test_merge WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110);
+ id | data 
+----+------
+(0 rows)
+
+SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
+                                 query                                  | calls 
+------------------------------------------------------------------------+-------
+ SELECT * FROM test_merge WHERE id IN (... [100-999 entries])           |     1
+ SELECT pg_stat_statements_reset()                                      |     1
+ SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C" |     0
+(3 rows)
+
+-- With gaps on the threshold
+SELECT pg_stat_statements_reset();
+ pg_stat_statements_reset 
+--------------------------
+ 
+(1 row)
+
+SELECT * FROM test_merge WHERE id IN (1, 2, 3, 4);
+ id | data 
+----+------
+(0 rows)
+
+SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
+                                 query                                  | calls 
+------------------------------------------------------------------------+-------
+ SELECT * FROM test_merge WHERE id IN ($1, $2, $3, $4)                  |     1
+ SELECT pg_stat_statements_reset()                                      |     1
+ SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C" |     0
+(3 rows)
+
+SELECT * FROM test_merge WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ id | data 
+----+------
+(0 rows)
+
+SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
+                                 query                                  | calls 
+------------------------------------------------------------------------+-------
+ SELECT * FROM test_merge WHERE id IN ($1, $2, $3, $4)                  |     1
+ SELECT * FROM test_merge WHERE id IN (... [10-99 entries])             |     1
+ SELECT pg_stat_statements_reset()                                      |     1
+ SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C" |     1
+(4 rows)
+
+-- No unmerged constants
+SELECT pg_stat_statements_reset();
+ pg_stat_statements_reset 
+--------------------------
+ 
+(1 row)
+
+SELECT * FROM test_merge WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11);
+ id | data 
+----+------
+(0 rows)
+
+SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
+                                 query                                  | calls 
+------------------------------------------------------------------------+-------
+ SELECT * FROM test_merge WHERE id IN (... [10-99 entries])             |     1
+ SELECT pg_stat_statements_reset()                                      |     1
+ SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C" |     0
+(3 rows)
+
+-- More conditions in the query
+SELECT pg_stat_statements_reset();
+ pg_stat_statements_reset 
+--------------------------
+ 
+(1 row)
+
+SELECT * FROM test_merge WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9) and data = 2;
+ id | data 
+----+------
+(0 rows)
+
+SELECT * FROM test_merge WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10) and data = 2;
+ id | data 
+----+------
+(0 rows)
+
+SELECT * FROM test_merge WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11) and data = 2;
+ id | data 
+----+------
+(0 rows)
+
+SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
+                                          query                                           | calls 
+------------------------------------------------------------------------------------------+-------
+ SELECT * FROM test_merge WHERE id IN ($1, $2, $3, $4, $5, $6, $7, $8, $9) and data = $10 |     1
+ SELECT * FROM test_merge WHERE id IN (... [10-99 entries]) and data = $3                 |     2
+ SELECT pg_stat_statements_reset()                                                        |     1
+ SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C"                   |     0
+(4 rows)
+
+-- On table, numeric type causes every constant being wrapped into functions.
+CREATE TABLE test_merge_numeric (id int, data numeric(5, 2));
+SELECT pg_stat_statements_reset();
+ pg_stat_statements_reset 
+--------------------------
+ 
+(1 row)
+
+SELECT * FROM test_merge_numeric WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11);
+ id | data 
+----+------
+(0 rows)
+
+SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
+                                 query                                  | calls 
+------------------------------------------------------------------------+-------
+ SELECT * FROM test_merge_numeric WHERE id IN (... [10-99 entries])     |     1
+ SELECT pg_stat_statements_reset()                                      |     1
+ SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C" |     0
+(3 rows)
+
+-- Test constants evaluation
+WITH cte AS (
+    SELECT 'const' as const FROM test_merge
+)
+SELECT ARRAY['a', 'b', 'c', const::varchar] AS result
+FROM cte;
+ result 
+--------
+(0 rows)
+
+RESET query_id_const_merge;
 DROP EXTENSION pg_stat_statements;
diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c
index ad1fe44496..ece56e99b2 100644
--- a/contrib/pg_stat_statements/pg_stat_statements.c
+++ b/contrib/pg_stat_statements/pg_stat_statements.c
@@ -2666,6 +2666,10 @@ generate_normalized_query(JumbleState *jstate, const char *query,
 				n_quer_loc = 0, /* Normalized query byte location */
 				last_off = 0,	/* Offset from start for previous tok */
 				last_tok_len = 0;	/* Length (in bytes) of that tok */
+	bool		skip = false; 	/* Signals that certain constants are
+								   merged together and have to be skipped */
+	int 		magnitude; 		/* Order of magnitute for number merged constants */
+
 
 	/*
 	 * Get constants' lengths (core system only gives us locations).  Note
@@ -2689,7 +2693,6 @@ generate_normalized_query(JumbleState *jstate, const char *query,
 	{
 		int			off,		/* Offset from start for cur tok */
 					tok_len;	/* Length (in bytes) of that tok */
-
 		off = jstate->clocations[i].location;
 		/* Adjust recorded location if we're dealing with partial string */
 		off -= query_loc;
@@ -2704,12 +2707,43 @@ generate_normalized_query(JumbleState *jstate, const char *query,
 		len_to_wrt -= last_tok_len;
 
 		Assert(len_to_wrt >= 0);
-		memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt);
-		n_quer_loc += len_to_wrt;
 
-		/* And insert a param symbol in place of the constant token */
-		n_quer_loc += sprintf(norm_query + n_quer_loc, "$%d",
-							  i + 1 + jstate->highest_extern_param_id);
+		/* Normal path, non merged constant */
+		magnitude = jstate->clocations[i].magnitude;
+		if (magnitude == 0)
+		{
+			memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt);
+			n_quer_loc += len_to_wrt;
+
+			/* And insert a param symbol in place of the constant token */
+			n_quer_loc += sprintf(norm_query + n_quer_loc, "$%d",
+								  i + 1 + jstate->highest_extern_param_id);
+
+			/* In case previous constants were merged away, stop doing that */
+			if (skip)
+				skip = false;
+		}
+		/* The firsts merged constant */
+		else if (!skip)
+		{
+			static const uint32 powers_of_ten[] = {
+				1, 10, 100,
+				1000, 10000, 100000,
+				1000000, 10000000, 100000000,
+				1000000000
+			};
+			int lower_merged = powers_of_ten[magnitude - 1];
+			int upper_merged = powers_of_ten[magnitude];
+
+			memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt);
+			n_quer_loc += len_to_wrt;
+
+			/* Skip the following until a non merged constant appear */
+			skip = true;
+			n_quer_loc += sprintf(norm_query + n_quer_loc, "... [%d-%d entries]",
+								  lower_merged, upper_merged - 1);
+		}
+		/* Otherwise the constant is merged away */
 
 		quer_loc = off + tok_len;
 		last_off = off;
diff --git a/contrib/pg_stat_statements/sql/pg_stat_statements.sql b/contrib/pg_stat_statements/sql/pg_stat_statements.sql
index cebde7392b..dc4b3e355b 100644
--- a/contrib/pg_stat_statements/sql/pg_stat_statements.sql
+++ b/contrib/pg_stat_statements/sql/pg_stat_statements.sql
@@ -473,4 +473,72 @@ SELECT (
 
 SELECT COUNT(*) FROM pg_stat_statements WHERE query LIKE '%SELECT GROUPING%';
 
+--
+-- Const merging functionality
+--
+CREATE TABLE test_merge (id int, data int);
+
+-- IN queries
+
+-- No merging is performed, as a baseline result
+SELECT pg_stat_statements_reset();
+SELECT * FROM test_merge WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9);
+SELECT * FROM test_merge WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
+SELECT * FROM test_merge WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11);
+SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
+
+-- Normal scenario, too many simple constants for an IN query
+SET query_id_const_merge = on;
+
+SELECT pg_stat_statements_reset();
+SELECT * FROM test_merge WHERE id IN (1);
+SELECT * FROM test_merge WHERE id IN (1, 2, 3);
+SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
+
+SELECT * FROM test_merge WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9);
+SELECT * FROM test_merge WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
+SELECT * FROM test_merge WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11);
+SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
+
+-- Second order of magnitude, brace yourself
+SELECT pg_stat_statements_reset();
+SELECT * FROM test_merge WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110);
+SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
+
+-- With gaps on the threshold
+SELECT pg_stat_statements_reset();
+SELECT * FROM test_merge WHERE id IN (1, 2, 3, 4);
+SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
+
+SELECT * FROM test_merge WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
+
+-- No unmerged constants
+SELECT pg_stat_statements_reset();
+SELECT * FROM test_merge WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11);
+SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
+
+-- More conditions in the query
+SELECT pg_stat_statements_reset();
+
+SELECT * FROM test_merge WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9) and data = 2;
+SELECT * FROM test_merge WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10) and data = 2;
+SELECT * FROM test_merge WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11) and data = 2;
+SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
+
+-- On table, numeric type causes every constant being wrapped into functions.
+CREATE TABLE test_merge_numeric (id int, data numeric(5, 2));
+SELECT pg_stat_statements_reset();
+SELECT * FROM test_merge_numeric WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11);
+SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
+
+-- Test constants evaluation
+WITH cte AS (
+    SELECT 'const' as const FROM test_merge
+)
+SELECT ARRAY['a', 'b', 'c', const::varchar] AS result
+FROM cte;
+
+RESET query_id_const_merge;
+
 DROP EXTENSION pg_stat_statements;
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index ecd9aa73ef..5b15863ac7 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -8278,6 +8278,35 @@ COPY postgres_log FROM '/full/path/to/logfile.csv' WITH csv;
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-query-id-const-merge" xreflabel="query_id_const_merge">
+      <term><varname>query_id_const_merge</varname> (<type>bool</type>)
+      <indexterm>
+       <primary><varname>query_id_const_merge</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Specifies how arrays of constants (e.g. for an "IN" clause) contribute
+        to the query identifier computation. Normally every element of an array
+        contributes to a query identifier, which means effectively the same
+        query could get multiple different identifiers, depending of size of the
+        array.
+
+        If this parameter is on, two queries with an array will get the same
+        query identifier if the only difference between them is the number of
+        constants, both numbers is of the same order of magnitude and greater or
+        equal 10 (so the order of magnitude is greather than 1, it is not worth
+        the efforts otherwise).
+
+        The parameter could be used to reduce amount of repeating data stored
+        via <xref linkend="pgstatstatements"/> extension.  The default value is off.
+
+        The <xref linkend="pgstatstatements"/> extension will represent such
+        queries in form <literal>'(... [10-99 entries])'</literal>.
+       </para>
+      </listitem>
+     </varlistentry>
+
      </variablelist>
 
     </sect2>
diff --git a/doc/src/sgml/pgstatstatements.sgml b/doc/src/sgml/pgstatstatements.sgml
index efc36da602..84dedb870e 100644
--- a/doc/src/sgml/pgstatstatements.sgml
+++ b/doc/src/sgml/pgstatstatements.sgml
@@ -519,10 +519,29 @@
   <para>
    In some cases, queries with visibly different texts might get merged into a
    single <structname>pg_stat_statements</structname> entry.  Normally this will happen
-   only for semantically equivalent queries, but there is a small chance of
-   hash collisions causing unrelated queries to be merged into one entry.
-   (This cannot happen for queries belonging to different users or databases,
-   however.)
+   only for semantically equivalent queries, for example when queries are
+   different only in values of constants they use. Another valid possibility for
+   merging queries into a single <structname>pg_stat_statements</structname>
+   entry is when <xref linkend="guc-query-id-const-merge"/> is enabled and the
+   queries contain an array of constants of similar size:
+
+<screen>
+=# SET query_id_const_merge = on;
+=# SELECT pg_stat_statements_reset();
+=# SELECT * FROM test WHERE a IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11);
+=# SELECT * FROM test WHERE a IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12);
+=# SELECT query, calls FROM pg_stat_statements;
+-[ RECORD 1 ]------------------------------
+query | SELECT * FROM test WHERE a IN (... [10-99 entries])
+calls | 2
+-[ RECORD 2 ]------------------------------
+query | SELECT pg_stat_statements_reset()
+calls | 1
+</screen>
+
+   But there is a small chance of hash collisions causing unrelated queries to
+   be merged into one entry. (This cannot happen for queries belonging to
+   different users or databases, however.)
   </para>
 
   <para>
diff --git a/src/backend/nodes/gen_node_support.pl b/src/backend/nodes/gen_node_support.pl
index ecbcadb8bf..93aed18c2e 100644
--- a/src/backend/nodes/gen_node_support.pl
+++ b/src/backend/nodes/gen_node_support.pl
@@ -1308,7 +1308,7 @@ _jumble${n}(JumbleState *jstate, Node *node)
 			# Track the node's location only if directly requested.
 			if ($query_jumble_location)
 			{
-				print $jff "\tJUMBLE_LOCATION($f);\n"
+				print $jff "\tJUMBLE_LOCATION($f, false);\n"
 				  unless $query_jumble_ignore;
 			}
 		}
diff --git a/src/backend/nodes/queryjumblefuncs.c b/src/backend/nodes/queryjumblefuncs.c
index d7fd72d70f..b2508cb0bd 100644
--- a/src/backend/nodes/queryjumblefuncs.c
+++ b/src/backend/nodes/queryjumblefuncs.c
@@ -37,17 +37,26 @@
 #include "nodes/queryjumble.h"
 #include "parser/scansup.h"
 
+#include "utils/numutils.h"
+
 #define JUMBLE_SIZE				1024	/* query serialization buffer size */
 
+#define QUERY_ID_CONST_MERGE_THRESHOLD 	10 	/* when to start merging constants,
+											 * purely to avoid unnecessary work */
+
 /* GUC parameters */
 int			compute_query_id = COMPUTE_QUERY_ID_AUTO;
 
+/* Whether to merge constants in a list when computing query_id */
+bool		query_id_const_merge = false;
+
 /* True when compute_query_id is ON, or AUTO and a module requests them */
 bool		query_id_enabled = false;
 
 static void AppendJumble(JumbleState *jstate,
 						 const unsigned char *item, Size size);
-static void RecordConstLocation(JumbleState *jstate, int location);
+static void RecordConstLocation(JumbleState *jstate,
+								int location, int magnitude);
 static void _jumbleNode(JumbleState *jstate, Node *node);
 static void _jumbleA_Const(JumbleState *jstate, Node *node);
 static void _jumbleList(JumbleState *jstate, Node *node);
@@ -186,11 +195,18 @@ AppendJumble(JumbleState *jstate, const unsigned char *item, Size size)
 }
 
 /*
- * Record location of constant within query string of query tree
- * that is currently being walked.
+ * Record location of constant within query string of query tree that is
+ * currently being walked.
+ *
+ * Magnitude argument larger than zero signals that the constant represents the
+ * first or the last element in a series of merged constants, and everything
+ * but such first/last element will contribute nothing to the jumble hash. In
+ * this case magnitute value describes order of magnitute for the number of
+ * elements in the series (i.e. how many digits it has), to represent the fact
+ * of merging later on.
  */
 static void
-RecordConstLocation(JumbleState *jstate, int location)
+RecordConstLocation(JumbleState *jstate, int location, int magnitude)
 {
 	/* -1 indicates unknown or undefined location */
 	if (location >= 0)
@@ -206,15 +222,73 @@ RecordConstLocation(JumbleState *jstate, int location)
 		}
 		jstate->clocations[jstate->clocations_count].location = location;
 		/* initialize lengths to -1 to simplify third-party module usage */
+		jstate->clocations[jstate->clocations_count].magnitude = magnitude;
 		jstate->clocations[jstate->clocations_count].length = -1;
 		jstate->clocations_count++;
 	}
 }
 
+/*
+ * Verify if the provided list contains could be merged down, which means it
+ * contains only constant expressions and its length is greater than or equal
+ * to CONST_MERGE_THRESHOLD.
+ *
+ * Return value is an order of magnitude for size of the list (to use for
+ * representation purposes later on) if merging is possible, otherwise zero.
+ *
+ * Note that this function searches only for Const directly and do not tries to
+ * simplify expressions.
+ */
+static int
+IsMergeableConstList(List *elements, Const **firstConst, Const **lastConst)
+{
+	ListCell   *temp;
+	Node	   *firstExpr = NULL;
+
+	if (elements == NULL)
+		return 0;
+
+	if (!query_id_const_merge)
+	{
+		/* Merging is disabled, process everything one by one */
+		return 0;
+	}
+
+	if (elements->length < QUERY_ID_CONST_MERGE_THRESHOLD)
+	{
+		/* It is not worth it to consider small lists for merging */
+		return 0;
+	}
+
+	firstExpr = linitial(elements);
+
+	/*
+	 * If the first expression is a constant, verify if the following elements
+	 * are constants as well. If yes, the list is eligible for merging, and the
+	 * order of magnitude need to be calculated.
+	 */
+	if (IsA(firstExpr, Const))
+	{
+		foreach(temp, elements)
+			if (!IsA(lfirst(temp), Const))
+				return 0;
+
+		*firstConst = (Const *) firstExpr;
+		*lastConst = llast_node(Const, elements);
+		return decimalLength32(elements->length);
+	}
+
+	/*
+	 * If we end up here, it means no constants merging is possible, process
+	 * the list as usual.
+	 */
+	return 0;
+}
+
 #define JUMBLE_NODE(item) \
 	_jumbleNode(jstate, (Node *) expr->item)
-#define JUMBLE_LOCATION(location) \
-	RecordConstLocation(jstate, expr->location)
+#define JUMBLE_LOCATION(location, magnitude) \
+	RecordConstLocation(jstate, expr->location, magnitude)
 #define JUMBLE_FIELD(item) \
 	AppendJumble(jstate, (const unsigned char *) &(expr->item), sizeof(expr->item))
 #define JUMBLE_FIELD_SINGLE(item) \
@@ -227,6 +301,31 @@ do { \
 
 #include "queryjumblefuncs.funcs.c"
 
+static void
+_jumbleArrayExpr(JumbleState *jstate, Node *node)
+{
+	ArrayExpr *expr = (ArrayExpr *) node;
+	Const *first, *last;
+	int magnitude = IsMergeableConstList(expr->elements, &first, &last);
+
+	if (magnitude)
+	{
+		RecordConstLocation(jstate, first->location, magnitude);
+		RecordConstLocation(jstate, last->location, magnitude);
+
+		/*
+		 * After merging constants down we end up with only two constants, the
+		 * first and the last one. To distinguish the order of magnitute behind
+		 * merged constants, add its value into the jumble.
+		 */
+		JUMBLE_FIELD_SINGLE(magnitude);
+	}
+	else
+	{
+		JUMBLE_NODE(elements);
+	}
+}
+
 static void
 _jumbleNode(JumbleState *jstate, Node *node)
 {
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index 43b9d92660..11147e72fc 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -1972,6 +1972,16 @@ struct config_bool ConfigureNamesBool[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"query_id_const_merge", PGC_SUSET, STATS_MONITORING,
+			gettext_noop("Sets whether an array of constants will contribute to"
+						 " query identified computation."),
+		},
+		&query_id_const_merge,
+		false,
+		NULL, NULL, NULL
+	},
+
 	/* End-of-list marker */
 	{
 		{NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index d06074b86f..57b83b296d 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -627,7 +627,7 @@
 #log_parser_stats = off
 #log_planner_stats = off
 #log_executor_stats = off
-
+#query_id_const_merge = off
 
 #------------------------------------------------------------------------------
 # AUTOVACUUM
diff --git a/src/include/nodes/primnodes.h b/src/include/nodes/primnodes.h
index 1be1642d92..d79b663825 100644
--- a/src/include/nodes/primnodes.h
+++ b/src/include/nodes/primnodes.h
@@ -1291,6 +1291,8 @@ typedef struct CaseTestExpr
  */
 typedef struct ArrayExpr
 {
+	pg_node_attr(custom_query_jumble)
+
 	Expr		xpr;
 	/* type of expression result */
 	Oid			array_typeid pg_node_attr(query_jumble_ignore);
diff --git a/src/include/nodes/queryjumble.h b/src/include/nodes/queryjumble.h
index 204b8f74fd..001694aee9 100644
--- a/src/include/nodes/queryjumble.h
+++ b/src/include/nodes/queryjumble.h
@@ -15,6 +15,7 @@
 #define QUERYJUBLE_H
 
 #include "nodes/parsenodes.h"
+#include "nodes/nodeFuncs.h"
 
 /*
  * Struct for tracking locations/lengths of constants during normalization
@@ -23,6 +24,15 @@ typedef struct LocationLen
 {
 	int			location;		/* start offset in query text */
 	int			length;			/* length in bytes, or -1 to ignore */
+
+	/*
+	 * The constant may represent the beginning or the end of a merged
+	 * constants interval. In this case the magnitude value contains how many
+	 * constants were merged away (to a power of 10), in other words order of
+	 * manitude for number of merged constants. Otherwise the value is 0,
+	 * indicating that no merging is involved.
+	 */
+	int			magnitude;
 } LocationLen;
 
 /*
@@ -61,7 +71,7 @@ enum ComputeQueryIdType
 
 /* GUC parameters */
 extern PGDLLIMPORT int compute_query_id;
-
+extern PGDLLIMPORT bool query_id_const_merge;
 
 extern const char *CleanQuerytext(const char *query, int *location, int *len);
 extern JumbleState *JumbleQuery(Query *query, const char *querytext);
-- 
2.32.0

Reply via email to