examine_variable ignored CollateExpr

jian he Mon, 12 Jan 2026 18:24:26 -0800

hi.

ComputeIndexAttrs:
src/backend/commands/indexcmds.c, line 1995:
            /*
             * Strip any top-level COLLATE clause.  This ensures that we treat
             * "x COLLATE y" and "(x COLLATE y)" alike.
             */
            while (IsA(expr, CollateExpr))
                expr = (Node *) ((CollateExpr *) expr)->arg;


That means in function examine_variable:
``foreach(ilist, onerel->indexlist)``
we don't need to worry about RelabelType transformed from COLLATE clause.

CreateStatistics does not have special handling for CollateExpr. Query like:
CREATE STATISTICS t_stx1 ON (a collate "C" collate "C" collate "C"
collate "POSIX") FROM t;
the stored statistics expression node will be a nested CollateExpr.

For the above example,
get_relation_info->get_relation_statistics->eval_const_expressions will
transform the nested CollateExpr into one RelabelType node if column "a"
collation is not POSIX.

RelabelType can be from the CollateExpr transformation, as mentioned above.  For
statistics, say ndistinct, ``GROUP BY A COLLATE "C"`` can have different results
from ``GROUP BY A COLLATE case_insensitive``.

Therefore in examine_variable, we can not just simple use:
``
       while (IsA(basenode, RelabelType))
               basenode = (Node *) ((RelabelType *) basenode)->arg;
``

demo:
CREATE TABLE test_stats_ext_coll (a text, b text, c int);
INSERT INTO test_stats_ext_coll SELECT chr(65 + g % 52) FROM
generate_series(1, 500) g;
CREATE STATISTICS test_stats_ext_coll_stx ON (a COLLATE
case_insensitive) FROM test_stats_ext_coll;
ANALYZE test_stats_ext_coll;

SELECT * FROM check_estimated_rows ($$SELECT count(*) FROM
test_stats_ext_coll GROUP BY a COLLATE case_insensitive $$);

with HEAD:
 estimated | actual
-----------+--------
        52 |     32

with attached PATCH:
 estimated | actual
-----------+--------
        32 |     32

function check_estimated_rows is in src/test/regress/sql/stats_ext.sql.

While working on this, I wondered whether we could move check_estimated_rows
from stats_ext.sql to test_setup.sql so that it can be reused more broadly.



--
jian
https://www.enterprisedb.com/

From aef713df81c613a56855a8d46aac198090ab1a5d Mon Sep 17 00:00:00 2001
From: jian he <[email protected]>
Date: Tue, 13 Jan 2026 10:15:13 +0800
Subject: [PATCH v1 1/1] proper handling examine_variable RelabelType

discussion: https://postgr.es/m/
---
 src/backend/utils/adt/selfuncs.c              | 62 +++++++++++++++++--
 .../regress/expected/collate.icu.utf8.out     | 49 +++++++++++++++
 src/test/regress/sql/collate.icu.utf8.sql     | 32 ++++++++++
 3 files changed, 138 insertions(+), 5 deletions(-)

diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 29fec655593..d6a26c48384 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -5661,8 +5661,34 @@ examine_variable(PlannerInfo *root, Node *node, int varRelid,
 	 * RelabelType nodes here, because the prior stripping of PlaceHolderVars
 	 * may have brought separate RelabelTypes into adjacency.
 	 */
-	while (IsA(basenode, RelabelType))
-		basenode = (Node *) ((RelabelType *) basenode)->arg;
+	if (IsA(basenode, RelabelType))
+	{
+		RelabelType *relabel = castNode(RelabelType, basenode);
+		Expr	   *expr = (Expr *) relabel;
+
+		while (expr && IsA(expr, RelabelType))
+			expr = ((RelabelType *) expr)->arg;
+
+		if (exprCollation((Node *) expr) == relabel->resultcollid)
+			basenode = (Node *) expr;
+		else
+		{
+			/*
+			 * A RelabelType is required when the input expression’s
+			 * collation differs from the resulting resultcollid.
+			 */
+			RelabelType *newrelabel = makeNode(RelabelType);
+
+			newrelabel->arg = (Expr *) expr;
+			newrelabel->resulttype = relabel->resulttype;
+			newrelabel->resulttypmod = relabel->resulttypmod;
+			newrelabel->resultcollid = relabel->resultcollid;
+			newrelabel->relabelformat = relabel->relabelformat;
+			newrelabel->location = relabel->location;
+
+			basenode = (Node *) newrelabel;
+		}
+	}
 
 	/* Fast path for a simple Var */
 	if (IsA(basenode, Var) &&
@@ -5909,9 +5935,35 @@ examine_variable(PlannerInfo *root, Node *node, int varRelid,
 
 				Assert(expr);
 
-				/* strip RelabelType before comparing it */
-				if (expr && IsA(expr, RelabelType))
-					expr = (Node *) ((RelabelType *) expr)->arg;
+				if (IsA(expr, RelabelType))
+				{
+					RelabelType *relabel = castNode(RelabelType, expr);
+					Expr	   *rexpr = (Expr *) relabel;
+
+					while (rexpr && IsA(rexpr, RelabelType))
+						rexpr = ((RelabelType *) rexpr)->arg;
+
+					if (exprCollation((Node *) rexpr) == relabel->resultcollid)
+						expr = (Node *) rexpr;
+					else
+					{
+						/*
+						 * A RelabelType is required when the input
+						 * expression’s collation differs from the resulting
+						 * resultcollid.
+						 */
+						RelabelType *newrelabel = makeNode(RelabelType);
+
+						newrelabel->arg = (Expr *) rexpr;
+						newrelabel->resulttype = relabel->resulttype;
+						newrelabel->resulttypmod = relabel->resulttypmod;
+						newrelabel->resultcollid = relabel->resultcollid;
+						newrelabel->relabelformat = relabel->relabelformat;
+						newrelabel->location = relabel->location;
+
+						expr = (Node *) newrelabel;
+					}
+				}
 
 				/* found a match, see if we can extract pg_statistic row */
 				if (equal(node, expr))
diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out
index 1325e123877..263e7ce8ae4 100644
--- a/src/test/regress/expected/collate.icu.utf8.out
+++ b/src/test/regress/expected/collate.icu.utf8.out
@@ -11,6 +11,25 @@ SELECT getdatabaseencoding() <> 'UTF8' OR
 SET client_encoding TO UTF8;
 CREATE SCHEMA collate_tests;
 SET search_path = collate_tests;
+create function check_estimated_rows(text) returns table (estimated int, actual int)
+language plpgsql as
+$$
+declare
+    ln text;
+    tmp text[];
+    first_row bool := true;
+begin
+    for ln in
+        execute format('explain analyze %s', $1)
+    loop
+        if first_row then
+            first_row := false;
+            tmp := regexp_match(ln, 'rows=(\d*) .* rows=(\d*)');
+            return query select tmp[1]::int, tmp[2]::int;
+        end if;
+    end loop;
+end;
+$$;
 CREATE TABLE collate_test1 (
     a int,
     b text COLLATE "en-x-icu" NOT NULL
@@ -1461,6 +1480,36 @@ CREATE COLLATION case_sensitive (provider = icu, locale = '');
 NOTICE:  using standard form "und" for ICU locale ""
 CREATE COLLATION case_insensitive (provider = icu, locale = '@colStrength=secondary', deterministic = false);
 NOTICE:  using standard form "und-u-ks-level2" for ICU locale "@colStrength=secondary"
+CREATE TABLE test_stats_ext_coll (a text, b text, c int);
+INSERT INTO test_stats_ext_coll SELECT chr(65 + g % 52) FROM generate_series(1, 500) g;
+CREATE STATISTICS test_stats_ext_coll_stx ON (a COLLATE case_insensitive) FROM test_stats_ext_coll;
+ANALYZE test_stats_ext_coll;
+SELECT * FROM check_estimated_rows ($$ SELECT count(*) FROM test_stats_ext_coll GROUP BY a COLLATE "C" $$);
+ estimated | actual 
+-----------+--------
+        52 |     52
+(1 row)
+
+SELECT * FROM check_estimated_rows ($$SELECT count(*) FROM test_stats_ext_coll GROUP BY a COLLATE "POSIX" $$);
+ estimated | actual 
+-----------+--------
+        52 |     52
+(1 row)
+
+SELECT * FROM check_estimated_rows ($$SELECT count(*) FROM test_stats_ext_coll GROUP BY a COLLATE case_insensitive $$);
+ estimated | actual 
+-----------+--------
+        32 |     32
+(1 row)
+
+SELECT * FROM check_estimated_rows ($$SELECT count(*) FROM test_stats_ext_coll GROUP BY a $$);
+ estimated | actual 
+-----------+--------
+        52 |     52
+(1 row)
+
+DROP FUNCTION check_estimated_rows;
+DROP TABLE test_stats_ext_coll;
 SELECT 'abc' <= 'ABC' COLLATE case_sensitive, 'abc' >= 'ABC' COLLATE case_sensitive;
  ?column? | ?column? 
 ----------+----------
diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql
index b6c54503d21..b5c22bebfa6 100644
--- a/src/test/regress/sql/collate.icu.utf8.sql
+++ b/src/test/regress/sql/collate.icu.utf8.sql
@@ -16,6 +16,26 @@ CREATE SCHEMA collate_tests;
 SET search_path = collate_tests;
 
 
+create function check_estimated_rows(text) returns table (estimated int, actual int)
+language plpgsql as
+$$
+declare
+    ln text;
+    tmp text[];
+    first_row bool := true;
+begin
+    for ln in
+        execute format('explain analyze %s', $1)
+    loop
+        if first_row then
+            first_row := false;
+            tmp := regexp_match(ln, 'rows=(\d*) .* rows=(\d*)');
+            return query select tmp[1]::int, tmp[2]::int;
+        end if;
+    end loop;
+end;
+$$;
+
 CREATE TABLE collate_test1 (
     a int,
     b text COLLATE "en-x-icu" NOT NULL
@@ -561,6 +581,18 @@ SELECT * FROM test6a WHERE b = ARRAY['äbc'] COLLATE ctest_nondet;
 CREATE COLLATION case_sensitive (provider = icu, locale = '');
 CREATE COLLATION case_insensitive (provider = icu, locale = '@colStrength=secondary', deterministic = false);
 
+CREATE TABLE test_stats_ext_coll (a text, b text, c int);
+INSERT INTO test_stats_ext_coll SELECT chr(65 + g % 52) FROM generate_series(1, 500) g;
+CREATE STATISTICS test_stats_ext_coll_stx ON (a COLLATE case_insensitive) FROM test_stats_ext_coll;
+ANALYZE test_stats_ext_coll;
+
+SELECT * FROM check_estimated_rows ($$ SELECT count(*) FROM test_stats_ext_coll GROUP BY a COLLATE "C" $$);
+SELECT * FROM check_estimated_rows ($$SELECT count(*) FROM test_stats_ext_coll GROUP BY a COLLATE "POSIX" $$);
+SELECT * FROM check_estimated_rows ($$SELECT count(*) FROM test_stats_ext_coll GROUP BY a COLLATE case_insensitive $$);
+SELECT * FROM check_estimated_rows ($$SELECT count(*) FROM test_stats_ext_coll GROUP BY a $$);
+DROP FUNCTION check_estimated_rows;
+DROP TABLE test_stats_ext_coll;
+
 SELECT 'abc' <= 'ABC' COLLATE case_sensitive, 'abc' >= 'ABC' COLLATE case_sensitive;
 SELECT 'abc' <= 'ABC' COLLATE case_insensitive, 'abc' >= 'ABC' COLLATE case_insensitive;
 
-- 
2.34.1

examine_variable ignored CollateExpr

Reply via email to