From da1add080c1c4f54a99485faf27d4260047d658b Mon Sep 17 00:00:00 2001
From: Richard Guo <guofenglinux@gmail.com>
Date: Fri, 24 Apr 2026 14:02:26 +0900
Subject: [PATCH v1] Consider collation when proving uniqueness from unique
 indexes

relation_has_unique_index_for() has long had an XXX noting that it
doesn't check collations when matching a unique index's columns
against equality clauses.  This was benign as long as all collations
in play reduced to the same notion of equality, but has been incorrect
since nondeterministic collations were introduced in PG 12: a unique
index under a deterministic collation does not prove uniqueness under
a nondeterministic collation, and two different nondeterministic
collations need not agree on equality either.

The consequence is wrong query results for any planner optimization
that consumes the faulty proof, including inner-unique join execution
(which stops the inner search after the first match per outer row),
useless-left-join removal, semijoin-to-innerjoin reduction, and
self-join elimination.

Fix by requiring the index's collation to be compatible with the
clause's input collation.  Compatible means either what
IndexCollMatchesExprColl() already accepts (collation-insensitive
index, or matching collations), or a mismatch between two
deterministic collations: by definition a deterministic collation
treats two strings as equal iff they are byte-wise equal (see CREATE
COLLATION), so any two deterministic collations share the same
equality relation and the uniqueness proof carries over.  Any mismatch
involving a nondeterministic collation is rejected.
---
 src/backend/optimizer/path/indxpath.c         |  27 +++-
 .../regress/expected/collate.icu.utf8.out     | 123 ++++++++++++++++++
 src/test/regress/sql/collate.icu.utf8.sql     |  45 +++++++
 3 files changed, 192 insertions(+), 3 deletions(-)

diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c
index f76a5373c4b..77d22ebdabc 100644
--- a/src/backend/optimizer/path/indxpath.c
+++ b/src/backend/optimizer/path/indxpath.c
@@ -4220,6 +4220,8 @@ relation_has_unique_index_for(PlannerInfo *root, RelOptInfo *rel,
 			foreach(lc, restrictlist)
 			{
 				RestrictInfo *rinfo = (RestrictInfo *) lfirst(lc);
+				Oid			idxcoll;
+				Oid			clausecoll;
 				Node	   *rexpr;
 
 				/*
@@ -4232,10 +4234,29 @@ relation_has_unique_index_for(PlannerInfo *root, RelOptInfo *rel,
 					continue;
 
 				/*
-				 * XXX at some point we may need to check collations here too.
-				 * For the moment we assume all collations reduce to the same
-				 * notion of equality.
+				 * The index's collation must be compatible with the clause's
+				 * input collation, else the index's uniqueness does not imply
+				 * uniqueness under the clause's equality semantics.
+				 *
+				 * IndexCollMatchesExprColl() is the usual check: the index
+				 * doesn't care about collation, or the two collations match.
+				 * Additionally accept a mismatch between two deterministic
+				 * collations: by definition a deterministic collation treats
+				 * two strings as equal iff they are byte-wise equal (see
+				 * CREATE COLLATION), so any two deterministic collations
+				 * share the same equality relation and the uniqueness proof
+				 * carries over.  A mismatch involving a nondeterministic
+				 * collation, however, may mean the two equality relations
+				 * disagree, so the proof is unsound.
 				 */
+				idxcoll = ind->indexcollations[c];
+				clausecoll = exprInputCollation((Node *) rinfo->clause);
+
+				if (!IndexCollMatchesExprColl(idxcoll, clausecoll) &&
+					(!OidIsValid(clausecoll) ||
+					 !get_collation_isdeterministic(idxcoll) ||
+					 !get_collation_isdeterministic(clausecoll)))
+					continue;
 
 				/* OK, see if the condition operand matches the index key */
 				if (rinfo->outer_is_left)
diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out
index fce726029a2..db87062b4a3 100644
--- a/src/test/regress/expected/collate.icu.utf8.out
+++ b/src/test/regress/expected/collate.icu.utf8.out
@@ -1654,6 +1654,129 @@ SELECT string_to_array('ABCDEFGHI' COLLATE case_sensitive, NULL, 'b');
  {A,B,C,D,E,F,G,H,I}
 (1 row)
 
+--
+-- A unique index under one collation does not prove uniqueness under
+-- another, so the planner must not use such a proof for any optimization.
+--
+-- Ensure that we do not use inner-unique join execution
+EXPLAIN (VERBOSE, COSTS OFF)
+SELECT * FROM test1cs t1, test3cs t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+                              QUERY PLAN                              
+----------------------------------------------------------------------
+ Sort
+   Output: t1.x, t2.x
+   Sort Key: t1.x COLLATE case_sensitive, t2.x COLLATE case_sensitive
+   ->  Hash Join
+         Output: t1.x, t2.x
+         Hash Cond: ((t2.x)::text = (t1.x)::text)
+         ->  Seq Scan on collate_tests.test3cs t2
+               Output: t2.x
+         ->  Hash
+               Output: t1.x
+               ->  Seq Scan on collate_tests.test1cs t1
+                     Output: t1.x
+(12 rows)
+
+SELECT * FROM test1cs t1, test3cs t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+  x  |  x  
+-----+-----
+ abc | abc
+ abc | ABC
+ ABC | abc
+ ABC | ABC
+ def | def
+ ghi | ghi
+(6 rows)
+
+-- Ensure that left-join is not removed
+EXPLAIN (COSTS OFF)
+SELECT t1.* FROM test3cs t1
+       LEFT JOIN test3cs t2 ON t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1;
+                QUERY PLAN                
+------------------------------------------
+ Sort
+   Sort Key: t1.x COLLATE case_sensitive
+   ->  Hash Left Join
+         Hash Cond: (t1.x = (t2.x)::text)
+         ->  Seq Scan on test3cs t1
+         ->  Hash
+               ->  Seq Scan on test3cs t2
+(7 rows)
+
+SELECT t1.* FROM test3cs t1
+       LEFT JOIN test3cs t2 ON t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1;
+  x  
+-----
+ abc
+ abc
+ ABC
+ ABC
+ def
+ ghi
+(6 rows)
+
+-- Ensure that self-join is not removed
+EXPLAIN (COSTS OFF)
+SELECT * FROM test3cs t1, test3cs t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+                              QUERY PLAN                              
+----------------------------------------------------------------------
+ Sort
+   Sort Key: t1.x COLLATE case_sensitive, t2.x COLLATE case_sensitive
+   ->  Hash Join
+         Hash Cond: ((t1.x)::text = (t2.x)::text)
+         ->  Seq Scan on test3cs t1
+         ->  Hash
+               ->  Seq Scan on test3cs t2
+(7 rows)
+
+SELECT * FROM test3cs t1, test3cs t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+  x  |  x  
+-----+-----
+ abc | abc
+ abc | ABC
+ ABC | abc
+ ABC | ABC
+ def | def
+ ghi | ghi
+(6 rows)
+
+-- Ensure that semijoin is not reduced to innerjoin
+EXPLAIN (COSTS OFF)
+SELECT * FROM test3cs t1
+  WHERE EXISTS (SELECT 1 FROM test3cs t2 WHERE t1.x = t2.x COLLATE case_insensitive)
+ORDER BY 1;
+                    QUERY PLAN                    
+--------------------------------------------------
+ Sort
+   Sort Key: t1.x COLLATE case_sensitive
+   ->  Hash Semi Join
+         Hash Cond: ((t1.x)::text = (t2.x)::text)
+         ->  Seq Scan on test3cs t1
+         ->  Hash
+               ->  Seq Scan on test3cs t2
+(7 rows)
+
+SELECT * FROM test3cs t1
+  WHERE EXISTS (SELECT 1 FROM test3cs t2 WHERE t1.x = t2.x COLLATE case_insensitive)
+ORDER BY 1;
+  x  
+-----
+ abc
+ ABC
+ def
+ ghi
+(4 rows)
+
 CREATE TABLE test1ci (x text COLLATE case_insensitive);
 CREATE TABLE test2ci (x text COLLATE case_insensitive);
 CREATE TABLE test3ci (x text COLLATE case_insensitive);
diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql
index 0bf65a63535..95c16859afa 100644
--- a/src/test/regress/sql/collate.icu.utf8.sql
+++ b/src/test/regress/sql/collate.icu.utf8.sql
@@ -612,6 +612,51 @@ CREATE UNIQUE INDEX ON test3cs (x);  -- ok
 SELECT string_to_array('ABC,DEF,GHI' COLLATE case_sensitive, ',', 'abc');
 SELECT string_to_array('ABCDEFGHI' COLLATE case_sensitive, NULL, 'b');
 
+--
+-- A unique index under one collation does not prove uniqueness under
+-- another, so the planner must not use such a proof for any optimization.
+--
+
+-- Ensure that we do not use inner-unique join execution
+EXPLAIN (VERBOSE, COSTS OFF)
+SELECT * FROM test1cs t1, test3cs t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+
+SELECT * FROM test1cs t1, test3cs t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+
+-- Ensure that left-join is not removed
+EXPLAIN (COSTS OFF)
+SELECT t1.* FROM test3cs t1
+       LEFT JOIN test3cs t2 ON t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1;
+
+SELECT t1.* FROM test3cs t1
+       LEFT JOIN test3cs t2 ON t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1;
+
+-- Ensure that self-join is not removed
+EXPLAIN (COSTS OFF)
+SELECT * FROM test3cs t1, test3cs t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+
+SELECT * FROM test3cs t1, test3cs t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+
+-- Ensure that semijoin is not reduced to innerjoin
+EXPLAIN (COSTS OFF)
+SELECT * FROM test3cs t1
+  WHERE EXISTS (SELECT 1 FROM test3cs t2 WHERE t1.x = t2.x COLLATE case_insensitive)
+ORDER BY 1;
+
+SELECT * FROM test3cs t1
+  WHERE EXISTS (SELECT 1 FROM test3cs t2 WHERE t1.x = t2.x COLLATE case_insensitive)
+ORDER BY 1;
+
 CREATE TABLE test1ci (x text COLLATE case_insensitive);
 CREATE TABLE test2ci (x text COLLATE case_insensitive);
 CREATE TABLE test3ci (x text COLLATE case_insensitive);
-- 
2.39.5 (Apple Git-154)

