Use CASEFOLD() internally rather than LOWER()

Jeff Davis Mon, 12 Jan 2026 10:22:45 -0800

There are a number of internal callers of LOWER(), and conceptually
those should all be using CASEFOLD(). Patches attached.


I'm not sure if we want the citext patch -- it would require REINDEX of
all existing citext indexes after upgrade, and there's already a
documented tip ("Consider using nondeterministic collations...), so
perhaps it's a legacy extension anyway.

It would be nice to make the tsearch change this release, as there are
already changes that could require a reindex.

I didn't change pg_trgm yet, because I think that we have to change the
regex machinery to be aware of more than two case variants first (and
potentially increasing string lengths, too).

Regards,
        Jeff Davis

From 94b77294ac95901f07f1e2a571fad483a7409639 Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Mon, 12 Jan 2026 08:58:43 -0800
Subject: [PATCH v1 1/4] ILIKE: use CASEFOLD() rather than LOWER().

For non-C locales, we casefold the entire string before performing
pattern matching with ILIKE. Previously, casefolding was done with the
LOWER() function; now that a proper CASEFOLD() function exists, use
that instead.

CASEFOLD() is better than LOWER() for case-insensitive comparisons in
builtin and ICU locales. For instance, CASEFOLD() transforms a GREEK
SMALL LETTER FINAL SIGMA (U+03C2) into GREEK SMALL LETTER SIGMA
(U+03C3) so that the two characters match in a case-insensitive
comparison; whereas LOWER() does not transform it because it's already
lowercase, so they will not match.
---
 src/backend/utils/adt/like.c               |  8 +++-----
 src/test/regress/expected/collate.utf8.out | 24 ++++++++++++++++++++++
 src/test/regress/sql/collate.utf8.sql      |  6 ++++++
 3 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/src/backend/utils/adt/like.c b/src/backend/utils/adt/like.c
index 2143d8658e8..b04c6cc6661 100644
--- a/src/backend/utils/adt/like.c
+++ b/src/backend/utils/adt/like.c
@@ -190,10 +190,8 @@ Generic_Text_IC_like(text *str, text *pat, Oid collation)
 				 errmsg("nondeterministic collations are not supported for ILIKE")));
 
 	/*
-	 * For efficiency reasons, in the C locale we don't call lower() on the
+	 * For efficiency reasons, in the C locale we don't call casefold() on the
 	 * pattern and text, but instead lowercase each character lazily.
-	 *
-	 * XXX: use casefolding instead?
 	 */
 
 	if (locale->ctype_is_c)
@@ -206,11 +204,11 @@ Generic_Text_IC_like(text *str, text *pat, Oid collation)
 	}
 	else
 	{
-		pat = DatumGetTextPP(DirectFunctionCall1Coll(lower, collation,
+		pat = DatumGetTextPP(DirectFunctionCall1Coll(casefold, collation,
 													 PointerGetDatum(pat)));
 		p = VARDATA_ANY(pat);
 		plen = VARSIZE_ANY_EXHDR(pat);
-		str = DatumGetTextPP(DirectFunctionCall1Coll(lower, collation,
+		str = DatumGetTextPP(DirectFunctionCall1Coll(casefold, collation,
 													 PointerGetDatum(str)));
 		s = VARDATA_ANY(str);
 		slen = VARSIZE_ANY_EXHDR(str);
diff --git a/src/test/regress/expected/collate.utf8.out b/src/test/regress/expected/collate.utf8.out
index 0c3ab5c89b2..3d4292611e2 100644
--- a/src/test/regress/expected/collate.utf8.out
+++ b/src/test/regress/expected/collate.utf8.out
@@ -169,6 +169,18 @@ select casefold('AbCd 123 #$% ıiIİ ẞ ß Ǆǅǆ Σσς' collate PG_C_UTF8);
  abcd 123 #$% ıiiİ ß ß ǆǆǆ σσσ
 (1 row)
 
+SELECT U&'ς' ILIKE U&'σ' COLLATE PG_C_UTF8;
+ ?column? 
+----------
+ t
+(1 row)
+
+SELECT U&'straße' ILIKE U&'STRASSE' COLLATE PG_C_UTF8;
+ ?column? 
+----------
+ f
+(1 row)
+
 --
 -- Test PG_UNICODE_FAST
 --
@@ -338,3 +350,15 @@ select casefold('AbCd 123 #$% ıiIİ ẞ ß Ǆǅǆ Σσς' collate PG_UNICODE_FA
  abcd 123 #$% ıiii̇ ss ss ǆǆǆ σσσ
 (1 row)
 
+SELECT U&'ς' ILIKE U&'σ' COLLATE PG_UNICODE_FAST;
+ ?column? 
+----------
+ t
+(1 row)
+
+SELECT U&'straße' ILIKE U&'STRASSE' COLLATE PG_UNICODE_FAST;
+ ?column? 
+----------
+ t
+(1 row)
+
diff --git a/src/test/regress/sql/collate.utf8.sql b/src/test/regress/sql/collate.utf8.sql
index d6d14220ab3..4a5e519cf07 100644
--- a/src/test/regress/sql/collate.utf8.sql
+++ b/src/test/regress/sql/collate.utf8.sql
@@ -85,6 +85,9 @@ SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_C_UTF8; -- same as above with cases reversed
 -- case folding
 select casefold('AbCd 123 #$% ıiIİ ẞ ß Ǆǅǆ Σσς' collate PG_C_UTF8);
 
+SELECT U&'ς' ILIKE U&'σ' COLLATE PG_C_UTF8;
+SELECT U&'straße' ILIKE U&'STRASSE' COLLATE PG_C_UTF8;
+
 --
 -- Test PG_UNICODE_FAST
 --
@@ -148,3 +151,6 @@ SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_UNICODE_FAST; -- same as above with cases re
 
 -- case folding
 select casefold('AbCd 123 #$% ıiIİ ẞ ß Ǆǅǆ Σσς' collate PG_UNICODE_FAST);
+
+SELECT U&'ς' ILIKE U&'σ' COLLATE PG_UNICODE_FAST;
+SELECT U&'straße' ILIKE U&'STRASSE' COLLATE PG_UNICODE_FAST;
-- 
2.43.0

From 8526ae1884e7d36ab9d0da06a1fd16b9ea0f7206 Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Mon, 12 Jan 2026 09:04:03 -0800
Subject: [PATCH v1 2/4] citext: use CASEFOLD() rather than LOWER().

CASEFOLD() is better for case-insensitive matching in edge cases.

Existing citext indexes require REINDEX.
---
 contrib/citext/citext.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/contrib/citext/citext.c b/contrib/citext/citext.c
index a15ce5db829..02cad8c4ac3 100644
--- a/contrib/citext/citext.c
+++ b/contrib/citext/citext.c
@@ -43,15 +43,15 @@ citextcmp(text *left, text *right, Oid collid)
 	int32		result;
 
 	/*
-	 * We must do our str_tolower calls with DEFAULT_COLLATION_OID, not the
+	 * We must do our str_casefold calls with DEFAULT_COLLATION_OID, not the
 	 * input collation as you might expect.  This is so that the behavior of
 	 * citext's equality and hashing functions is not collation-dependent.  We
 	 * should change this once the core infrastructure is able to cope with
 	 * collation-dependent equality and hashing functions.
 	 */
 
-	lcstr = str_tolower(VARDATA_ANY(left), VARSIZE_ANY_EXHDR(left), DEFAULT_COLLATION_OID);
-	rcstr = str_tolower(VARDATA_ANY(right), VARSIZE_ANY_EXHDR(right), DEFAULT_COLLATION_OID);
+	lcstr = str_casefold(VARDATA_ANY(left), VARSIZE_ANY_EXHDR(left), DEFAULT_COLLATION_OID);
+	rcstr = str_casefold(VARDATA_ANY(right), VARSIZE_ANY_EXHDR(right), DEFAULT_COLLATION_OID);
 
 	result = varstr_cmp(lcstr, strlen(lcstr),
 						rcstr, strlen(rcstr),
@@ -77,8 +77,8 @@ internal_citext_pattern_cmp(text *left, text *right, Oid collid)
 				rlen;
 	int32		result;
 
-	lcstr = str_tolower(VARDATA_ANY(left), VARSIZE_ANY_EXHDR(left), DEFAULT_COLLATION_OID);
-	rcstr = str_tolower(VARDATA_ANY(right), VARSIZE_ANY_EXHDR(right), DEFAULT_COLLATION_OID);
+	lcstr = str_casefold(VARDATA_ANY(left), VARSIZE_ANY_EXHDR(left), DEFAULT_COLLATION_OID);
+	rcstr = str_casefold(VARDATA_ANY(right), VARSIZE_ANY_EXHDR(right), DEFAULT_COLLATION_OID);
 
 	llen = strlen(lcstr);
 	rlen = strlen(rcstr);
@@ -147,7 +147,7 @@ citext_hash(PG_FUNCTION_ARGS)
 	char	   *str;
 	Datum		result;
 
-	str = str_tolower(VARDATA_ANY(txt), VARSIZE_ANY_EXHDR(txt), DEFAULT_COLLATION_OID);
+	str = str_casefold(VARDATA_ANY(txt), VARSIZE_ANY_EXHDR(txt), DEFAULT_COLLATION_OID);
 	result = hash_any((unsigned char *) str, strlen(str));
 	pfree(str);
 
@@ -167,7 +167,7 @@ citext_hash_extended(PG_FUNCTION_ARGS)
 	char	   *str;
 	Datum		result;
 
-	str = str_tolower(VARDATA_ANY(txt), VARSIZE_ANY_EXHDR(txt), DEFAULT_COLLATION_OID);
+	str = str_casefold(VARDATA_ANY(txt), VARSIZE_ANY_EXHDR(txt), DEFAULT_COLLATION_OID);
 	result = hash_any_extended((unsigned char *) str, strlen(str), seed);
 	pfree(str);
 
@@ -196,8 +196,8 @@ citext_eq(PG_FUNCTION_ARGS)
 
 	/* We can't compare lengths in advance of downcasing ... */
 
-	lcstr = str_tolower(VARDATA_ANY(left), VARSIZE_ANY_EXHDR(left), DEFAULT_COLLATION_OID);
-	rcstr = str_tolower(VARDATA_ANY(right), VARSIZE_ANY_EXHDR(right), DEFAULT_COLLATION_OID);
+	lcstr = str_casefold(VARDATA_ANY(left), VARSIZE_ANY_EXHDR(left), DEFAULT_COLLATION_OID);
+	rcstr = str_casefold(VARDATA_ANY(right), VARSIZE_ANY_EXHDR(right), DEFAULT_COLLATION_OID);
 
 	/*
 	 * Since we only care about equality or not-equality, we can avoid all the
@@ -226,8 +226,8 @@ citext_ne(PG_FUNCTION_ARGS)
 
 	/* We can't compare lengths in advance of downcasing ... */
 
-	lcstr = str_tolower(VARDATA_ANY(left), VARSIZE_ANY_EXHDR(left), DEFAULT_COLLATION_OID);
-	rcstr = str_tolower(VARDATA_ANY(right), VARSIZE_ANY_EXHDR(right), DEFAULT_COLLATION_OID);
+	lcstr = str_casefold(VARDATA_ANY(left), VARSIZE_ANY_EXHDR(left), DEFAULT_COLLATION_OID);
+	rcstr = str_casefold(VARDATA_ANY(right), VARSIZE_ANY_EXHDR(right), DEFAULT_COLLATION_OID);
 
 	/*
 	 * Since we only care about equality or not-equality, we can avoid all the
-- 
2.43.0

From 19c2d1d413d7842e6c90c237fab33fe0f93caf82 Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Mon, 12 Jan 2026 09:11:56 -0800
Subject: [PATCH v1 3/4] dict_xsyn: use CASEFOLD() rather than LOWER().

CASEFOLD is better for case-insensitive matching in edge cases.
---
 contrib/dict_xsyn/dict_xsyn.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/contrib/dict_xsyn/dict_xsyn.c b/contrib/dict_xsyn/dict_xsyn.c
index 5c4917ce1fc..613527fd392 100644
--- a/contrib/dict_xsyn/dict_xsyn.c
+++ b/contrib/dict_xsyn/dict_xsyn.c
@@ -98,7 +98,7 @@ read_dictionary(DictSyn *d, const char *filename)
 		if (*line == '\0')
 			continue;
 
-		value = str_tolower(line, strlen(line), DEFAULT_COLLATION_OID);
+		value = str_casefold(line, strlen(line), DEFAULT_COLLATION_OID);
 		pfree(line);
 
 		pos = value;
@@ -215,7 +215,7 @@ dxsyn_lexize(PG_FUNCTION_ARGS)
 	{
 		char	   *temp = pnstrdup(in, length);
 
-		word.key = str_tolower(temp, length, DEFAULT_COLLATION_OID);
+		word.key = str_casefold(temp, length, DEFAULT_COLLATION_OID);
 		pfree(temp);
 		word.value = NULL;
 	}
-- 
2.43.0

From 7dfcc58e48baf9aa9a8fa6f41c0c94b1d6d16bae Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Mon, 12 Jan 2026 09:24:16 -0800
Subject: [PATCH v1 4/4] tsearch: use CASEFOLD() rather than LOWER().

CASEFOLD() is better for case-insensitive matching in edge cases.
---
 src/backend/snowball/dict_snowball.c | 4 ++--
 src/backend/tsearch/dict_ispell.c    | 4 ++--
 src/backend/tsearch/dict_simple.c    | 4 ++--
 src/backend/tsearch/dict_synonym.c   | 6 +++---
 src/backend/tsearch/spell.c          | 6 +++---
 5 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/backend/snowball/dict_snowball.c b/src/backend/snowball/dict_snowball.c
index 182bd156995..cb2d3061953 100644
--- a/src/backend/snowball/dict_snowball.c
+++ b/src/backend/snowball/dict_snowball.c
@@ -251,7 +251,7 @@ dsnowball_init(PG_FUNCTION_ARGS)
 				ereport(ERROR,
 						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 						 errmsg("multiple StopWords parameters")));
-			readstoplist(defGetString(defel), &d->stoplist, str_tolower);
+			readstoplist(defGetString(defel), &d->stoplist, str_casefold);
 			stoploaded = true;
 		}
 		else if (strcmp(defel->defname, "language") == 0)
@@ -287,7 +287,7 @@ dsnowball_lexize(PG_FUNCTION_ARGS)
 	DictSnowball *d = (DictSnowball *) PG_GETARG_POINTER(0);
 	char	   *in = (char *) PG_GETARG_POINTER(1);
 	int32		len = PG_GETARG_INT32(2);
-	char	   *txt = str_tolower(in, len, DEFAULT_COLLATION_OID);
+	char	   *txt = str_casefold(in, len, DEFAULT_COLLATION_OID);
 	TSLexeme   *res = palloc0_array(TSLexeme, 2);
 
 	/*
diff --git a/src/backend/tsearch/dict_ispell.c b/src/backend/tsearch/dict_ispell.c
index ad5c26ebccb..bdcfc836e80 100644
--- a/src/backend/tsearch/dict_ispell.c
+++ b/src/backend/tsearch/dict_ispell.c
@@ -79,7 +79,7 @@ dispell_init(PG_FUNCTION_ARGS)
 				ereport(ERROR,
 						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 						 errmsg("multiple StopWords parameters")));
-			readstoplist(defGetString(defel), &(d->stoplist), str_tolower);
+			readstoplist(defGetString(defel), &(d->stoplist), str_casefold);
 			stoploaded = true;
 		}
 		else
@@ -128,7 +128,7 @@ dispell_lexize(PG_FUNCTION_ARGS)
 	if (len <= 0)
 		PG_RETURN_POINTER(NULL);
 
-	txt = str_tolower(in, len, DEFAULT_COLLATION_OID);
+	txt = str_casefold(in, len, DEFAULT_COLLATION_OID);
 	res = NINormalizeWord(&(d->obj), txt);
 
 	if (res == NULL)
diff --git a/src/backend/tsearch/dict_simple.c b/src/backend/tsearch/dict_simple.c
index 44d945b2be8..52df5251e20 100644
--- a/src/backend/tsearch/dict_simple.c
+++ b/src/backend/tsearch/dict_simple.c
@@ -48,7 +48,7 @@ dsimple_init(PG_FUNCTION_ARGS)
 				ereport(ERROR,
 						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 						 errmsg("multiple StopWords parameters")));
-			readstoplist(defGetString(defel), &d->stoplist, str_tolower);
+			readstoplist(defGetString(defel), &d->stoplist, str_casefold);
 			stoploaded = true;
 		}
 		else if (strcmp(defel->defname, "accept") == 0)
@@ -81,7 +81,7 @@ dsimple_lexize(PG_FUNCTION_ARGS)
 	char	   *txt;
 	TSLexeme   *res;
 
-	txt = str_tolower(in, len, DEFAULT_COLLATION_OID);
+	txt = str_casefold(in, len, DEFAULT_COLLATION_OID);
 
 	if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
 	{
diff --git a/src/backend/tsearch/dict_synonym.c b/src/backend/tsearch/dict_synonym.c
index 6dee28ae525..b5ff8c23cab 100644
--- a/src/backend/tsearch/dict_synonym.c
+++ b/src/backend/tsearch/dict_synonym.c
@@ -185,8 +185,8 @@ dsynonym_init(PG_FUNCTION_ARGS)
 		}
 		else
 		{
-			d->syn[cur].in = str_tolower(starti, strlen(starti), DEFAULT_COLLATION_OID);
-			d->syn[cur].out = str_tolower(starto, strlen(starto), DEFAULT_COLLATION_OID);
+			d->syn[cur].in = str_casefold(starti, strlen(starti), DEFAULT_COLLATION_OID);
+			d->syn[cur].out = str_casefold(starto, strlen(starto), DEFAULT_COLLATION_OID);
 		}
 
 		d->syn[cur].outlen = strlen(starto);
@@ -226,7 +226,7 @@ dsynonym_lexize(PG_FUNCTION_ARGS)
 	if (d->case_sensitive)
 		key.in = pnstrdup(in, len);
 	else
-		key.in = str_tolower(in, len, DEFAULT_COLLATION_OID);
+		key.in = str_casefold(in, len, DEFAULT_COLLATION_OID);
 
 	key.out = NULL;
 
diff --git a/src/backend/tsearch/spell.c b/src/backend/tsearch/spell.c
index e3436dbddd2..e946b88f38b 100644
--- a/src/backend/tsearch/spell.c
+++ b/src/backend/tsearch/spell.c
@@ -170,7 +170,7 @@ cpstrdup(IspellDict *Conf, const char *str)
 
 
 /*
- * Apply str_tolower(), producing a temporary result (in the buildCxt).
+ * Apply str_casefold(), producing a temporary result (in the buildCxt).
  */
 static char *
 lowerstr_ctx(IspellDict *Conf, const char *src)
@@ -179,7 +179,7 @@ lowerstr_ctx(IspellDict *Conf, const char *src)
 	char	   *dst;
 
 	saveCtx = MemoryContextSwitchTo(Conf->buildCxt);
-	dst = str_tolower(src, strlen(src), DEFAULT_COLLATION_OID);
+	dst = str_casefold(src, strlen(src), DEFAULT_COLLATION_OID);
 	MemoryContextSwitchTo(saveCtx);
 
 	return dst;
@@ -1453,7 +1453,7 @@ NIImportAffixes(IspellDict *Conf, const char *filename)
 
 	while ((recoded = tsearch_readline(&trst)) != NULL)
 	{
-		pstr = str_tolower(recoded, strlen(recoded), DEFAULT_COLLATION_OID);
+		pstr = str_casefold(recoded, strlen(recoded), DEFAULT_COLLATION_OID);
 
 		/* Skip comments and empty lines */
 		if (*pstr == '#' || *pstr == '\n')
-- 
2.43.0

Use CASEFOLD() internally rather than LOWER()

Reply via email to