On Tue, 2025-10-07 at 15:49 -0700, Jeff Davis wrote:
> This patch series allows tsearch to use the database default locale
> for
> parsing. If the database collation is libc, there's no change.
I committed a couple of the refactoring patches and rebased. v3
attached.
v3-0003 eliminates the "wstr" logic and uses only "pgwstr". I
was a bit confused why both were needed, as the purpose of pg_wchar is
to abstract away the problems with wchar_t. Perhaps it's historical, or
perhaps I missed something.
Regarding the risk of behavior changes: this affects parsing the
values, but not the interpretation of values after parsing, so the risk
of index inconsistencies seems low. There's a risk that a document parsed
in the old version would be parsed differently in the new version,
though. Overall, it seems comparable to the risk of fb1a18810f.
Regards,
Jeff Davis
From be5b75236445183b0c0bfbb14a79b154d8c10d13 Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Mon, 6 Oct 2025 13:01:25 -0700
Subject: [PATCH v3 1/4] Add pg_iswxdigit(), useful for tsearch.
---
src/backend/utils/adt/pg_locale.c | 12 ++++++++++++
src/backend/utils/adt/pg_locale_builtin.c | 7 +++++++
src/backend/utils/adt/pg_locale_icu.c | 7 +++++++
src/backend/utils/adt/pg_locale_libc.c | 23 +++++++++++++++++++++++
src/include/utils/pg_locale.h | 2 ++
5 files changed, 51 insertions(+)
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 50b25445f7a..3860ada1905 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -1493,6 +1493,18 @@ pg_iswspace(pg_wchar wc, pg_locale_t locale)
return locale->ctype->wc_isspace(wc, locale);
}
+bool
+pg_iswxdigit(pg_wchar wc, pg_locale_t locale)
+{
+ if (locale->ctype == NULL)
+ return (wc <= (pg_wchar) 127 &&
+ ((pg_char_properties[wc] & PG_ISDIGIT) ||
+ ((wc >= 'A' && wc <= 'F') ||
+ (wc >= 'a' && wc <= 'f'))));
+ else
+ return locale->ctype->wc_isxdigit(wc, locale);
+}
+
pg_wchar
pg_towupper(pg_wchar wc, pg_locale_t locale)
{
diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c
index 526ab3c6711..3dc611b50e1 100644
--- a/src/backend/utils/adt/pg_locale_builtin.c
+++ b/src/backend/utils/adt/pg_locale_builtin.c
@@ -163,6 +163,12 @@ wc_isspace_builtin(pg_wchar wc, pg_locale_t locale)
return pg_u_isspace(wc);
}
+static bool
+wc_isxdigit_builtin(pg_wchar wc, pg_locale_t locale)
+{
+ return pg_u_isxdigit(wc, !locale->builtin.casemap_full);
+}
+
static bool
char_is_cased_builtin(char ch, pg_locale_t locale)
{
@@ -196,6 +202,7 @@ static const struct ctype_methods ctype_methods_builtin = {
.wc_isprint = wc_isprint_builtin,
.wc_ispunct = wc_ispunct_builtin,
.wc_isspace = wc_isspace_builtin,
+ .wc_isxdigit = wc_isxdigit_builtin,
.char_is_cased = char_is_cased_builtin,
.wc_tolower = wc_tolower_builtin,
.wc_toupper = wc_toupper_builtin,
diff --git a/src/backend/utils/adt/pg_locale_icu.c b/src/backend/utils/adt/pg_locale_icu.c
index 9f0b4eead73..05bad202669 100644
--- a/src/backend/utils/adt/pg_locale_icu.c
+++ b/src/backend/utils/adt/pg_locale_icu.c
@@ -212,6 +212,12 @@ wc_isspace_icu(pg_wchar wc, pg_locale_t locale)
return u_isspace(wc);
}
+static bool
+wc_isxdigit_icu(pg_wchar wc, pg_locale_t locale)
+{
+ return u_isxdigit(wc);
+}
+
static const struct ctype_methods ctype_methods_icu = {
.strlower = strlower_icu,
.strtitle = strtitle_icu,
@@ -226,6 +232,7 @@ static const struct ctype_methods ctype_methods_icu = {
.wc_isprint = wc_isprint_icu,
.wc_ispunct = wc_ispunct_icu,
.wc_isspace = wc_isspace_icu,
+ .wc_isxdigit = wc_isxdigit_icu,
.char_is_cased = char_is_cased_icu,
.wc_toupper = toupper_icu,
.wc_tolower = tolower_icu,
diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c
index f56b5dbdd37..34865ccf00e 100644
--- a/src/backend/utils/adt/pg_locale_libc.c
+++ b/src/backend/utils/adt/pg_locale_libc.c
@@ -172,6 +172,16 @@ wc_isspace_libc_sb(pg_wchar wc, pg_locale_t locale)
return isspace_l((unsigned char) wc, locale->lt);
}
+static bool
+wc_isxdigit_libc_sb(pg_wchar wc, pg_locale_t locale)
+{
+#ifndef WIN32
+ return isxdigit_l((unsigned char) wc, locale->lt);
+#else
+ return _isxdigit_l((unsigned char) wc, locale->lt);
+#endif
+}
+
static bool
wc_isdigit_libc_mb(pg_wchar wc, pg_locale_t locale)
{
@@ -226,6 +236,16 @@ wc_isspace_libc_mb(pg_wchar wc, pg_locale_t locale)
return iswspace_l((wint_t) wc, locale->lt);
}
+static bool
+wc_isxdigit_libc_mb(pg_wchar wc, pg_locale_t locale)
+{
+#ifndef WIN32
+ return iswxdigit_l((wint_t) wc, locale->lt);
+#else
+ return _iswxdigit_l((wint_t) wc, locale->lt);
+#endif
+}
+
static char
char_tolower_libc(unsigned char ch, pg_locale_t locale)
{
@@ -313,6 +333,7 @@ static const struct ctype_methods ctype_methods_libc_sb = {
.wc_isprint = wc_isprint_libc_sb,
.wc_ispunct = wc_ispunct_libc_sb,
.wc_isspace = wc_isspace_libc_sb,
+ .wc_isxdigit = wc_isxdigit_libc_sb,
.char_is_cased = char_is_cased_libc,
.char_tolower = char_tolower_libc,
.wc_toupper = toupper_libc_sb,
@@ -337,6 +358,7 @@ static const struct ctype_methods ctype_methods_libc_other_mb = {
.wc_isprint = wc_isprint_libc_sb,
.wc_ispunct = wc_ispunct_libc_sb,
.wc_isspace = wc_isspace_libc_sb,
+ .wc_isxdigit = wc_isxdigit_libc_sb,
.char_is_cased = char_is_cased_libc,
.char_tolower = char_tolower_libc,
.wc_toupper = toupper_libc_sb,
@@ -357,6 +379,7 @@ static const struct ctype_methods ctype_methods_libc_utf8 = {
.wc_isprint = wc_isprint_libc_mb,
.wc_ispunct = wc_ispunct_libc_mb,
.wc_isspace = wc_isspace_libc_mb,
+ .wc_isxdigit = wc_isxdigit_libc_mb,
.char_is_cased = char_is_cased_libc,
.char_tolower = char_tolower_libc,
.wc_toupper = toupper_libc_mb,
diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h
index 009f5334a87..29c21d4413c 100644
--- a/src/include/utils/pg_locale.h
+++ b/src/include/utils/pg_locale.h
@@ -110,6 +110,7 @@ struct ctype_methods
bool (*wc_isprint) (pg_wchar wc, pg_locale_t locale);
bool (*wc_ispunct) (pg_wchar wc, pg_locale_t locale);
bool (*wc_isspace) (pg_wchar wc, pg_locale_t locale);
+ bool (*wc_isxdigit) (pg_wchar wc, pg_locale_t locale);
pg_wchar (*wc_toupper) (pg_wchar wc, pg_locale_t locale);
pg_wchar (*wc_tolower) (pg_wchar wc, pg_locale_t locale);
@@ -217,6 +218,7 @@ extern bool pg_iswgraph(pg_wchar wc, pg_locale_t locale);
extern bool pg_iswprint(pg_wchar wc, pg_locale_t locale);
extern bool pg_iswpunct(pg_wchar wc, pg_locale_t locale);
extern bool pg_iswspace(pg_wchar wc, pg_locale_t locale);
+extern bool pg_iswxdigit(pg_wchar wc, pg_locale_t locale);
extern pg_wchar pg_towupper(pg_wchar wc, pg_locale_t locale);
extern pg_wchar pg_towlower(pg_wchar wc, pg_locale_t locale);
--
2.43.0
From af486af2dd836b8783089bc1f55db8eecd1fcd72 Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Mon, 6 Oct 2025 14:24:59 -0700
Subject: [PATCH v3 2/4] Add pg_database_locale() to retrieve database default
locale.
---
src/backend/utils/adt/pg_locale.c | 9 +++++++++
src/include/utils/pg_locale.h | 1 +
2 files changed, 10 insertions(+)
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 3860ada1905..00d1e031472 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -1162,6 +1162,15 @@ init_database_collation(void)
default_locale = result;
}
+/*
+ * Get database default locale.
+ */
+pg_locale_t
+pg_database_locale(void)
+{
+ return pg_newlocale_from_collation(DEFAULT_COLLATION_OID);
+}
+
/*
* Create a pg_locale_t from a collation OID. Results are cached for the
* lifetime of the backend. Thus, do not free the result with freelocale().
diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h
index 29c21d4413c..86c48c34f26 100644
--- a/src/include/utils/pg_locale.h
+++ b/src/include/utils/pg_locale.h
@@ -176,6 +176,7 @@ struct pg_locale_struct
};
extern void init_database_collation(void);
+extern pg_locale_t pg_database_locale(void);
extern pg_locale_t pg_newlocale_from_collation(Oid collid);
extern char *get_collation_actual_version(char collprovider, const char *collcollate);
--
2.43.0
From 2c362d560257ec283b9dd358e03f525e8dd6655b Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Mon, 6 Oct 2025 13:05:17 -0700
Subject: [PATCH v3 3/4] tsearch: use database default collation for parsing.
Previously, tsearch used the database's CTYPE setting, which only
matches the database default collation if the locale provider is libc.
Note that tsearch types (tsvector and tsquery) are not collatable
types. The locale affects parsing the original text, which is a lossy
process, so a COLLATE clause on the already-parsed value would not
make sense.
---
src/backend/tsearch/ts_locale.c | 40 ++++++-----------
src/backend/tsearch/wparser_def.c | 71 ++++++-------------------------
2 files changed, 27 insertions(+), 84 deletions(-)
diff --git a/src/backend/tsearch/ts_locale.c b/src/backend/tsearch/ts_locale.c
index 4801fe90089..4422f042d12 100644
--- a/src/backend/tsearch/ts_locale.c
+++ b/src/backend/tsearch/ts_locale.c
@@ -20,45 +20,33 @@
static void tsearch_readline_callback(void *arg);
-/*
- * The reason these functions use a 3-wchar_t output buffer, not 2 as you
- * might expect, is that on Windows "wchar_t" is 16 bits and what we'll be
- * getting from char2wchar() is UTF16 not UTF32. A single input character
- * may therefore produce a surrogate pair rather than just one wchar_t;
- * we also need room for a trailing null. When we do get a surrogate pair,
- * we pass just the first code to iswdigit() etc, so that these functions will
- * always return false for characters outside the Basic Multilingual Plane.
- */
-#define WC_BUF_LEN 3
+/* space for a single character plus a trailing NUL */
+#define WC_BUF_LEN 2
int
t_isalpha(const char *ptr)
{
- int clen = pg_mblen(ptr);
- wchar_t character[WC_BUF_LEN];
- locale_t mylocale = 0; /* TODO */
+ pg_wchar wstr[WC_BUF_LEN];
+ int wlen pg_attribute_unused();
- if (clen == 1 || database_ctype_is_c)
- return isalpha(TOUCHAR(ptr));
+ wlen = pg_mb2wchar_with_len(ptr, wstr, pg_mblen(ptr));
+ Assert(wlen <= 1);
- char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
-
- return iswalpha((wint_t) character[0]);
+ /* pass single character, or NUL if empty */
+ return pg_iswalpha(wstr[0], pg_database_locale());
}
int
t_isalnum(const char *ptr)
{
- int clen = pg_mblen(ptr);
- wchar_t character[WC_BUF_LEN];
- locale_t mylocale = 0; /* TODO */
-
- if (clen == 1 || database_ctype_is_c)
- return isalnum(TOUCHAR(ptr));
+ pg_wchar wstr[WC_BUF_LEN];
+ int wlen pg_attribute_unused();
- char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
+ wlen = pg_mb2wchar_with_len(ptr, wstr, pg_mblen(ptr));
+ Assert(wlen <= 1);
- return iswalnum((wint_t) character[0]);
+ /* pass single character, or NUL if empty */
+ return pg_iswalnum(wstr[0], pg_database_locale());
}
diff --git a/src/backend/tsearch/wparser_def.c b/src/backend/tsearch/wparser_def.c
index e2dd3da3aa3..251a2ae6563 100644
--- a/src/backend/tsearch/wparser_def.c
+++ b/src/backend/tsearch/wparser_def.c
@@ -243,9 +243,7 @@ typedef struct TParser
/* string and position information */
char *str; /* multibyte string */
int lenstr; /* length of mbstring */
- wchar_t *wstr; /* wide character string */
pg_wchar *pgwstr; /* wide character string for C-locale */
- bool usewide;
/* State of parse */
int charmaxlen;
@@ -293,33 +291,8 @@ TParserInit(char *str, int len)
prs->charmaxlen = pg_database_encoding_max_length();
prs->str = str;
prs->lenstr = len;
-
- /*
- * Use wide char code only when max encoding length > 1.
- */
- if (prs->charmaxlen > 1)
- {
- locale_t mylocale = 0; /* TODO */
-
- prs->usewide = true;
- if (database_ctype_is_c)
- {
- /*
- * char2wchar doesn't work for C-locale and sizeof(pg_wchar) could
- * be different from sizeof(wchar_t)
- */
- prs->pgwstr = (pg_wchar *) palloc(sizeof(pg_wchar) * (prs->lenstr + 1));
- pg_mb2wchar_with_len(prs->str, prs->pgwstr, prs->lenstr);
- }
- else
- {
- prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1));
- char2wchar(prs->wstr, prs->lenstr + 1, prs->str, prs->lenstr,
- mylocale);
- }
- }
- else
- prs->usewide = false;
+ prs->pgwstr = (pg_wchar *) palloc(sizeof(pg_wchar) * (prs->lenstr + 1));
+ pg_mb2wchar_with_len(prs->str, prs->pgwstr, prs->lenstr);
prs->state = newTParserPosition(NULL);
prs->state->state = TPS_Base;
@@ -350,12 +323,9 @@ TParserCopyInit(const TParser *orig)
prs->charmaxlen = orig->charmaxlen;
prs->str = orig->str + orig->state->posbyte;
prs->lenstr = orig->lenstr - orig->state->posbyte;
- prs->usewide = orig->usewide;
if (orig->pgwstr)
prs->pgwstr = orig->pgwstr + orig->state->poschar;
- if (orig->wstr)
- prs->wstr = orig->wstr + orig->state->poschar;
prs->state = newTParserPosition(NULL);
prs->state->state = TPS_Base;
@@ -379,8 +349,6 @@ TParserClose(TParser *prs)
prs->state = ptr;
}
- if (prs->wstr)
- pfree(prs->wstr);
if (prs->pgwstr)
pfree(prs->pgwstr);
@@ -412,13 +380,9 @@ TParserCopyClose(TParser *prs)
/*
- * Character-type support functions, equivalent to is* macros, but
- * working with any possible encodings and locales. Notes:
- * - with multibyte encoding and C-locale isw* function may fail
- * or give wrong result.
- * - multibyte encoding and C-locale often are used for
- * Asian languages.
- * - if locale is C then we use pgwstr instead of wstr.
+ * Character-type support functions using the database default locale. If the
+ * locale is C, and the input character is non-ascii, the value to be returned
+ * is determined by the 'nonascii' macro argument.
*/
#define p_iswhat(type, nonascii) \
@@ -426,19 +390,13 @@ TParserCopyClose(TParser *prs)
static int \
p_is##type(TParser *prs) \
{ \
+ pg_locale_t locale = pg_database_locale(); \
+ pg_wchar wc; \
Assert(prs->state); \
- if (prs->usewide) \
- { \
- if (prs->pgwstr) \
- { \
- unsigned int c = *(prs->pgwstr + prs->state->poschar); \
- if (c > 0x7f) \
- return nonascii; \
- return is##type(c); \
- } \
- return isw##type(*(prs->wstr + prs->state->poschar)); \
- } \
- return is##type(*(unsigned char *) (prs->str + prs->state->posbyte)); \
+ wc = prs->pgwstr[prs->state->poschar]; \
+ if (prs->charmaxlen > 1 && locale->ctype_is_c && wc > 0x7f) \
+ return nonascii; \
+ return pg_isw##type(wc, pg_database_locale()); \
} \
\
static int \
@@ -703,7 +661,7 @@ p_isspecial(TParser *prs)
* Check that only in utf encoding, because other encodings aren't
* supported by postgres or even exists.
*/
- if (GetDatabaseEncoding() == PG_UTF8 && prs->usewide)
+ if (GetDatabaseEncoding() == PG_UTF8)
{
static const pg_wchar strange_letter[] = {
/*
@@ -944,10 +902,7 @@ p_isspecial(TParser *prs)
*StopMiddle;
pg_wchar c;
- if (prs->pgwstr)
- c = *(prs->pgwstr + prs->state->poschar);
- else
- c = (pg_wchar) *(prs->wstr + prs->state->poschar);
+ c = *(prs->pgwstr + prs->state->poschar);
while (StopLow < StopHigh)
{
--
2.43.0
From 8f01949e03b5205a1c168cc10b76397f2064f3bd Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Tue, 7 Oct 2025 14:20:48 -0700
Subject: [PATCH v3 4/4] Remove obsolete global database_ctype_is_c.
Now that tsearch uses the database default locale, there's no need to
track the database CTYPE separately.
---
src/backend/utils/adt/pg_locale.c | 3 ---
src/backend/utils/init/postinit.c | 4 ----
src/include/utils/pg_locale.h | 3 ---
3 files changed, 10 deletions(-)
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 00d1e031472..67299c55ed8 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -100,9 +100,6 @@ char *localized_full_days[7 + 1];
char *localized_abbrev_months[12 + 1];
char *localized_full_months[12 + 1];
-/* is the databases's LC_CTYPE the C locale? */
-bool database_ctype_is_c = false;
-
static pg_locale_t default_locale = NULL;
/* indicates whether locale information cache is valid */
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c
index 641e535a73c..98f9598cd78 100644
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -430,10 +430,6 @@ CheckMyDatabase(const char *name, bool am_superuser, bool override_allow_connect
" which is not recognized by setlocale().", ctype),
errhint("Recreate the database with another locale or install the missing locale.")));
- if (strcmp(ctype, "C") == 0 ||
- strcmp(ctype, "POSIX") == 0)
- database_ctype_is_c = true;
-
init_database_collation();
/*
diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h
index 86c48c34f26..e6f939cb085 100644
--- a/src/include/utils/pg_locale.h
+++ b/src/include/utils/pg_locale.h
@@ -39,9 +39,6 @@ extern PGDLLIMPORT char *localized_full_days[];
extern PGDLLIMPORT char *localized_abbrev_months[];
extern PGDLLIMPORT char *localized_full_months[];
-/* is the databases's LC_CTYPE the C locale? */
-extern PGDLLIMPORT bool database_ctype_is_c;
-
extern bool check_locale(int category, const char *locale, char **canonname);
extern char *pg_perm_setlocale(int category, const char *locale);
--
2.43.0