On Fri, 2025-10-17 at 18:15 +0200, Peter Eisentraut wrote:
> 
> This is indeed a bit mysterious.  AFAICT, the behavior you describe
> is 
> conditional on if (prs->usewide), so it apparently depends also on
> the 
> encoding?  I'm not sure if the new code covers this.

I believe the new code does cover this case:

Previously, the code was effectively:
   if (prs->usewide && prs->pgwstr != NULL && c > 0x7f)
      return nonascii;

and the new code is:
   if (prs->charmaxlen > 1 && locale->ctype_is_c && wc > 0x7f)
      return nonascii;

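Unless I missed something, those are equivalent: prs->usewide was set
exactly when prs->charmaxlen > 1, and prs->pgwstr was only allocated
when the database ctype was C.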

> After this patch set, char2wchar() can become a local function in 
> pg_locale_libc.c.  (But we still need wchar2char() externally, so
> maybe 
> it's not worth changing this (yes).)

Done.

The rest of the patches are rebased with no other changes. I plan to
commit soon.

Regards,
        Jeff Davis

From 9a0add7e664d4313ab1bdb0730da02af61fb184f Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Mon, 6 Oct 2025 13:05:17 -0700
Subject: [PATCH v4 1/3] tsearch: use database default collation for parsing.

Previously, tsearch used the database's CTYPE setting, which only
matches the database default collation if the locale provider is libc.

Note that tsearch types (tsvector and tsquery) are not collatable
types. The locale affects parsing the original text, which is a lossy
process, so a COLLATE clause on the already-parsed value would not
make sense.

Reviewed-by: Peter Eisentraut <[email protected]>
Discussion: https://postgr.es/m/[email protected]
---
 src/backend/tsearch/ts_locale.c   | 40 ++++++-----------
 src/backend/tsearch/wparser_def.c | 71 ++++++-------------------------
 2 files changed, 27 insertions(+), 84 deletions(-)

diff --git a/src/backend/tsearch/ts_locale.c b/src/backend/tsearch/ts_locale.c
index 4801fe90089..4422f042d12 100644
--- a/src/backend/tsearch/ts_locale.c
+++ b/src/backend/tsearch/ts_locale.c
@@ -20,45 +20,33 @@
 static void tsearch_readline_callback(void *arg);
 
 
-/*
- * The reason these functions use a 3-wchar_t output buffer, not 2 as you
- * might expect, is that on Windows "wchar_t" is 16 bits and what we'll be
- * getting from char2wchar() is UTF16 not UTF32.  A single input character
- * may therefore produce a surrogate pair rather than just one wchar_t;
- * we also need room for a trailing null.  When we do get a surrogate pair,
- * we pass just the first code to iswdigit() etc, so that these functions will
- * always return false for characters outside the Basic Multilingual Plane.
- */
-#define WC_BUF_LEN  3
+/* space for a single character plus a trailing NUL */
+#define WC_BUF_LEN  2
 
 int
 t_isalpha(const char *ptr)
 {
-	int			clen = pg_mblen(ptr);
-	wchar_t		character[WC_BUF_LEN];
-	locale_t	mylocale = 0;	/* TODO */
+	pg_wchar	wstr[WC_BUF_LEN];
+	int			wlen pg_attribute_unused();
 
-	if (clen == 1 || database_ctype_is_c)
-		return isalpha(TOUCHAR(ptr));
+	wlen = pg_mb2wchar_with_len(ptr, wstr, pg_mblen(ptr));
+	Assert(wlen <= 1);
 
-	char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
-
-	return iswalpha((wint_t) character[0]);
+	/* pass single character, or NUL if empty */
+	return pg_iswalpha(wstr[0], pg_database_locale());
 }
 
 int
 t_isalnum(const char *ptr)
 {
-	int			clen = pg_mblen(ptr);
-	wchar_t		character[WC_BUF_LEN];
-	locale_t	mylocale = 0;	/* TODO */
-
-	if (clen == 1 || database_ctype_is_c)
-		return isalnum(TOUCHAR(ptr));
+	pg_wchar	wstr[WC_BUF_LEN];
+	int			wlen pg_attribute_unused();
 
-	char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
+	wlen = pg_mb2wchar_with_len(ptr, wstr, pg_mblen(ptr));
+	Assert(wlen <= 1);
 
-	return iswalnum((wint_t) character[0]);
+	/* pass single character, or NUL if empty */
+	return pg_iswalnum(wstr[0], pg_database_locale());
 }
 
 
diff --git a/src/backend/tsearch/wparser_def.c b/src/backend/tsearch/wparser_def.c
index e2dd3da3aa3..251a2ae6563 100644
--- a/src/backend/tsearch/wparser_def.c
+++ b/src/backend/tsearch/wparser_def.c
@@ -243,9 +243,7 @@ typedef struct TParser
 	/* string and position information */
 	char	   *str;			/* multibyte string */
 	int			lenstr;			/* length of mbstring */
-	wchar_t    *wstr;			/* wide character string */
 	pg_wchar   *pgwstr;			/* wide character string for C-locale */
-	bool		usewide;
 
 	/* State of parse */
 	int			charmaxlen;
@@ -293,33 +291,8 @@ TParserInit(char *str, int len)
 	prs->charmaxlen = pg_database_encoding_max_length();
 	prs->str = str;
 	prs->lenstr = len;
-
-	/*
-	 * Use wide char code only when max encoding length > 1.
-	 */
-	if (prs->charmaxlen > 1)
-	{
-		locale_t	mylocale = 0;	/* TODO */
-
-		prs->usewide = true;
-		if (database_ctype_is_c)
-		{
-			/*
-			 * char2wchar doesn't work for C-locale and sizeof(pg_wchar) could
-			 * be different from sizeof(wchar_t)
-			 */
-			prs->pgwstr = (pg_wchar *) palloc(sizeof(pg_wchar) * (prs->lenstr + 1));
-			pg_mb2wchar_with_len(prs->str, prs->pgwstr, prs->lenstr);
-		}
-		else
-		{
-			prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1));
-			char2wchar(prs->wstr, prs->lenstr + 1, prs->str, prs->lenstr,
-					   mylocale);
-		}
-	}
-	else
-		prs->usewide = false;
+	prs->pgwstr = (pg_wchar *) palloc(sizeof(pg_wchar) * (prs->lenstr + 1));
+	pg_mb2wchar_with_len(prs->str, prs->pgwstr, prs->lenstr);
 
 	prs->state = newTParserPosition(NULL);
 	prs->state->state = TPS_Base;
@@ -350,12 +323,9 @@ TParserCopyInit(const TParser *orig)
 	prs->charmaxlen = orig->charmaxlen;
 	prs->str = orig->str + orig->state->posbyte;
 	prs->lenstr = orig->lenstr - orig->state->posbyte;
-	prs->usewide = orig->usewide;
 
 	if (orig->pgwstr)
 		prs->pgwstr = orig->pgwstr + orig->state->poschar;
-	if (orig->wstr)
-		prs->wstr = orig->wstr + orig->state->poschar;
 
 	prs->state = newTParserPosition(NULL);
 	prs->state->state = TPS_Base;
@@ -379,8 +349,6 @@ TParserClose(TParser *prs)
 		prs->state = ptr;
 	}
 
-	if (prs->wstr)
-		pfree(prs->wstr);
 	if (prs->pgwstr)
 		pfree(prs->pgwstr);
 
@@ -412,13 +380,9 @@ TParserCopyClose(TParser *prs)
 
 
 /*
- * Character-type support functions, equivalent to is* macros, but
- * working with any possible encodings and locales. Notes:
- *	- with multibyte encoding and C-locale isw* function may fail
- *	  or give wrong result.
- *	- multibyte encoding and C-locale often are used for
- *	  Asian languages.
- *	- if locale is C then we use pgwstr instead of wstr.
+ * Character-type support functions using the database default locale. If the
+ * locale is C, and the input character is non-ascii, the value to be returned
+ * is determined by the 'nonascii' macro argument.
  */
 
 #define p_iswhat(type, nonascii)											\
@@ -426,19 +390,13 @@ TParserCopyClose(TParser *prs)
 static int																	\
 p_is##type(TParser *prs)													\
 {																			\
+	pg_locale_t locale = pg_database_locale();								\
+	pg_wchar	wc;															\
 	Assert(prs->state);														\
-	if (prs->usewide)														\
-	{																		\
-		if (prs->pgwstr)													\
-		{																	\
-			unsigned int c = *(prs->pgwstr + prs->state->poschar);			\
-			if (c > 0x7f)													\
-				return nonascii;											\
-			return is##type(c);												\
-		}																	\
-		return isw##type(*(prs->wstr + prs->state->poschar));				\
-	}																		\
-	return is##type(*(unsigned char *) (prs->str + prs->state->posbyte));	\
+	wc = prs->pgwstr[prs->state->poschar];									\
+	if (prs->charmaxlen > 1 && locale->ctype_is_c && wc > 0x7f)				\
+		return nonascii;													\
+	return pg_isw##type(wc, pg_database_locale());						\
 }																			\
 																			\
 static int																	\
@@ -703,7 +661,7 @@ p_isspecial(TParser *prs)
 	 * Check that only in utf encoding, because other encodings aren't
 	 * supported by postgres or even exists.
 	 */
-	if (GetDatabaseEncoding() == PG_UTF8 && prs->usewide)
+	if (GetDatabaseEncoding() == PG_UTF8)
 	{
 		static const pg_wchar strange_letter[] = {
 			/*
@@ -944,10 +902,7 @@ p_isspecial(TParser *prs)
 				   *StopMiddle;
 		pg_wchar	c;
 
-		if (prs->pgwstr)
-			c = *(prs->pgwstr + prs->state->poschar);
-		else
-			c = (pg_wchar) *(prs->wstr + prs->state->poschar);
+		c = *(prs->pgwstr + prs->state->poschar);
 
 		while (StopLow < StopHigh)
 		{
-- 
2.43.0

From f507785d80dc7cb26f880418e3e6fb1f29ff3e61 Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Tue, 7 Oct 2025 14:20:48 -0700
Subject: [PATCH v4 2/3] Remove obsolete global database_ctype_is_c.

Now that tsearch uses the database default locale, there's no need to
track the database CTYPE separately.

Reviewed-by: Peter Eisentraut <[email protected]>
Discussion: https://postgr.es/m/[email protected]
---
 src/backend/utils/adt/pg_locale.c | 3 ---
 src/backend/utils/init/postinit.c | 4 ----
 src/include/utils/pg_locale.h     | 3 ---
 3 files changed, 10 deletions(-)

diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 00d1e031472..67299c55ed8 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -100,9 +100,6 @@ char	   *localized_full_days[7 + 1];
 char	   *localized_abbrev_months[12 + 1];
 char	   *localized_full_months[12 + 1];
 
-/* is the databases's LC_CTYPE the C locale? */
-bool		database_ctype_is_c = false;
-
 static pg_locale_t default_locale = NULL;
 
 /* indicates whether locale information cache is valid */
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c
index 641e535a73c..98f9598cd78 100644
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -430,10 +430,6 @@ CheckMyDatabase(const char *name, bool am_superuser, bool override_allow_connect
 						   " which is not recognized by setlocale().", ctype),
 				 errhint("Recreate the database with another locale or install the missing locale.")));
 
-	if (strcmp(ctype, "C") == 0 ||
-		strcmp(ctype, "POSIX") == 0)
-		database_ctype_is_c = true;
-
 	init_database_collation();
 
 	/*
diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h
index 86c48c34f26..e6f939cb085 100644
--- a/src/include/utils/pg_locale.h
+++ b/src/include/utils/pg_locale.h
@@ -39,9 +39,6 @@ extern PGDLLIMPORT char *localized_full_days[];
 extern PGDLLIMPORT char *localized_abbrev_months[];
 extern PGDLLIMPORT char *localized_full_months[];
 
-/* is the databases's LC_CTYPE the C locale? */
-extern PGDLLIMPORT bool database_ctype_is_c;
-
 extern bool check_locale(int category, const char *locale, char **canonname);
 extern char *pg_perm_setlocale(int category, const char *locale);
 
-- 
2.43.0

From 0115cb089f868fd033e6c0031db7266c40f435aa Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Sat, 18 Oct 2025 17:06:19 -0700
Subject: [PATCH v4 3/3] Make char2wchar() static.

Reviewed-by: Peter Eisentraut <[email protected]>
Discussion: https://postgr.es/m/[email protected]
---
 src/backend/utils/adt/pg_locale_libc.c |  5 ++++-
 src/include/utils/pg_locale.h          | 10 +++-------
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c
index 34865ccf00e..19a50662398 100644
--- a/src/backend/utils/adt/pg_locale_libc.c
+++ b/src/backend/utils/adt/pg_locale_libc.c
@@ -99,6 +99,9 @@ static int	strncoll_libc_win32_utf8(const char *arg1, ssize_t len1,
 									 pg_locale_t locale);
 #endif
 
+static size_t char2wchar(wchar_t *to, size_t tolen, const char *from,
+						 size_t fromlen, locale_t loc);
+
 static size_t strlower_libc_sb(char *dest, size_t destsize,
 							   const char *src, ssize_t srclen,
 							   pg_locale_t locale);
@@ -1206,7 +1209,7 @@ wchar2char(char *to, const wchar_t *from, size_t tolen, locale_t loc)
  * input encoding.  tolen is the maximum number of wchar_t's to store at *to.
  * The output will be zero-terminated iff there is room.
  */
-size_t
+static size_t
 char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen,
 		   locale_t loc)
 {
diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h
index e6f939cb085..e08cb8228fa 100644
--- a/src/include/utils/pg_locale.h
+++ b/src/include/utils/pg_locale.h
@@ -139,10 +139,8 @@ struct ctype_methods
  * "default" collation, there are separate static cache variables, since
  * consulting the pg_collation catalog doesn't tell us what we need.
  *
- * Note that some code relies on the flags not reporting false negatives
- * (that is, saying it's not C when it is).  For example, char2wchar()
- * could fail if the locale is C, so str_tolower() shouldn't call it
- * in that case.
+ * Note that some code, such as wchar2char(), relies on the flags not
+ * reporting false negatives (that is, saying it's not C when it is).
  */
 struct pg_locale_struct
 {
@@ -226,10 +224,8 @@ extern void icu_validate_locale(const char *loc_str);
 extern char *icu_language_tag(const char *loc_str, int elevel);
 extern void report_newlocale_failure(const char *localename);
 
-/* These functions convert from/to libc's wchar_t, *not* pg_wchar_t */
+/* This function converts from libc's wchar_t, *not* pg_wchar */
 extern size_t wchar2char(char *to, const wchar_t *from, size_t tolen,
 						 locale_t loc);
-extern size_t char2wchar(wchar_t *to, size_t tolen,
-						 const char *from, size_t fromlen, locale_t loc);
 
 #endif							/* _PG_LOCALE_ */
-- 
2.43.0

Reply via email to