On Wed, Oct 29, 2025 at 2:00 PM Thomas Munro <[email protected]> wrote:
> I'm picturing something like PG_WCHAR_CHAR
> (directly usable with ctype.h), PG_WCHAR_UTF32 (self-explanatory, also
> assumed to be compatible with UTF-8 locales' wchar_t), PG_WCHAR_CUSTOM
> (we only know that ASCII range is sane as Ishii-san explained, and for
> anything else you'd need to re-encode via libc or give up, but
> preferably not go nuts and return junk). The enum would create a new
> central place to document the cross-module semantics.
Here are some sketch-quality patches to try out some of these ideas,
for discussion. I gave them .txt endings so as not to hijack your
thread's CI.
* Fixing a different but related bug spotted in passing: we truncate
codepoints passed to Windows' iswalpha_l() et al, instead of detecting
overflow like some other places do. Not tested on Windows, but it
seemed pretty obviously wrong?
* Classifying all pg_wchar encodings as producing PG_WCHAR_CHAR,
PG_WCHAR_UTF32 or PG_WCHAR_CUSTOM, and dispatching to libc ctype
methods based on that.
* Easy EUC change: filtering out non-ASCII for _CUSTOM. I can't seem
to convince SQL-level regexes to expose bogus results on master
though... maybe the pg_wchar encoding actively avoids that by shifting
values up so that you often or always cast to a harmless value? Still
better to formalise that I think, if we don't move ahead with the more
ambitious plan...
* More ambitious re-encoding strategy, replacing previous change, with
apparently plausible results.
* Various refactorings with helper macros to avoid making mistakes in
all that repetitive wrapper stuff.
Here's what my ja_JP.eucJP database shows, on FreeBSD. BTW in my
earlier emails I was confused and thought that kanji would not be in
class [[:alpha:]], but that's wrong: Unicode calls it "other letter",
and it looks like that makes all modern libcs return true for
iswalpha():
postgres=# select regexp_replace('1234 Постгрес 5678', '[[:alpha:]]+', '象');
regexp_replace
----------------
1234 象 5678
(1 row)
postgres=# select regexp_replace('1234 ポスグレ 5678', '[[:alpha:]]+', '象');
regexp_replace
----------------
1234 象 5678
(1 row)
postgres=# select regexp_replace('1234 ポスグレ？ 5678', '[[:punct:]]+', '。');
regexp_replace
----------------------
1234 ポスグレ。 5678
(1 row)
(That's not an ASCII question mark, it's one of the kanji-box sized
punctuation characters.)
I had to hack regc_pg_locale.c slightly to teach it that just because
I set max_chr to 127 it doesn't mean I want it to turn locale support
off. Haven't looked into that code to figure out what it should do
instead, but it definitely shouldn't be allowed to probe made up
pg_wchar values, because EUC's pg_wchar encoding is sparse and
transcoding can error out.
A mystery that blocked me for too long: regexp_match('café', 'CAFÉ',
'i') and regexp_match('Αθήνα', 'ΑΘΉΝΑ', 'i') match with Apple's
ja_JP.eucJP, as do the examples above, but mysteriously didn't on
FreeBSD, where this code started. That could be a bug in its ja_JP.eucJP
locale affecting toupper/tolower... Wish I could get that time back.
I imagine that for the ICU + non-UTF-8 locale bug you mentioned, we
might need a very similar set of re-encoding wrappers: something like
pg_wchar -> mb -> UTF-8 -> UTF-32. All this re-encoding sounds
pretty bad, but I can't see any way around the re-encoding with these
edge-case configurations, and we're still supposed to spit out correct
answers...
From 5525b5e35121bdfd5eb566b7a08916fe90822422 Mon Sep 17 00:00:00 2001
From: Thomas Munro <[email protected]>
Date: Wed, 29 Oct 2025 15:53:46 +1300
Subject: [PATCH 1/8] Fix Windows wctype.h usage for codepoints outside Unicode
BMP.
Windows' wchar_t is only 16 bits wide. As established by the
towupper_l()/towlower_l() wrapper functions, we should avoid truncating
overflowing code points when calling wctype.h functions, and just return
false. Windows just can't answer that question, but it didn't make
sense to return the answer for a totally different character.
---
src/backend/utils/adt/pg_locale_libc.c | 27 +++++++++++++++++---------
1 file changed, 18 insertions(+), 9 deletions(-)
diff --git a/src/backend/utils/adt/pg_locale_libc.c
b/src/backend/utils/adt/pg_locale_libc.c
index 9c7fcd1fc7a..761ed1a0603 100644
--- a/src/backend/utils/adt/pg_locale_libc.c
+++ b/src/backend/utils/adt/pg_locale_libc.c
@@ -187,55 +187,64 @@ wc_isxdigit_libc_sb(pg_wchar wc, pg_locale_t locale)
static bool
wc_isdigit_libc_mb(pg_wchar wc, pg_locale_t locale)
{
- return iswdigit_l((wint_t) wc, locale->lt);
+ return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
+ iswdigit_l((wint_t) wc, locale->lt);
}
static bool
wc_isalpha_libc_mb(pg_wchar wc, pg_locale_t locale)
{
- return iswalpha_l((wint_t) wc, locale->lt);
+ return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
+ iswalpha_l((wint_t) wc, locale->lt);
}
static bool
wc_isalnum_libc_mb(pg_wchar wc, pg_locale_t locale)
{
- return iswalnum_l((wint_t) wc, locale->lt);
+ return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
+ iswalnum_l((wint_t) wc, locale->lt);
}
static bool
wc_isupper_libc_mb(pg_wchar wc, pg_locale_t locale)
{
- return iswupper_l((wint_t) wc, locale->lt);
+ return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
+ iswupper_l((wint_t) wc, locale->lt);
}
static bool
wc_islower_libc_mb(pg_wchar wc, pg_locale_t locale)
{
- return iswlower_l((wint_t) wc, locale->lt);
+ return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
+ iswlower_l((wint_t) wc, locale->lt);
}
static bool
wc_isgraph_libc_mb(pg_wchar wc, pg_locale_t locale)
{
- return iswgraph_l((wint_t) wc, locale->lt);
+ return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
+ iswgraph_l((wint_t) wc, locale->lt);
}
static bool
wc_isprint_libc_mb(pg_wchar wc, pg_locale_t locale)
{
- return iswprint_l((wint_t) wc, locale->lt);
+ return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
+ iswprint_l((wint_t) wc, locale->lt);
}
static bool
wc_ispunct_libc_mb(pg_wchar wc, pg_locale_t locale)
{
- return iswpunct_l((wint_t) wc, locale->lt);
+ return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
+ iswpunct_l((wint_t) wc, locale->lt);
}
static bool
wc_isspace_libc_mb(pg_wchar wc, pg_locale_t locale)
{
- return iswspace_l((wint_t) wc, locale->lt);
+ return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
+ iswspace_l((wint_t) wc, locale->lt);
}
static bool
--
2.50.1 (Apple Git-155)
From 9a9026c29f3e9cd3c1b7fd92e053bcb5ecc5f6ae Mon Sep 17 00:00:00 2001
From: Thomas Munro <[email protected]>
Date: Wed, 29 Oct 2025 15:14:13 +1300
Subject: [PATCH 2/8] Formalize pg_wchar encoding schemes.
Create a bit more clarity about the different ways that pg_wchar can be
encoded, by naming the three schemes in use. This also allows a
dispatch-table format in pg_locale_libc.c.
Discussion:
https://www.postgresql.org/message-id/flat/CA%2BhUKG%2BhDkp1etcfy%3DtaxJ8ybf8KapyOjqdBRPF7yaoSoSj1_w%40mail.gmail.com
---
src/backend/utils/adt/pg_locale_libc.c | 163 +++++++++++++------------
src/common/wchar.c | 94 +++++++-------
src/include/mb/pg_wchar.h | 51 ++++++++
src/tools/pgindent/typedefs.list | 1 +
4 files changed, 187 insertions(+), 122 deletions(-)
diff --git a/src/backend/utils/adt/pg_locale_libc.c
b/src/backend/utils/adt/pg_locale_libc.c
index 761ed1a0603..1892ed3c5ce 100644
--- a/src/backend/utils/adt/pg_locale_libc.c
+++ b/src/backend/utils/adt/pg_locale_libc.c
@@ -43,20 +43,25 @@
* the <ctype.h> functions since those will obey LC_CTYPE. Note that these
* collations don't give a fig about multibyte characters.
*
- * 2. When working in UTF8 encoding, we use the <wctype.h> functions.
+ * 2. PG_WCHAR_UTF32 encoding scheme:
+ *
+ * When working in UTF8 encoding, we use the <wctype.h> functions.
* This assumes that every platform uses Unicode codepoints directly
* as the wchar_t representation of Unicode. On some platforms
* wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF.
*
- * 3. In all other encodings, we use the <ctype.h> functions for pg_wchar
+ * 3. PG_WCHAR_CHAR and PG_WCHAR_CUSTOM encoding schemes:
+ *
+ * In all other encodings, we use the <ctype.h> functions for pg_wchar
* values up to 255, and punt for values above that. This is 100% correct
- * only in single-byte encodings such as LATINn. However, non-Unicode
- * multibyte encodings are mostly Far Eastern character sets for which the
- * properties being tested here aren't very relevant for higher code values
- * anyway. The difficulty with using the <wctype.h> functions with
- * non-Unicode multibyte encodings is that we can have no certainty that
- * the platform's wchar_t representation matches what we do in pg_wchar
- * conversions.
+ * only in single-byte encodings such as LATINn (PG_WCHAR_CHAR). However,
+ * non-Unicode multibyte encodings (PG_WCHAR_CUSTOM) are all Far Eastern
+ * character sets for which the properties being tested here aren't very
+ * relevant for higher code values anyway. The difficulty with using the
+ * <wctype.h> functions with non-Unicode multibyte encodings is that we can
+ * have no certainty that the platform's wchar_t representation matches what we
+ * do in pg_wchar conversions. (MULE is also declared PG_WCHAR_CUSTOM but is
+ * not available as a multi-byte encoding in any known libc.)
*
* As a special case, in the "default" collation, (2) and (3) force ASCII
* letters to follow ASCII upcase/downcase rules, while in a non-default
@@ -331,70 +336,75 @@ tolower_libc_mb(pg_wchar wc, pg_locale_t locale)
return wc;
}
-static const struct ctype_methods ctype_methods_libc_sb = {
- .strlower = strlower_libc_sb,
- .strtitle = strtitle_libc_sb,
- .strupper = strupper_libc_sb,
- .wc_isdigit = wc_isdigit_libc_sb,
- .wc_isalpha = wc_isalpha_libc_sb,
- .wc_isalnum = wc_isalnum_libc_sb,
- .wc_isupper = wc_isupper_libc_sb,
- .wc_islower = wc_islower_libc_sb,
- .wc_isgraph = wc_isgraph_libc_sb,
- .wc_isprint = wc_isprint_libc_sb,
- .wc_ispunct = wc_ispunct_libc_sb,
- .wc_isspace = wc_isspace_libc_sb,
- .wc_isxdigit = wc_isxdigit_libc_sb,
- .char_is_cased = char_is_cased_libc,
- .char_tolower = char_tolower_libc,
- .wc_toupper = toupper_libc_sb,
- .wc_tolower = tolower_libc_sb,
- .max_chr = UCHAR_MAX,
-};
-
-/*
- * Non-UTF8 multibyte encodings use multibyte semantics for case mapping, but
- * single-byte semantics for pattern matching.
- */
-static const struct ctype_methods ctype_methods_libc_other_mb = {
- .strlower = strlower_libc_mb,
- .strtitle = strtitle_libc_mb,
- .strupper = strupper_libc_mb,
- .wc_isdigit = wc_isdigit_libc_sb,
- .wc_isalpha = wc_isalpha_libc_sb,
- .wc_isalnum = wc_isalnum_libc_sb,
- .wc_isupper = wc_isupper_libc_sb,
- .wc_islower = wc_islower_libc_sb,
- .wc_isgraph = wc_isgraph_libc_sb,
- .wc_isprint = wc_isprint_libc_sb,
- .wc_ispunct = wc_ispunct_libc_sb,
- .wc_isspace = wc_isspace_libc_sb,
- .wc_isxdigit = wc_isxdigit_libc_sb,
- .char_is_cased = char_is_cased_libc,
- .char_tolower = char_tolower_libc,
- .wc_toupper = toupper_libc_sb,
- .wc_tolower = tolower_libc_sb,
- .max_chr = UCHAR_MAX,
-};
+static const struct ctype_methods ctype_methods_libc[] = {
+ [PG_WCHAR_CHAR] = {
+ .strlower = strlower_libc_sb,
+ .strtitle = strtitle_libc_sb,
+ .strupper = strupper_libc_sb,
+ .wc_isdigit = wc_isdigit_libc_sb,
+ .wc_isalpha = wc_isalpha_libc_sb,
+ .wc_isalnum = wc_isalnum_libc_sb,
+ .wc_isupper = wc_isupper_libc_sb,
+ .wc_islower = wc_islower_libc_sb,
+ .wc_isgraph = wc_isgraph_libc_sb,
+ .wc_isprint = wc_isprint_libc_sb,
+ .wc_ispunct = wc_ispunct_libc_sb,
+ .wc_isspace = wc_isspace_libc_sb,
+ .wc_isxdigit = wc_isxdigit_libc_sb,
+ .char_is_cased = char_is_cased_libc,
+ .char_tolower = char_tolower_libc,
+ .wc_toupper = toupper_libc_sb,
+ .wc_tolower = tolower_libc_sb,
+ .max_chr = UCHAR_MAX,
+ },
+ [PG_WCHAR_UTF32] = {
+ .strlower = strlower_libc_mb,
+ .strtitle = strtitle_libc_mb,
+ .strupper = strupper_libc_mb,
+ .wc_isdigit = wc_isdigit_libc_mb,
+ .wc_isalpha = wc_isalpha_libc_mb,
+ .wc_isalnum = wc_isalnum_libc_mb,
+ .wc_isupper = wc_isupper_libc_mb,
+ .wc_islower = wc_islower_libc_mb,
+ .wc_isgraph = wc_isgraph_libc_mb,
+ .wc_isprint = wc_isprint_libc_mb,
+ .wc_ispunct = wc_ispunct_libc_mb,
+ .wc_isspace = wc_isspace_libc_mb,
+ .wc_isxdigit = wc_isxdigit_libc_mb,
+ .char_is_cased = char_is_cased_libc,
+ .char_tolower = char_tolower_libc,
+ .wc_toupper = toupper_libc_mb,
+ .wc_tolower = tolower_libc_mb,
+ },
-static const struct ctype_methods ctype_methods_libc_utf8 = {
- .strlower = strlower_libc_mb,
- .strtitle = strtitle_libc_mb,
- .strupper = strupper_libc_mb,
- .wc_isdigit = wc_isdigit_libc_mb,
- .wc_isalpha = wc_isalpha_libc_mb,
- .wc_isalnum = wc_isalnum_libc_mb,
- .wc_isupper = wc_isupper_libc_mb,
- .wc_islower = wc_islower_libc_mb,
- .wc_isgraph = wc_isgraph_libc_mb,
- .wc_isprint = wc_isprint_libc_mb,
- .wc_ispunct = wc_ispunct_libc_mb,
- .wc_isspace = wc_isspace_libc_mb,
- .wc_isxdigit = wc_isxdigit_libc_mb,
- .char_is_cased = char_is_cased_libc,
- .char_tolower = char_tolower_libc,
- .wc_toupper = toupper_libc_mb,
- .wc_tolower = tolower_libc_mb,
+ /*
+ * Custom pg_wchar formats converted from non-UTF8 multibyte encodings use
+ * multibyte semantics for case mapping, but single-byte semantics for
+ * pattern matching.
+ *
+ * XXX Therefore this gives incorrect results for pattern matching outside
+ * the ASCII range. Could be fixed.
+ */
+ [PG_WCHAR_CUSTOM] = {
+ .strlower = strlower_libc_mb,
+ .strtitle = strtitle_libc_mb,
+ .strupper = strupper_libc_mb,
+ .wc_isdigit = wc_isdigit_libc_sb,
+ .wc_isalpha = wc_isalpha_libc_sb,
+ .wc_isalnum = wc_isalnum_libc_sb,
+ .wc_isupper = wc_isupper_libc_sb,
+ .wc_islower = wc_islower_libc_sb,
+ .wc_isgraph = wc_isgraph_libc_sb,
+ .wc_isprint = wc_isprint_libc_sb,
+ .wc_ispunct = wc_ispunct_libc_sb,
+ .wc_isspace = wc_isspace_libc_sb,
+ .wc_isxdigit = wc_isxdigit_libc_sb,
+ .char_is_cased = char_is_cased_libc,
+ .char_tolower = char_tolower_libc,
+ .wc_toupper = toupper_libc_sb,
+ .wc_tolower = tolower_libc_sb,
+ .max_chr = UCHAR_MAX,
+ },
};
static const struct collate_methods collate_methods_libc = {
@@ -763,14 +773,7 @@ create_pg_locale_libc(Oid collid, MemoryContext context)
result->collate = &collate_methods_libc;
}
if (!result->ctype_is_c)
- {
- if (GetDatabaseEncoding() == PG_UTF8)
- result->ctype = &ctype_methods_libc_utf8;
- else if (pg_database_encoding_max_length() > 1)
- result->ctype = &ctype_methods_libc_other_mb;
- else
- result->ctype = &ctype_methods_libc_sb;
- }
+ result->ctype =
&ctype_methods_libc[pg_wchar_encoding_scheme(GetDatabaseEncoding())];
return result;
}
diff --git a/src/common/wchar.c b/src/common/wchar.c
index a4bc29921de..f453587749a 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -2062,50 +2062,60 @@ pg_encoding_set_invalid(int encoding, char *dst)
*-------------------------------------------------------------------
*/
const pg_wchar_tbl pg_wchar_table[] = {
- [PG_SQL_ASCII] = {pg_ascii2wchar_with_len, pg_wchar2single_with_len,
pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifychar, pg_ascii_verifystr, 1},
- [PG_EUC_JP] = {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len,
pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},
- [PG_EUC_CN] = {pg_euccn2wchar_with_len, pg_wchar2euc_with_len,
pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifychar, pg_euccn_verifystr, 2},
- [PG_EUC_KR] = {pg_euckr2wchar_with_len, pg_wchar2euc_with_len,
pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifychar, pg_euckr_verifystr, 3},
- [PG_EUC_TW] = {pg_euctw2wchar_with_len, pg_wchar2euc_with_len,
pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifychar, pg_euctw_verifystr, 4},
- [PG_EUC_JIS_2004] = {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len,
pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},
- [PG_UTF8] = {pg_utf2wchar_with_len, pg_wchar2utf_with_len,
pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifychar, pg_utf8_verifystr, 4},
- [PG_MULE_INTERNAL] = {pg_mule2wchar_with_len, pg_wchar2mule_with_len,
pg_mule_mblen, pg_mule_dsplen, pg_mule_verifychar, pg_mule_verifystr, 4},
- [PG_LATIN1] = {pg_latin12wchar_with_len, pg_wchar2single_with_len,
pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr,
1},
- [PG_LATIN2] = {pg_latin12wchar_with_len, pg_wchar2single_with_len,
pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr,
1},
- [PG_LATIN3] = {pg_latin12wchar_with_len, pg_wchar2single_with_len,
pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr,
1},
- [PG_LATIN4] = {pg_latin12wchar_with_len, pg_wchar2single_with_len,
pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr,
1},
- [PG_LATIN5] = {pg_latin12wchar_with_len, pg_wchar2single_with_len,
pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr,
1},
- [PG_LATIN6] = {pg_latin12wchar_with_len, pg_wchar2single_with_len,
pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr,
1},
- [PG_LATIN7] = {pg_latin12wchar_with_len, pg_wchar2single_with_len,
pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr,
1},
- [PG_LATIN8] = {pg_latin12wchar_with_len, pg_wchar2single_with_len,
pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr,
1},
- [PG_LATIN9] = {pg_latin12wchar_with_len, pg_wchar2single_with_len,
pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr,
1},
- [PG_LATIN10] = {pg_latin12wchar_with_len, pg_wchar2single_with_len,
pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr,
1},
- [PG_WIN1256] = {pg_latin12wchar_with_len, pg_wchar2single_with_len,
pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr,
1},
- [PG_WIN1258] = {pg_latin12wchar_with_len, pg_wchar2single_with_len,
pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr,
1},
- [PG_WIN866] = {pg_latin12wchar_with_len, pg_wchar2single_with_len,
pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr,
1},
- [PG_WIN874] = {pg_latin12wchar_with_len, pg_wchar2single_with_len,
pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr,
1},
- [PG_KOI8R] = {pg_latin12wchar_with_len, pg_wchar2single_with_len,
pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr,
1},
- [PG_WIN1251] = {pg_latin12wchar_with_len, pg_wchar2single_with_len,
pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr,
1},
- [PG_WIN1252] = {pg_latin12wchar_with_len, pg_wchar2single_with_len,
pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr,
1},
- [PG_ISO_8859_5] = {pg_latin12wchar_with_len, pg_wchar2single_with_len,
pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr,
1},
- [PG_ISO_8859_6] = {pg_latin12wchar_with_len, pg_wchar2single_with_len,
pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr,
1},
- [PG_ISO_8859_7] = {pg_latin12wchar_with_len, pg_wchar2single_with_len,
pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr,
1},
- [PG_ISO_8859_8] = {pg_latin12wchar_with_len, pg_wchar2single_with_len,
pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr,
1},
- [PG_WIN1250] = {pg_latin12wchar_with_len, pg_wchar2single_with_len,
pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr,
1},
- [PG_WIN1253] = {pg_latin12wchar_with_len, pg_wchar2single_with_len,
pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr,
1},
- [PG_WIN1254] = {pg_latin12wchar_with_len, pg_wchar2single_with_len,
pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr,
1},
- [PG_WIN1255] = {pg_latin12wchar_with_len, pg_wchar2single_with_len,
pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr,
1},
- [PG_WIN1257] = {pg_latin12wchar_with_len, pg_wchar2single_with_len,
pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr,
1},
- [PG_KOI8U] = {pg_latin12wchar_with_len, pg_wchar2single_with_len,
pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr,
1},
- [PG_SJIS] = {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar,
pg_sjis_verifystr, 2},
- [PG_BIG5] = {0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifychar,
pg_big5_verifystr, 2},
- [PG_GBK] = {0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifychar,
pg_gbk_verifystr, 2},
- [PG_UHC] = {0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifychar,
pg_uhc_verifystr, 2},
- [PG_GB18030] = {0, 0, pg_gb18030_mblen, pg_gb18030_dsplen,
pg_gb18030_verifychar, pg_gb18030_verifystr, 4},
- [PG_JOHAB] = {0, 0, pg_johab_mblen, pg_johab_dsplen,
pg_johab_verifychar, pg_johab_verifystr, 3},
- [PG_SHIFT_JIS_2004] = {0, 0, pg_sjis_mblen, pg_sjis_dsplen,
pg_sjis_verifychar, pg_sjis_verifystr, 2},
+ [PG_SQL_ASCII] = {PG_WCHAR_CHAR, pg_ascii2wchar_with_len,
pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifychar,
pg_ascii_verifystr, 1},
+ [PG_EUC_JP] = {PG_WCHAR_CUSTOM, pg_eucjp2wchar_with_len,
pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar,
pg_eucjp_verifystr, 3},
+ [PG_EUC_CN] = {PG_WCHAR_CUSTOM, pg_euccn2wchar_with_len,
pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifychar,
pg_euccn_verifystr, 2},
+ [PG_EUC_KR] = {PG_WCHAR_CUSTOM, pg_euckr2wchar_with_len,
pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifychar,
pg_euckr_verifystr, 3},
+ [PG_EUC_TW] = {PG_WCHAR_CUSTOM, pg_euctw2wchar_with_len,
pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifychar,
pg_euctw_verifystr, 4},
+ [PG_EUC_JIS_2004] = {PG_WCHAR_CUSTOM, pg_eucjp2wchar_with_len,
pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar,
pg_eucjp_verifystr, 3},
+ [PG_UTF8] = {PG_WCHAR_UTF32, pg_utf2wchar_with_len,
pg_wchar2utf_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifychar,
pg_utf8_verifystr, 4},
+ [PG_MULE_INTERNAL] = {PG_WCHAR_CUSTOM, pg_mule2wchar_with_len,
pg_wchar2mule_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifychar,
pg_mule_verifystr, 4},
+ [PG_LATIN1] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len,
pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen,
pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_LATIN2] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len,
pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen,
pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_LATIN3] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len,
pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen,
pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_LATIN4] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len,
pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen,
pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_LATIN5] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len,
pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen,
pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_LATIN6] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len,
pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen,
pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_LATIN7] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len,
pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen,
pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_LATIN8] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len,
pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen,
pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_LATIN9] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len,
pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen,
pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_LATIN10] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len,
pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen,
pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_WIN1256] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len,
pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen,
pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_WIN1258] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len,
pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen,
pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_WIN866] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len,
pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen,
pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_WIN874] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len,
pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen,
pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_KOI8R] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len,
pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen,
pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_WIN1251] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len,
pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen,
pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_WIN1252] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len,
pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen,
pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_ISO_8859_5] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len,
pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen,
pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_ISO_8859_6] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len,
pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen,
pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_ISO_8859_7] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len,
pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen,
pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_ISO_8859_8] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len,
pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen,
pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_WIN1250] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len,
pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen,
pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_WIN1253] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len,
pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen,
pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_WIN1254] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len,
pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen,
pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_WIN1255] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len,
pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen,
pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_WIN1257] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len,
pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen,
pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_KOI8U] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len,
pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen,
pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_SJIS] = {PG_WCHAR_NONE, 0, 0, pg_sjis_mblen, pg_sjis_dsplen,
pg_sjis_verifychar, pg_sjis_verifystr, 2},
+ [PG_BIG5] = {PG_WCHAR_NONE, 0, 0, pg_big5_mblen, pg_big5_dsplen,
pg_big5_verifychar, pg_big5_verifystr, 2},
+ [PG_GBK] = {PG_WCHAR_NONE, 0, 0, pg_gbk_mblen, pg_gbk_dsplen,
pg_gbk_verifychar, pg_gbk_verifystr, 2},
+ [PG_UHC] = {PG_WCHAR_NONE, 0, 0, pg_uhc_mblen, pg_uhc_dsplen,
pg_uhc_verifychar, pg_uhc_verifystr, 2},
+ [PG_GB18030] = {PG_WCHAR_NONE, 0, 0, pg_gb18030_mblen,
pg_gb18030_dsplen, pg_gb18030_verifychar, pg_gb18030_verifystr, 4},
+ [PG_JOHAB] = {PG_WCHAR_NONE, 0, 0, pg_johab_mblen, pg_johab_dsplen,
pg_johab_verifychar, pg_johab_verifystr, 3},
+ [PG_SHIFT_JIS_2004] = {PG_WCHAR_NONE, 0, 0, pg_sjis_mblen,
pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2},
};
+/*
+ * Returns the encoding scheme used for pg_wchar values converted from the
+ * given encoding.
+ */
+PgWcharEncodingScheme
+pg_wchar_encoding_scheme(int encoding)
+{
+ return pg_wchar_table[encoding].encoding_scheme;
+}
+
/*
* Returns the byte length of a multibyte character.
*
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index 4b4a9974b75..5db00cebcef 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -27,6 +27,55 @@
*/
typedef unsigned int pg_wchar;
+/*
+ * Encoding schemes that pg_wchar might hold.
+ *
+ * Each multi-byte encoding has a corresponding wide encoding scheme,
+ * conceptually like wchar_t in C. Conversions to and from char should be
+ * performed by pg_mb2wchar*() and pg_wchar2mb*() functions. In all encoding
+ * schemes, values 0-127 represent ASCII. For higher values, see below.
+ *
+ * Locale providers make use of the known properties of these encoding schemes
+ * to implement ctype/wctype functionality.
+ */
+typedef enum PgWcharEncodingScheme
+{
+ /*
+ * 8-bit characters in the database encoding, zero-extended to pg_wchar
+ * width.
+ */
+ PG_WCHAR_CHAR,
+
+ /*
+ * 32-bit Unicode code points. PostgreSQL assumes that all libc
+ * implementations use UTF-32 or at least UTF-16 if wchar_t is narrow for
+ * locales that use UTF-8 encoding for char strings, so it has a special
+ * case for this.
+ */
+ PG_WCHAR_UTF32,
+
+ /*
+ * For multi-byte database encodings other than UTF-8, the encoding is
+ * unspecified outside the ASCII range.
+ */
+ PG_WCHAR_CUSTOM,
+
+ /*
+ * This scheme is not currently used by any of the supported encodings,
+ * but is included here for completeness, providing terminology. In a few
+ * places, pg_wchar is used to transport wchar_t in whatever unknown
+ * encoding libc uses for the database encoding. This is second from last
+ * so that lookup arrays don't have to waste an entry.
+ */
+ PG_WCHAR_SYSTEM_WCHAR_T,
+
+ /*
+ * pg_wchar conversion is not available for the database encoding. This
+ * is last so that lookup arrays don't have to waste an entry.
+ */
+ PG_WCHAR_NONE,
+} PgWcharEncodingScheme;
+
/*
* Maximum byte length of multibyte characters in any backend encoding
*/
@@ -391,6 +440,7 @@ typedef int (*mbstr_verifier) (const unsigned char *mbstr,
int len);
typedef struct
{
+ PgWcharEncodingScheme encoding_scheme; /* pg_wchar representation */
mb2wchar_with_len_converter mb2wchar_with_len; /* convert a multibyte
* string to a wchar */
wchar2mb_with_len_converter wchar2mb_with_len; /* convert a wchar
string
@@ -713,6 +763,7 @@ extern int SetClientEncoding(int encoding);
extern void InitializeClientEncoding(void);
extern int pg_get_client_encoding(void);
extern const char *pg_get_client_encoding_name(void);
+extern PgWcharEncodingScheme pg_wchar_encoding_scheme(int encoding);
extern void SetDatabaseEncoding(int encoding);
extern int GetDatabaseEncoding(void);
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index ac2da4c98cf..d6973751f12 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -2264,6 +2264,7 @@ PgStat_WalCounters
PgStat_WalStats
PgXmlErrorContext
PgXmlStrictness
+PgWcharEncodingScheme
Pg_abi_values
Pg_finfo_record
Pg_magic_struct
--
2.50.1 (Apple Git-155)
From 304dc61ed765f1a57a9b9f9cf32a6342f0b15e6a Mon Sep 17 00:00:00 2001
From: Thomas Munro <[email protected]>
Date: Wed, 29 Oct 2025 19:25:40 +1300
Subject: [PATCH 3/8] Fix corrupted ctype.h handling for non-ASCII in EUC
encodings.
Previously we treated PG_WCHAR_CUSTOM encodings the same way as
PG_WCHAR_CHAR, by passing the lower 8 bits of pg_wchar to ctype.h
functions. That was OK for 7-bit ASCII, but arbitrary junk for any
higher values. New functions are provided that just return false for
non-ASCII values.
A more correct fix would convert to libc wchar_t format and use
wctype.h, but that isn't done here.
---
src/backend/utils/adt/pg_locale_libc.c | 103 ++++++++++++++++---------
1 file changed, 68 insertions(+), 35 deletions(-)
diff --git a/src/backend/utils/adt/pg_locale_libc.c
b/src/backend/utils/adt/pg_locale_libc.c
index 1892ed3c5ce..1d6e8be3a82 100644
--- a/src/backend/utils/adt/pg_locale_libc.c
+++ b/src/backend/utils/adt/pg_locale_libc.c
@@ -33,6 +33,11 @@
#include <shlwapi.h>
#endif
+#if defined(WIN32)
+#define isxdigit_l _isxdigit_l
+#define iswxdigit_l _iswxdigit_l
+#endif
+
/*
* For the libc provider, to provide as much functionality as possible on a
* variety of platforms without going so far as to implement everything from
@@ -50,20 +55,20 @@
* as the wchar_t representation of Unicode. On some platforms
* wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF.
*
- * 3. PG_WCHAR_CHAR and PG_WCHAR_CUSTOM encoding schemes:
+ * 3. PG_WCHAR_CUSTOM encoding scheme:
+ *
+ * When working with the EUC_* family of encodings (and technically MULE
+ * internal too, but no libc systems are known to support that encoding), we
+ * convert to wchar_t on the fly and use the <wctype.h> functions, except in
+ * the ASCII range where we use the <ctype.h> functions.
+ *
+ * 4. PG_WCHAR_CHAR encoding scheme:
*
* In all other encodings, we use the <ctype.h> functions for pg_wchar
- * values up to 255, and punt for values above that. This is 100% correct
- * only in single-byte encodings such as LATINn (PG_WCHAR_CHAR). However,
- * non-Unicode multibyte encodings (PG_WCHAR_CUSTOM) are all Far Eastern
- * character sets for which the properties being tested here aren't very
- * relevant for higher code values anyway. The difficulty with using the
- * <wctype.h> functions with non-Unicode multibyte encodings is that we can
- * have no certainty that the platform's wchar_t representation matches what we
- * do in pg_wchar conversions. (MULE is also declared PG_WCHAR_CUSTOM but is
- * not available as a multi-byte encoding in any known libc.)
+ * values up to 255. This is 100% correct since the values originated as char
+ * and were just widened to pg_wchar without change.
*
- * As a special case, in the "default" collation, (2) and (3) force ASCII
+ * As a special case, in the "default" collation, (2), (3) and (4) force ASCII
* letters to follow ASCII upcase/downcase rules, while in a non-default
* collation we just let the library functions do what they will. The case
* where this matters is treatment of I/i in Turkish, and the behavior is
@@ -125,6 +130,30 @@ static size_t strupper_libc_mb(char *dest, size_t destsize,
const char *src,
ssize_t srclen,
pg_locale_t locale);
+/*
+ * Generate a function that passes single-byte characters directly to <ctype.h>
+ * functions, but only if they are in the ASCII range. This is suitable for
+ * PG_WCHAR_CUSTOM pg_wchar encoding (used with EUC_* encodings). Values
+ * outside ASCII have an unknown encoding, so we just return false.
+ */
+#define DEFINE_WC_CTYPE_LIBC_ASCII(ctype) \
+static bool \
+wc_is##ctype##_libc_ascii(pg_wchar wc, pg_locale_t locale) \
+{ \
+ return wc < 128 && is##ctype##_l((unsigned char) wc, locale->lt); \
+}
+
+DEFINE_WC_CTYPE_LIBC_ASCII(digit);
+DEFINE_WC_CTYPE_LIBC_ASCII(alpha);
+DEFINE_WC_CTYPE_LIBC_ASCII(alnum);
+DEFINE_WC_CTYPE_LIBC_ASCII(upper);
+DEFINE_WC_CTYPE_LIBC_ASCII(lower);
+DEFINE_WC_CTYPE_LIBC_ASCII(graph);
+DEFINE_WC_CTYPE_LIBC_ASCII(print);
+DEFINE_WC_CTYPE_LIBC_ASCII(punct);
+DEFINE_WC_CTYPE_LIBC_ASCII(space);
+DEFINE_WC_CTYPE_LIBC_ASCII(xdigit);
+
static bool
wc_isdigit_libc_sb(pg_wchar wc, pg_locale_t locale)
{
@@ -182,11 +211,7 @@ wc_isspace_libc_sb(pg_wchar wc, pg_locale_t locale)
static bool
wc_isxdigit_libc_sb(pg_wchar wc, pg_locale_t locale)
{
-#ifndef WIN32
return isxdigit_l((unsigned char) wc, locale->lt);
-#else
- return _isxdigit_l((unsigned char) wc, locale->lt);
-#endif
}
static bool
@@ -255,11 +280,7 @@ wc_isspace_libc_mb(pg_wchar wc, pg_locale_t locale)
static bool
wc_isxdigit_libc_mb(pg_wchar wc, pg_locale_t locale)
{
-#ifndef WIN32
return iswxdigit_l((wint_t) wc, locale->lt);
-#else
- return _iswxdigit_l((wint_t) wc, locale->lt);
-#endif
}
static char
@@ -280,6 +301,12 @@ char_is_cased_libc(char ch, pg_locale_t locale)
return isalpha_l((unsigned char) ch, locale->lt);
}
+static pg_wchar
+toupper_libc_ascii(pg_wchar wc, pg_locale_t locale)
+{
+ return wc < 128 ? toupper_l((unsigned char) wc, locale->lt) : wc;
+}
+
static pg_wchar
toupper_libc_sb(pg_wchar wc, pg_locale_t locale)
{
@@ -308,6 +335,12 @@ toupper_libc_mb(pg_wchar wc, pg_locale_t locale)
return wc;
}
+static pg_wchar
+tolower_libc_ascii(pg_wchar wc, pg_locale_t locale)
+{
+ return wc < 128 ? tolower_l((unsigned char) wc, locale->lt) : wc;
+}
+
static pg_wchar
tolower_libc_sb(pg_wchar wc, pg_locale_t locale)
{
@@ -379,30 +412,30 @@ static const struct ctype_methods ctype_methods_libc[] = {
/*
* Custom pg_wchar format converted from non-UTF8 multibyte encodings
use
- * multibyte semantics for case mapping, but single-byte semantics for
- * pattern matching.
+ * multibyte semantics for case mapping, but ASCII-only semantics for
+ * pattern matching, since libc doesn't understand custom encoding of
+ * higher values.
*
- * XXX Therefore this gives incorrect results for pattern matching
outside
- * the ASCII range. Could be fixed.
+ * XXX We could convert to wchar_t to fix that, at considerable cost.
*/
[PG_WCHAR_CUSTOM] = {
.strlower = strlower_libc_mb,
.strtitle = strtitle_libc_mb,
.strupper = strupper_libc_mb,
- .wc_isdigit = wc_isdigit_libc_sb,
- .wc_isalpha = wc_isalpha_libc_sb,
- .wc_isalnum = wc_isalnum_libc_sb,
- .wc_isupper = wc_isupper_libc_sb,
- .wc_islower = wc_islower_libc_sb,
- .wc_isgraph = wc_isgraph_libc_sb,
- .wc_isprint = wc_isprint_libc_sb,
- .wc_ispunct = wc_ispunct_libc_sb,
- .wc_isspace = wc_isspace_libc_sb,
- .wc_isxdigit = wc_isxdigit_libc_sb,
+ .wc_isdigit = wc_isdigit_libc_ascii,
+ .wc_isalpha = wc_isalpha_libc_ascii,
+ .wc_isalnum = wc_isalnum_libc_ascii,
+ .wc_isupper = wc_isupper_libc_ascii,
+ .wc_islower = wc_islower_libc_ascii,
+ .wc_isgraph = wc_isgraph_libc_ascii,
+ .wc_isprint = wc_isprint_libc_ascii,
+ .wc_ispunct = wc_ispunct_libc_ascii,
+ .wc_isspace = wc_isspace_libc_ascii,
+ .wc_isxdigit = wc_isxdigit_libc_ascii,
.char_is_cased = char_is_cased_libc,
.char_tolower = char_tolower_libc,
- .wc_toupper = toupper_libc_sb,
- .wc_tolower = tolower_libc_sb,
+ .wc_toupper = toupper_libc_ascii,
+ .wc_tolower = tolower_libc_ascii,
.max_chr = UCHAR_MAX,
},
};
--
2.50.1 (Apple Git-155)
From 39ebd5e689a458508b2762b84beb197f7dc6fd92 Mon Sep 17 00:00:00 2001
From: Thomas Munro <[email protected]>
Date: Wed, 29 Oct 2025 17:37:03 +1300
Subject: [PATCH 4/8] Support wctype.h classification for EUC encodings.
Instead of giving up on non-ASCII characters, convert pg_wchar values
using the PG_WCHAR_CUSTOM encoding scheme to wchar_t so that we can use
wctype.h functions on any character.
XXX This replaces the _ascii() version from the previous patch, to
experiment with a different approach
XXX Is this too expensive?
---
src/backend/utils/adt/pg_locale_libc.c | 148 ++++++++++++++++++-------
1 file changed, 109 insertions(+), 39 deletions(-)
diff --git a/src/backend/utils/adt/pg_locale_libc.c
b/src/backend/utils/adt/pg_locale_libc.c
index 1d6e8be3a82..e6724880f1b 100644
--- a/src/backend/utils/adt/pg_locale_libc.c
+++ b/src/backend/utils/adt/pg_locale_libc.c
@@ -130,29 +130,81 @@ static size_t strupper_libc_mb(char *dest, size_t
destsize,
const char *src,
ssize_t srclen,
pg_locale_t locale);
+static wint_t
+pg_wchar_to_wchar_t(pg_wchar wc, pg_locale_t locale)
+{
+ wchar_t out[MAX_CONVERSION_GROWTH + 1];
+ char mb[MAX_CONVERSION_GROWTH + 1];
+ size_t mb_len;
+ size_t wchar_t_len;
+
+ /* pg_wchar -> multibyte using PostgreSQL pg_wchar encoding */
+ mb_len = pg_wchar2mb_with_len(&wc, mb, 1);
+
+ /* multibyte -> wchar_t using libc */
+ wchar_t_len = char2wchar(out, lengthof(out), mb, mb_len, locale->lt);
+
+ /* reject surrogates and combining characters */
+ if (wchar_t_len != 1)
+ return WEOF;
+
+ return out[0];
+}
+
+static pg_wchar
+wchar_t_to_pg_wchar(wchar_t wc, pg_locale_t locale)
+{
+ wchar_t in[] = {wc, 0};
+ pg_wchar out[MAX_CONVERSION_GROWTH + 1];
+ char mb[MAX_CONVERSION_GROWTH + 1];
+ size_t mb_len;
+ size_t pg_wchar_len;
+
+ /* wchar_t -> multibyte using libc */
+ mb_len = wchar2char(mb, in, lengthof(mb), locale->lt);
+
+ /* multibyte -> pg_wchar using PostgreSQL pg_wchar encoding */
+ pg_wchar_len = pg_mb2wchar_with_len(mb, out, mb_len);
+
+ /* should be exactly one character */
+ if (pg_wchar_len != 1)
+ elog(ERROR, "unexpected number of output characters: %zu",
pg_wchar_len);
+
+ return out[0];
+}
+
/*
- * Generate a function that passes single-byte characters directly to <ctype.h>
- * functions, but only if they are in the ASCII range. This is suitable for
- * PG_WCHAR_CUSTOM pg_wchar encoding (used with EUC_* encodings). Values
- * outside ASCII have an unknown encoding, so we just return false.
+ * Generate a function that handles the ASCII range with <ctype.h>, and
+ * otherwise converts pg_wchar to libc's wchar_t to be able to use <wctype.h>
+ * routines. This extra conversion is only required when using encodings that
+ * declare a PG_WCHAR_CUSTOM encoding scheme (EUC encodings for CJK).
+ *
+ * XXX If wchar.c had a function that could identify pg_wchar values that
+ * definitely won't return true (eg the big kanji/hanzi ranges), then we could
+ * skip the expensive conversion but still give correct answers for other
+ * characters.
*/
-#define DEFINE_WC_CTYPE_LIBC_ASCII(ctype) \
+#define DEFINE_WC_CTYPE_LIBC_CUSTOM(ctype) \
static bool \
-wc_is##ctype##_libc_ascii(pg_wchar wc, pg_locale_t locale) \
+wc_is##ctype##_libc_custom(pg_wchar wc, pg_locale_t locale) \
{ \
- return is##ctype##_l((unsigned char) wc, locale->lt); \
+ wint_t wint; \
+ if (wc < 128) \
+ return is##ctype##_l(wc, locale->lt); \
+ wint = pg_wchar_to_wchar_t(wc, locale); \
+ return wint != WEOF && isw##ctype##_l(wint, locale->lt); \
}
-DEFINE_WC_CTYPE_LIBC_ASCII(digit);
-DEFINE_WC_CTYPE_LIBC_ASCII(alpha);
-DEFINE_WC_CTYPE_LIBC_ASCII(alnum);
-DEFINE_WC_CTYPE_LIBC_ASCII(upper);
-DEFINE_WC_CTYPE_LIBC_ASCII(lower);
-DEFINE_WC_CTYPE_LIBC_ASCII(graph);
-DEFINE_WC_CTYPE_LIBC_ASCII(print);
-DEFINE_WC_CTYPE_LIBC_ASCII(punct);
-DEFINE_WC_CTYPE_LIBC_ASCII(space);
-DEFINE_WC_CTYPE_LIBC_ASCII(xdigit);
+DEFINE_WC_CTYPE_LIBC_CUSTOM(digit);
+DEFINE_WC_CTYPE_LIBC_CUSTOM(alpha);
+DEFINE_WC_CTYPE_LIBC_CUSTOM(alnum);
+DEFINE_WC_CTYPE_LIBC_CUSTOM(upper);
+DEFINE_WC_CTYPE_LIBC_CUSTOM(lower);
+DEFINE_WC_CTYPE_LIBC_CUSTOM(graph);
+DEFINE_WC_CTYPE_LIBC_CUSTOM(print);
+DEFINE_WC_CTYPE_LIBC_CUSTOM(punct);
+DEFINE_WC_CTYPE_LIBC_CUSTOM(space);
+DEFINE_WC_CTYPE_LIBC_CUSTOM(xdigit);
static bool
wc_isdigit_libc_sb(pg_wchar wc, pg_locale_t locale)
@@ -302,9 +354,19 @@ char_is_cased_libc(char ch, pg_locale_t locale)
}
static pg_wchar
-toupper_libc_ascii(pg_wchar wc, pg_locale_t locale)
+toupper_libc_custom(pg_wchar wc, pg_locale_t locale)
{
- return wc < 128 ? toupper_l((unsigned char) wc, locale->lt) : wc;
+ wint_t wint;
+
+ /* force C behavior for ASCII characters, per comments above */
+ if (locale->is_default && wc <= (pg_wchar) 127)
+ return pg_ascii_toupper((unsigned char) wc);
+ if (wc <= 127)
+ return towupper_l((wint_t) wc, locale->lt);
+ wint = pg_wchar_to_wchar_t(wc, locale);
+ if ((wint = pg_wchar_to_wchar_t(wc, locale)) != WEOF)
+ return wchar_t_to_pg_wchar(towupper_l(wint, locale->lt),
locale);
+ return wc;
}
static pg_wchar
@@ -336,9 +398,19 @@ toupper_libc_mb(pg_wchar wc, pg_locale_t locale)
}
static pg_wchar
-tolower_libc_ascii(pg_wchar wc, pg_locale_t locale)
+tolower_libc_custom(pg_wchar wc, pg_locale_t locale)
{
- return wc < 128 ? tolower_l((unsigned char) wc, locale->lt) : wc;
+ wint_t wint;
+
+ /* force C behavior for ASCII characters, per comments above */
+ if (locale->is_default && wc <= (pg_wchar) 127)
+ return pg_ascii_tolower((unsigned char) wc);
+ if (wc <= 127)
+ return towlower_l((wint_t) wc, locale->lt);
+ wint = pg_wchar_to_wchar_t(wc, locale);
+ if ((wint = pg_wchar_to_wchar_t(wc, locale)) != WEOF)
+ return wchar_t_to_pg_wchar(towlower_l(wint, locale->lt),
locale);
+ return wc;
}
static pg_wchar
@@ -412,31 +484,29 @@ static const struct ctype_methods ctype_methods_libc[] = {
/*
* Custom pg_wchar format converted from non-UTF8 multibyte encodings
use
- * multibyte semantics for case mapping, but ASCII-only semantics for
- * pattern matching, since libc doesn't understand custom encoding of
- * higher values.
- *
- * XXX We could convert to wchar_t to fix that, at considerable cost.
+ * multibyte semantics for case mapping, and conversions to libc's
wchar_t
+ * except for the ASCII range that can be handled directly by ctype
+ * functions.
*/
[PG_WCHAR_CUSTOM] = {
.strlower = strlower_libc_mb,
.strtitle = strtitle_libc_mb,
.strupper = strupper_libc_mb,
- .wc_isdigit = wc_isdigit_libc_ascii,
- .wc_isalpha = wc_isalpha_libc_ascii,
- .wc_isalnum = wc_isalnum_libc_ascii,
- .wc_isupper = wc_isupper_libc_ascii,
- .wc_islower = wc_islower_libc_ascii,
- .wc_isgraph = wc_isgraph_libc_ascii,
- .wc_isprint = wc_isprint_libc_ascii,
- .wc_ispunct = wc_ispunct_libc_ascii,
- .wc_isspace = wc_isspace_libc_ascii,
- .wc_isxdigit = wc_isxdigit_libc_ascii,
+ .wc_isdigit = wc_isdigit_libc_custom,
+ .wc_isalpha = wc_isalpha_libc_custom,
+ .wc_isalnum = wc_isalnum_libc_custom,
+ .wc_isupper = wc_isupper_libc_custom,
+ .wc_islower = wc_islower_libc_custom,
+ .wc_isgraph = wc_isgraph_libc_custom,
+ .wc_isprint = wc_isprint_libc_custom,
+ .wc_ispunct = wc_ispunct_libc_custom,
+ .wc_isspace = wc_isspace_libc_custom,
+ .wc_isxdigit = wc_isxdigit_libc_custom,
.char_is_cased = char_is_cased_libc,
.char_tolower = char_tolower_libc,
- .wc_toupper = toupper_libc_ascii,
- .wc_tolower = tolower_libc_ascii,
- .max_chr = UCHAR_MAX,
+ .wc_toupper = toupper_libc_custom,
+ .wc_tolower = tolower_libc_custom,
+ .max_chr = 127, /* values outside ASCII may be illegal
to probe */
},
};
--
2.50.1 (Apple Git-155)
From 50dc5a57bcd31be893071da62c4ba6b0537695cc Mon Sep 17 00:00:00 2001
From: Thomas Munro <[email protected]>
Date: Thu, 30 Oct 2025 00:52:51 +1300
Subject: [PATCH 5/8] XXX work around regc_pg_locale.c's probing logic
---
src/backend/regex/regc_pg_locale.c | 10 +++++++++-
1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/src/backend/regex/regc_pg_locale.c
b/src/backend/regex/regc_pg_locale.c
index e0c892db713..fdc3fac0bbe 100644
--- a/src/backend/regex/regc_pg_locale.c
+++ b/src/backend/regex/regc_pg_locale.c
@@ -352,7 +352,15 @@ regc_ctype_get_cache(regc_wc_probefunc probefunc, int
cclasscode)
pg_regex_locale->ctype->max_chr <= MAX_SIMPLE_CHR)
{
max_chr = pg_regex_locale->ctype->max_chr;
- pcc->cv.cclasscode = -1;
+
+ /*
+ * XXX TODO: don't turn off locales just because
pg_locale_libc.c
+ * told us it's not cool to probe arbitrary pg_wchar
values over
+ * 127! Without this, re-encoding fails at pg_wchar
0x80, which
+ * can't be converted back to mb (the EUC pg_wchar
encoding has
+ * holes in it)
+ */
+ //pcc->cv.cclasscode = -1;
}
else
max_chr = (pg_wchar) MAX_SIMPLE_CHR;
--
2.50.1 (Apple Git-155)
From 553e2d5a2923d582a0f1a3ef72033b40149c08c7 Mon Sep 17 00:00:00 2001
From: Thomas Munro <[email protected]>
Date: Wed, 29 Oct 2025 16:01:32 +1300
Subject: [PATCH 6/8] Improve naming of libc collation functions.
The functions that expect pg_wchar to hold a UTF-32-encoded code point
because the encoding scheme is PG_WCHAR_UTF32 had names ending _mb, but
_utf32 makes more sense. The remaining _mb functions really do work
with multibyte input.
It might be tempting to rename the _sb functions to _char to match
PG_WCHAR_CHAR, but since the _sb and _mb functions both work with chars (one
or variable), that would probably just be more confusing.
---
src/backend/utils/adt/pg_locale_libc.c | 48 +++++++++++++-------------
1 file changed, 24 insertions(+), 24 deletions(-)
diff --git a/src/backend/utils/adt/pg_locale_libc.c
b/src/backend/utils/adt/pg_locale_libc.c
index e6724880f1b..b33897c683e 100644
--- a/src/backend/utils/adt/pg_locale_libc.c
+++ b/src/backend/utils/adt/pg_locale_libc.c
@@ -267,70 +267,70 @@ wc_isxdigit_libc_sb(pg_wchar wc, pg_locale_t locale)
}
static bool
-wc_isdigit_libc_mb(pg_wchar wc, pg_locale_t locale)
+wc_isdigit_libc_utf32(pg_wchar wc, pg_locale_t locale)
{
return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
iswdigit_l((wint_t) wc, locale->lt);
}
static bool
-wc_isalpha_libc_mb(pg_wchar wc, pg_locale_t locale)
+wc_isalpha_libc_utf32(pg_wchar wc, pg_locale_t locale)
{
return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
iswalpha_l((wint_t) wc, locale->lt);
}
static bool
-wc_isalnum_libc_mb(pg_wchar wc, pg_locale_t locale)
+wc_isalnum_libc_utf32(pg_wchar wc, pg_locale_t locale)
{
return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
iswalnum_l((wint_t) wc, locale->lt);
}
static bool
-wc_isupper_libc_mb(pg_wchar wc, pg_locale_t locale)
+wc_isupper_libc_utf32(pg_wchar wc, pg_locale_t locale)
{
return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
iswupper_l((wint_t) wc, locale->lt);
}
static bool
-wc_islower_libc_mb(pg_wchar wc, pg_locale_t locale)
+wc_islower_libc_utf32(pg_wchar wc, pg_locale_t locale)
{
return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
iswlower_l((wint_t) wc, locale->lt);
}
static bool
-wc_isgraph_libc_mb(pg_wchar wc, pg_locale_t locale)
+wc_isgraph_libc_utf32(pg_wchar wc, pg_locale_t locale)
{
return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
iswgraph_l((wint_t) wc, locale->lt);
}
static bool
-wc_isprint_libc_mb(pg_wchar wc, pg_locale_t locale)
+wc_isprint_libc_utf32(pg_wchar wc, pg_locale_t locale)
{
return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
iswprint_l((wint_t) wc, locale->lt);
}
static bool
-wc_ispunct_libc_mb(pg_wchar wc, pg_locale_t locale)
+wc_ispunct_libc_utf32(pg_wchar wc, pg_locale_t locale)
{
return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
iswpunct_l((wint_t) wc, locale->lt);
}
static bool
-wc_isspace_libc_mb(pg_wchar wc, pg_locale_t locale)
+wc_isspace_libc_utf32(pg_wchar wc, pg_locale_t locale)
{
return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
iswspace_l((wint_t) wc, locale->lt);
}
static bool
-wc_isxdigit_libc_mb(pg_wchar wc, pg_locale_t locale)
+wc_isxdigit_libc_utf32(pg_wchar wc, pg_locale_t locale)
{
return iswxdigit_l((wint_t) wc, locale->lt);
}
@@ -384,7 +384,7 @@ toupper_libc_sb(pg_wchar wc, pg_locale_t locale)
}
static pg_wchar
-toupper_libc_mb(pg_wchar wc, pg_locale_t locale)
+toupper_libc_utf32(pg_wchar wc, pg_locale_t locale)
{
Assert(GetDatabaseEncoding() == PG_UTF8);
@@ -428,7 +428,7 @@ tolower_libc_sb(pg_wchar wc, pg_locale_t locale)
}
static pg_wchar
-tolower_libc_mb(pg_wchar wc, pg_locale_t locale)
+tolower_libc_utf32(pg_wchar wc, pg_locale_t locale)
{
Assert(GetDatabaseEncoding() == PG_UTF8);
@@ -466,20 +466,20 @@ static const struct ctype_methods ctype_methods_libc[] = {
.strlower = strlower_libc_mb,
.strtitle = strtitle_libc_mb,
.strupper = strupper_libc_mb,
- .wc_isdigit = wc_isdigit_libc_mb,
- .wc_isalpha = wc_isalpha_libc_mb,
- .wc_isalnum = wc_isalnum_libc_mb,
- .wc_isupper = wc_isupper_libc_mb,
- .wc_islower = wc_islower_libc_mb,
- .wc_isgraph = wc_isgraph_libc_mb,
- .wc_isprint = wc_isprint_libc_mb,
- .wc_ispunct = wc_ispunct_libc_mb,
- .wc_isspace = wc_isspace_libc_mb,
- .wc_isxdigit = wc_isxdigit_libc_mb,
+ .wc_isdigit = wc_isdigit_libc_utf32,
+ .wc_isalpha = wc_isalpha_libc_utf32,
+ .wc_isalnum = wc_isalnum_libc_utf32,
+ .wc_isupper = wc_isupper_libc_utf32,
+ .wc_islower = wc_islower_libc_utf32,
+ .wc_isgraph = wc_isgraph_libc_utf32,
+ .wc_isprint = wc_isprint_libc_utf32,
+ .wc_ispunct = wc_ispunct_libc_utf32,
+ .wc_isspace = wc_isspace_libc_utf32,
+ .wc_isxdigit = wc_isxdigit_libc_utf32,
.char_is_cased = char_is_cased_libc,
.char_tolower = char_tolower_libc,
- .wc_toupper = toupper_libc_mb,
- .wc_tolower = tolower_libc_mb,
+ .wc_toupper = toupper_libc_utf32,
+ .wc_tolower = tolower_libc_utf32,
},
/*
--
2.50.1 (Apple Git-155)
From b2ddd005740f73c6d4a9e2441616926fcea94684 Mon Sep 17 00:00:00 2001
From: Thomas Munro <[email protected]>
Date: Wed, 29 Oct 2025 17:54:06 +1300
Subject: [PATCH 7/8] Use compact notation for isXXX_l() wrappers.
Instead of loads of repeating functions for the PG_WCHAR_CHAR and
PG_WCHAR_UTF32 handlers, make a macro to avoid typos, as was already
done for the new PG_WCHAR_CUSTOM handlers.
---
src/backend/utils/adt/pg_locale_libc.c | 166 +++++++------------------
1 file changed, 43 insertions(+), 123 deletions(-)
diff --git a/src/backend/utils/adt/pg_locale_libc.c
b/src/backend/utils/adt/pg_locale_libc.c
index b33897c683e..fc758e2607c 100644
--- a/src/backend/utils/adt/pg_locale_libc.c
+++ b/src/backend/utils/adt/pg_locale_libc.c
@@ -206,134 +206,54 @@ DEFINE_WC_CTYPE_LIBC_CUSTOM(punct);
DEFINE_WC_CTYPE_LIBC_CUSTOM(space);
DEFINE_WC_CTYPE_LIBC_CUSTOM(xdigit);
-static bool
-wc_isdigit_libc_sb(pg_wchar wc, pg_locale_t locale)
-{
- return isdigit_l((unsigned char) wc, locale->lt);
-}
-
-static bool
-wc_isalpha_libc_sb(pg_wchar wc, pg_locale_t locale)
-{
- return isalpha_l((unsigned char) wc, locale->lt);
-}
-
-static bool
-wc_isalnum_libc_sb(pg_wchar wc, pg_locale_t locale)
-{
- return isalnum_l((unsigned char) wc, locale->lt);
-}
-
-static bool
-wc_isupper_libc_sb(pg_wchar wc, pg_locale_t locale)
-{
- return isupper_l((unsigned char) wc, locale->lt);
-}
-
-static bool
-wc_islower_libc_sb(pg_wchar wc, pg_locale_t locale)
-{
- return islower_l((unsigned char) wc, locale->lt);
-}
-
-static bool
-wc_isgraph_libc_sb(pg_wchar wc, pg_locale_t locale)
-{
- return isgraph_l((unsigned char) wc, locale->lt);
-}
-
-static bool
-wc_isprint_libc_sb(pg_wchar wc, pg_locale_t locale)
-{
- return isprint_l((unsigned char) wc, locale->lt);
-}
-
-static bool
-wc_ispunct_libc_sb(pg_wchar wc, pg_locale_t locale)
-{
- return ispunct_l((unsigned char) wc, locale->lt);
-}
-
-static bool
-wc_isspace_libc_sb(pg_wchar wc, pg_locale_t locale)
-{
- return isspace_l((unsigned char) wc, locale->lt);
-}
-
-static bool
-wc_isxdigit_libc_sb(pg_wchar wc, pg_locale_t locale)
-{
- return isxdigit_l((unsigned char) wc, locale->lt);
-}
-
-static bool
-wc_isdigit_libc_utf32(pg_wchar wc, pg_locale_t locale)
-{
- return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
- iswdigit_l((wint_t) wc, locale->lt);
-}
-
-static bool
-wc_isalpha_libc_utf32(pg_wchar wc, pg_locale_t locale)
-{
- return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
- iswalpha_l((wint_t) wc, locale->lt);
-}
-
-static bool
-wc_isalnum_libc_utf32(pg_wchar wc, pg_locale_t locale)
-{
- return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
- iswalnum_l((wint_t) wc, locale->lt);
-}
-
-static bool
-wc_isupper_libc_utf32(pg_wchar wc, pg_locale_t locale)
-{
- return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
- iswupper_l((wint_t) wc, locale->lt);
-}
-
-static bool
-wc_islower_libc_utf32(pg_wchar wc, pg_locale_t locale)
-{
- return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
- iswlower_l((wint_t) wc, locale->lt);
-}
-
-static bool
-wc_isgraph_libc_utf32(pg_wchar wc, pg_locale_t locale)
-{
- return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
- iswgraph_l((wint_t) wc, locale->lt);
-}
-
-static bool
-wc_isprint_libc_utf32(pg_wchar wc, pg_locale_t locale)
-{
- return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
- iswprint_l((wint_t) wc, locale->lt);
+/*
+ * Generate a function that passes single-byte characters directly to <ctype.h>
+ * functions. This is suitable for PG_WCHAR_CHAR encodings, where pg_wchar
+ * holds a single byte.
+ */
+#define DEFINE_WC_CTYPE_LIBC_SB(ctype) \
+static bool \
+wc_is##ctype##_libc_sb(pg_wchar wc, pg_locale_t locale) \
+{ \
+ return is##ctype##_l((unsigned char) wc, locale->lt); \
}
-static bool
-wc_ispunct_libc_utf32(pg_wchar wc, pg_locale_t locale)
-{
- return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
- iswpunct_l((wint_t) wc, locale->lt);
-}
+DEFINE_WC_CTYPE_LIBC_SB(digit);
+DEFINE_WC_CTYPE_LIBC_SB(alpha);
+DEFINE_WC_CTYPE_LIBC_SB(alnum);
+DEFINE_WC_CTYPE_LIBC_SB(upper);
+DEFINE_WC_CTYPE_LIBC_SB(lower);
+DEFINE_WC_CTYPE_LIBC_SB(graph);
+DEFINE_WC_CTYPE_LIBC_SB(print);
+DEFINE_WC_CTYPE_LIBC_SB(punct);
+DEFINE_WC_CTYPE_LIBC_SB(space);
+DEFINE_WC_CTYPE_LIBC_SB(xdigit);
-static bool
-wc_isspace_libc_utf32(pg_wchar wc, pg_locale_t locale)
-{
- return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
- iswspace_l((wint_t) wc, locale->lt);
+/*
+ * Generate a function that passes UTF-32 characters directly to <wctype.h>
+ * functions. This is suitable for PG_WCHAR_UTF32 encodings, with the
+ * assumption that any libc locale that uses UTF-8 as its char encoding must
+ * use UTF-32 or UTF-16 for its wchar_t encoding. For the UTF-16 case, just
+ * return false for codepoints outside the BMP.
+ */
+#define DEFINE_WC_CTYPE_LIBC_UTF32(ctype) \
+static bool \
+wc_is##ctype##_libc_utf32(pg_wchar wc, pg_locale_t locale) \
+{ \
+ return (sizeof(wchar_t) >= 4 || wc <= 0xffff) && \
+ isw##ctype##_l((wint_t) wc, locale->lt); \
}
-static bool
-wc_isxdigit_libc_utf32(pg_wchar wc, pg_locale_t locale)
-{
- return iswxdigit_l((wint_t) wc, locale->lt);
-}
+DEFINE_WC_CTYPE_LIBC_UTF32(digit);
+DEFINE_WC_CTYPE_LIBC_UTF32(alpha);
+DEFINE_WC_CTYPE_LIBC_UTF32(alnum);
+DEFINE_WC_CTYPE_LIBC_UTF32(upper);
+DEFINE_WC_CTYPE_LIBC_UTF32(lower);
+DEFINE_WC_CTYPE_LIBC_UTF32(graph);
+DEFINE_WC_CTYPE_LIBC_UTF32(print);
+DEFINE_WC_CTYPE_LIBC_UTF32(punct);
+DEFINE_WC_CTYPE_LIBC_UTF32(space);
+DEFINE_WC_CTYPE_LIBC_UTF32(xdigit);
static char
char_tolower_libc(unsigned char ch, pg_locale_t locale)
--
2.50.1 (Apple Git-155)
From c70c67492e6c1a9fffd97037abe49251e408cff8 Mon Sep 17 00:00:00 2001
From: Thomas Munro <[email protected]>
Date: Wed, 29 Oct 2025 23:23:42 +1300
Subject: [PATCH 8/8] Use compact notation for toupper/tolower wrappers.
Extend the macro technique used for generating isalpha etc also to
toupper/tolower functions, removing some duplication.
---
src/backend/utils/adt/pg_locale_libc.c | 118 ++++++++-----------------
1 file changed, 39 insertions(+), 79 deletions(-)
diff --git a/src/backend/utils/adt/pg_locale_libc.c
b/src/backend/utils/adt/pg_locale_libc.c
index fc758e2607c..7591fb812ac 100644
--- a/src/backend/utils/adt/pg_locale_libc.c
+++ b/src/backend/utils/adt/pg_locale_libc.c
@@ -273,93 +273,53 @@ char_is_cased_libc(char ch, pg_locale_t locale)
return isalpha_l((unsigned char) ch, locale->lt);
}
-static pg_wchar
-toupper_libc_custom(pg_wchar wc, pg_locale_t locale)
-{
- wint_t wint;
-
- /* force C behavior for ASCII characters, per comments above */
- if (locale->is_default && wc <= (pg_wchar) 127)
- return pg_ascii_toupper((unsigned char) wc);
- if (wc <= 127)
- return towupper_l((wint_t) wc, locale->lt);
- wint = pg_wchar_to_wchar_t(wc, locale);
- if ((wint = pg_wchar_to_wchar_t(wc, locale)) != WEOF)
- return wchar_t_to_pg_wchar(towupper_l(wint, locale->lt),
locale);
- return wc;
-}
-
-static pg_wchar
-toupper_libc_sb(pg_wchar wc, pg_locale_t locale)
-{
- Assert(GetDatabaseEncoding() != PG_UTF8);
-
- /* force C behavior for ASCII characters, per comments above */
- if (locale->is_default && wc <= (pg_wchar) 127)
- return pg_ascii_toupper((unsigned char) wc);
- if (wc <= (pg_wchar) UCHAR_MAX)
- return toupper_l((unsigned char) wc, locale->lt);
- else
- return wc;
+/*
+ * Generate to<case>_libc_custom(), the case-mapping function for encodings
+ * that use the PG_WCHAR_CUSTOM pg_wchar scheme.
+ *
+ * In the default collation, ASCII values are mapped with pg_ascii_to<case>().
+ * Other ASCII values go straight to tow<case>_l().  Anything above 127 is
+ * first converted with pg_wchar_to_wchar_t(); if that yields WEOF the input
+ * is returned unchanged, otherwise the result of tow<case>_l() is converted
+ * back with wchar_t_to_pg_wchar().
+ *
+ * The wide-character (tow*) functions must be used here, because the value
+ * handed to libc is a wchar_t; the <ctype.h> to<case>_l() functions only
+ * accept values representable as unsigned char (or EOF).
+ */
+#define DEFINE_WC_CASE_LIBC_CUSTOM(case) \
+static pg_wchar \
+to##case##_libc_custom(pg_wchar wc, pg_locale_t locale) \
+{ \
+ wint_t wint; \
+ if (locale->is_default && wc <= (pg_wchar) 127) \
+ return pg_ascii_to##case((unsigned char) wc); \
+ if (wc <= 127) \
+ return tow##case##_l((wint_t) wc, locale->lt); \
+ wint = pg_wchar_to_wchar_t(wc, locale); \
+ if (wint != WEOF) \
+ return wchar_t_to_pg_wchar(tow##case##_l(wint, locale->lt), locale); \
+ return wc; \
+}
-static pg_wchar
-toupper_libc_utf32(pg_wchar wc, pg_locale_t locale)
-{
- Assert(GetDatabaseEncoding() == PG_UTF8);
+DEFINE_WC_CASE_LIBC_CUSTOM(upper);
+DEFINE_WC_CASE_LIBC_CUSTOM(lower);
- /* force C behavior for ASCII characters, per comments above */
- if (locale->is_default && wc <= (pg_wchar) 127)
- return pg_ascii_toupper((unsigned char) wc);
- if (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF)
- return towupper_l((wint_t) wc, locale->lt);
- else
- return wc;
-}
-
-static pg_wchar
-tolower_libc_custom(pg_wchar wc, pg_locale_t locale)
-{
- wint_t wint;
-
- /* force C behavior for ASCII characters, per comments above */
- if (locale->is_default && wc <= (pg_wchar) 127)
- return pg_ascii_tolower((unsigned char) wc);
- if (wc <= 127)
- return towlower_l((wint_t) wc, locale->lt);
- wint = pg_wchar_to_wchar_t(wc, locale);
- if ((wint = pg_wchar_to_wchar_t(wc, locale)) != WEOF)
- return wchar_t_to_pg_wchar(towlower_l(wint, locale->lt),
locale);
- return wc;
+#define DEFINE_WC_CASE_LIBC_SB(case) \
+static pg_wchar \
+to##case##_libc_sb(pg_wchar wc, pg_locale_t locale) \
+{ \
+ if (locale->is_default && wc <= (pg_wchar) 127) \
+ return pg_ascii_to##case((unsigned char) wc); \
+ if (wc <= (pg_wchar) UCHAR_MAX) \
+ return to##case##_l((unsigned char) wc, locale->lt); \
+ else \
+ return wc; \
}
-static pg_wchar
-tolower_libc_sb(pg_wchar wc, pg_locale_t locale)
-{
- Assert(GetDatabaseEncoding() != PG_UTF8);
+DEFINE_WC_CASE_LIBC_SB(upper);
+DEFINE_WC_CASE_LIBC_SB(lower);
- /* force C behavior for ASCII characters, per comments above */
- if (locale->is_default && wc <= (pg_wchar) 127)
- return pg_ascii_tolower((unsigned char) wc);
- if (wc <= (pg_wchar) UCHAR_MAX)
- return tolower_l((unsigned char) wc, locale->lt);
- else
- return wc;
+#define DEFINE_WC_CASE_LIBC_UTF32(case) \
+static pg_wchar \
+to##case##_libc_utf32(pg_wchar wc, pg_locale_t locale) \
+{ \
+ if (locale->is_default && wc <= (pg_wchar) 127) \
+ return pg_ascii_to##case((unsigned char) wc); \
+ if (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF) \
+ return tow##case##_l((wint_t) wc, locale->lt); \
+ else \
+ return wc; \
}
-static pg_wchar
-tolower_libc_utf32(pg_wchar wc, pg_locale_t locale)
-{
- Assert(GetDatabaseEncoding() == PG_UTF8);
-
- /* force C behavior for ASCII characters, per comments above */
- if (locale->is_default && wc <= (pg_wchar) 127)
- return pg_ascii_tolower((unsigned char) wc);
- if (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF)
- return towlower_l((wint_t) wc, locale->lt);
- else
- return wc;
-}
+DEFINE_WC_CASE_LIBC_UTF32(upper);
+DEFINE_WC_CASE_LIBC_UTF32(lower);
static const struct ctype_methods ctype_methods_libc[] = {
[PG_WCHAR_CHAR] = {
--
2.50.1 (Apple Git-155)