On Tue, 2023-01-17 at 14:18 -0800, Peter Geoghegan wrote: > The second goal is a perfectly good enough goal on its own, and one > that I am totally supportive of. Making the code clearer is icing on > the cake.
Attached v8, which is just a rebase. To reiterate: commitfest entry https://commitfest.postgresql.org/41/3956/ is dependent on these patches and is a big part of the motivation for refactoring. > > I don't know. Quite possibly not. It would be nice to have some data > on that, though. I tested with hash aggregation, which might be more dependent on pg_locale_deterministic() than sorting. I didn't see any significant difference between master and the refactoring branch, so I don't see a need to make that function "inline". I also re-tested sorting and found some interesting results for en-US-x-icu on a UTF-8 database (which is, I suspect, one of the most common configurations for ICU): * the refactoring branch is now more than 5% faster, whether using abbreviated keys or not * disabling abbreviated keys makes sorting 8-10% faster on both master and the refactoring branch Both of these are surprising, and I haven't investigated deeply yet. Maybe something about LTO, some intervening patch, or I just made some mistakes somewhere (I did this fairly quickly). But as of now, it doesn't look like the refactoring patch hurts anything. -- Jeff Davis PostgreSQL Contributor Team - AWS
From d1e2e1757b043c876695b8fa8c304b5126efb3aa Mon Sep 17 00:00:00 2001 From: Jeff Davis <j...@j-davis.com> Date: Mon, 5 Dec 2022 10:43:52 -0800 Subject: [PATCH v8 2/2] Refactor pg_locale_t routines. * add pg_locale_internal.h to hide pg_locale_struct * move info.lt into info.libc.lt to match icu * introduce init_default_locale() * introduce pg_locale_deterministic() accessor * make default_locale a static global in pg_locale.c * refactor pg_newlocale_from_collation() --- src/backend/access/hash/hashfunc.c | 82 +++--- src/backend/commands/collationcmds.c | 1 + src/backend/regex/regc_pg_locale.c | 45 ++-- src/backend/utils/adt/formatting.c | 25 +- src/backend/utils/adt/like.c | 3 +- src/backend/utils/adt/like_support.c | 3 +- src/backend/utils/adt/pg_locale.c | 342 +++++++++++++++---------- src/backend/utils/adt/varchar.c | 62 ++--- src/backend/utils/adt/varlena.c | 14 +- src/backend/utils/init/postinit.c | 29 ++- src/include/utils/pg_locale.h | 55 +--- src/include/utils/pg_locale_internal.h | 68 +++++ 12 files changed, 402 insertions(+), 327 deletions(-) create mode 100644 src/include/utils/pg_locale_internal.h diff --git a/src/backend/access/hash/hashfunc.c b/src/backend/access/hash/hashfunc.c index c0ed995919..7cbd39f466 100644 --- a/src/backend/access/hash/hashfunc.c +++ b/src/backend/access/hash/hashfunc.c @@ -282,36 +282,28 @@ hashtext(PG_FUNCTION_ARGS) if (!lc_collate_is_c(collid)) mylocale = pg_newlocale_from_collation(collid); - if (!mylocale || mylocale->deterministic) + if (pg_locale_deterministic(mylocale)) { result = hash_any((unsigned char *) VARDATA_ANY(key), VARSIZE_ANY_EXHDR(key)); } else { -#ifdef USE_ICU - if (mylocale->provider == COLLPROVIDER_ICU) - { - Size bsize, rsize; - char *buf; - const char *keydata = VARDATA_ANY(key); - size_t keylen = VARSIZE_ANY_EXHDR(key); - - bsize = pg_strnxfrm(NULL, 0, keydata, keylen, mylocale); - buf = palloc(bsize); - - rsize = pg_strnxfrm(buf, bsize, keydata, keylen, mylocale); - if (rsize != bsize) - 
elog(ERROR, "pg_strnxfrm() returned unexpected result"); - - result = hash_any((uint8_t *) buf, bsize); - - pfree(buf); - } - else -#endif - /* shouldn't happen */ - elog(ERROR, "unsupported collprovider: %c", mylocale->provider); + Size bsize, rsize; + char *buf; + const char *keydata = VARDATA_ANY(key); + size_t keylen = VARSIZE_ANY_EXHDR(key); + + bsize = pg_strnxfrm(NULL, 0, keydata, keylen, mylocale); + buf = palloc(bsize); + + rsize = pg_strnxfrm(buf, bsize, keydata, keylen, mylocale); + if (rsize != bsize) + elog(ERROR, "pg_strnxfrm() returned unexpected result"); + + result = hash_any((uint8_t *) buf, bsize); + + pfree(buf); } /* Avoid leaking memory for toasted inputs */ @@ -337,7 +329,7 @@ hashtextextended(PG_FUNCTION_ARGS) if (!lc_collate_is_c(collid)) mylocale = pg_newlocale_from_collation(collid); - if (!mylocale || mylocale->deterministic) + if (pg_locale_deterministic(mylocale)) { result = hash_any_extended((unsigned char *) VARDATA_ANY(key), VARSIZE_ANY_EXHDR(key), @@ -345,30 +337,22 @@ hashtextextended(PG_FUNCTION_ARGS) } else { -#ifdef USE_ICU - if (mylocale->provider == COLLPROVIDER_ICU) - { - Size bsize, rsize; - char *buf; - const char *keydata = VARDATA_ANY(key); - size_t keylen = VARSIZE_ANY_EXHDR(key); - - bsize = pg_strnxfrm(NULL, 0, keydata, keylen, mylocale); - buf = palloc(bsize); - - rsize = pg_strnxfrm(buf, bsize, keydata, keylen, mylocale); - if (rsize != bsize) - elog(ERROR, "pg_strnxfrm() returned unexpected result"); - - result = hash_any_extended((uint8_t *) buf, bsize, - PG_GETARG_INT64(1)); - - pfree(buf); - } - else -#endif - /* shouldn't happen */ - elog(ERROR, "unsupported collprovider: %c", mylocale->provider); + Size bsize, rsize; + char *buf; + const char *keydata = VARDATA_ANY(key); + size_t keylen = VARSIZE_ANY_EXHDR(key); + + bsize = pg_strnxfrm(NULL, 0, keydata, keylen, mylocale); + buf = palloc(bsize); + + rsize = pg_strnxfrm(buf, bsize, keydata, keylen, mylocale); + if (rsize != bsize) + elog(ERROR, "pg_strnxfrm() 
returned unexpected result"); + + result = hash_any_extended((uint8_t *) buf, bsize, + PG_GETARG_INT64(1)); + + pfree(buf); } PG_FREE_IF_COPY(key, 0); diff --git a/src/backend/commands/collationcmds.c b/src/backend/commands/collationcmds.c index 6a4311cc63..d878be2fb8 100644 --- a/src/backend/commands/collationcmds.c +++ b/src/backend/commands/collationcmds.c @@ -36,6 +36,7 @@ #include "utils/builtins.h" #include "utils/lsyscache.h" #include "utils/pg_locale.h" +#include "utils/pg_locale_internal.h" #include "utils/rel.h" #include "utils/syscache.h" diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c index 323f00516d..3dc89b0336 100644 --- a/src/backend/regex/regc_pg_locale.c +++ b/src/backend/regex/regc_pg_locale.c @@ -17,6 +17,7 @@ #include "catalog/pg_collation.h" #include "utils/pg_locale.h" +#include "utils/pg_locale_internal.h" /* * To provide as much functionality as possible on a variety of platforms, @@ -306,13 +307,13 @@ pg_wc_isdigit(pg_wchar c) case PG_REGEX_LOCALE_WIDE_L: #ifdef HAVE_LOCALE_T if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswdigit_l((wint_t) c, pg_regex_locale->info.lt); + return iswdigit_l((wint_t) c, pg_regex_locale->info.libc.lt); #endif /* FALL THRU */ case PG_REGEX_LOCALE_1BYTE_L: #ifdef HAVE_LOCALE_T return (c <= (pg_wchar) UCHAR_MAX && - isdigit_l((unsigned char) c, pg_regex_locale->info.lt)); + isdigit_l((unsigned char) c, pg_regex_locale->info.libc.lt)); #endif break; case PG_REGEX_LOCALE_ICU: @@ -342,13 +343,13 @@ pg_wc_isalpha(pg_wchar c) case PG_REGEX_LOCALE_WIDE_L: #ifdef HAVE_LOCALE_T if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswalpha_l((wint_t) c, pg_regex_locale->info.lt); + return iswalpha_l((wint_t) c, pg_regex_locale->info.libc.lt); #endif /* FALL THRU */ case PG_REGEX_LOCALE_1BYTE_L: #ifdef HAVE_LOCALE_T return (c <= (pg_wchar) UCHAR_MAX && - isalpha_l((unsigned char) c, pg_regex_locale->info.lt)); + isalpha_l((unsigned char) c, 
pg_regex_locale->info.libc.lt)); #endif break; case PG_REGEX_LOCALE_ICU: @@ -378,13 +379,13 @@ pg_wc_isalnum(pg_wchar c) case PG_REGEX_LOCALE_WIDE_L: #ifdef HAVE_LOCALE_T if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswalnum_l((wint_t) c, pg_regex_locale->info.lt); + return iswalnum_l((wint_t) c, pg_regex_locale->info.libc.lt); #endif /* FALL THRU */ case PG_REGEX_LOCALE_1BYTE_L: #ifdef HAVE_LOCALE_T return (c <= (pg_wchar) UCHAR_MAX && - isalnum_l((unsigned char) c, pg_regex_locale->info.lt)); + isalnum_l((unsigned char) c, pg_regex_locale->info.libc.lt)); #endif break; case PG_REGEX_LOCALE_ICU: @@ -423,13 +424,13 @@ pg_wc_isupper(pg_wchar c) case PG_REGEX_LOCALE_WIDE_L: #ifdef HAVE_LOCALE_T if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswupper_l((wint_t) c, pg_regex_locale->info.lt); + return iswupper_l((wint_t) c, pg_regex_locale->info.libc.lt); #endif /* FALL THRU */ case PG_REGEX_LOCALE_1BYTE_L: #ifdef HAVE_LOCALE_T return (c <= (pg_wchar) UCHAR_MAX && - isupper_l((unsigned char) c, pg_regex_locale->info.lt)); + isupper_l((unsigned char) c, pg_regex_locale->info.libc.lt)); #endif break; case PG_REGEX_LOCALE_ICU: @@ -459,13 +460,13 @@ pg_wc_islower(pg_wchar c) case PG_REGEX_LOCALE_WIDE_L: #ifdef HAVE_LOCALE_T if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswlower_l((wint_t) c, pg_regex_locale->info.lt); + return iswlower_l((wint_t) c, pg_regex_locale->info.libc.lt); #endif /* FALL THRU */ case PG_REGEX_LOCALE_1BYTE_L: #ifdef HAVE_LOCALE_T return (c <= (pg_wchar) UCHAR_MAX && - islower_l((unsigned char) c, pg_regex_locale->info.lt)); + islower_l((unsigned char) c, pg_regex_locale->info.libc.lt)); #endif break; case PG_REGEX_LOCALE_ICU: @@ -495,13 +496,13 @@ pg_wc_isgraph(pg_wchar c) case PG_REGEX_LOCALE_WIDE_L: #ifdef HAVE_LOCALE_T if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswgraph_l((wint_t) c, pg_regex_locale->info.lt); + return iswgraph_l((wint_t) c, pg_regex_locale->info.libc.lt); #endif /* 
FALL THRU */ case PG_REGEX_LOCALE_1BYTE_L: #ifdef HAVE_LOCALE_T return (c <= (pg_wchar) UCHAR_MAX && - isgraph_l((unsigned char) c, pg_regex_locale->info.lt)); + isgraph_l((unsigned char) c, pg_regex_locale->info.libc.lt)); #endif break; case PG_REGEX_LOCALE_ICU: @@ -531,13 +532,13 @@ pg_wc_isprint(pg_wchar c) case PG_REGEX_LOCALE_WIDE_L: #ifdef HAVE_LOCALE_T if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswprint_l((wint_t) c, pg_regex_locale->info.lt); + return iswprint_l((wint_t) c, pg_regex_locale->info.libc.lt); #endif /* FALL THRU */ case PG_REGEX_LOCALE_1BYTE_L: #ifdef HAVE_LOCALE_T return (c <= (pg_wchar) UCHAR_MAX && - isprint_l((unsigned char) c, pg_regex_locale->info.lt)); + isprint_l((unsigned char) c, pg_regex_locale->info.libc.lt)); #endif break; case PG_REGEX_LOCALE_ICU: @@ -567,13 +568,13 @@ pg_wc_ispunct(pg_wchar c) case PG_REGEX_LOCALE_WIDE_L: #ifdef HAVE_LOCALE_T if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswpunct_l((wint_t) c, pg_regex_locale->info.lt); + return iswpunct_l((wint_t) c, pg_regex_locale->info.libc.lt); #endif /* FALL THRU */ case PG_REGEX_LOCALE_1BYTE_L: #ifdef HAVE_LOCALE_T return (c <= (pg_wchar) UCHAR_MAX && - ispunct_l((unsigned char) c, pg_regex_locale->info.lt)); + ispunct_l((unsigned char) c, pg_regex_locale->info.libc.lt)); #endif break; case PG_REGEX_LOCALE_ICU: @@ -603,13 +604,13 @@ pg_wc_isspace(pg_wchar c) case PG_REGEX_LOCALE_WIDE_L: #ifdef HAVE_LOCALE_T if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswspace_l((wint_t) c, pg_regex_locale->info.lt); + return iswspace_l((wint_t) c, pg_regex_locale->info.libc.lt); #endif /* FALL THRU */ case PG_REGEX_LOCALE_1BYTE_L: #ifdef HAVE_LOCALE_T return (c <= (pg_wchar) UCHAR_MAX && - isspace_l((unsigned char) c, pg_regex_locale->info.lt)); + isspace_l((unsigned char) c, pg_regex_locale->info.libc.lt)); #endif break; case PG_REGEX_LOCALE_ICU: @@ -647,13 +648,13 @@ pg_wc_toupper(pg_wchar c) case PG_REGEX_LOCALE_WIDE_L: #ifdef 
HAVE_LOCALE_T if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return towupper_l((wint_t) c, pg_regex_locale->info.lt); + return towupper_l((wint_t) c, pg_regex_locale->info.libc.lt); #endif /* FALL THRU */ case PG_REGEX_LOCALE_1BYTE_L: #ifdef HAVE_LOCALE_T if (c <= (pg_wchar) UCHAR_MAX) - return toupper_l((unsigned char) c, pg_regex_locale->info.lt); + return toupper_l((unsigned char) c, pg_regex_locale->info.libc.lt); #endif return c; case PG_REGEX_LOCALE_ICU: @@ -691,13 +692,13 @@ pg_wc_tolower(pg_wchar c) case PG_REGEX_LOCALE_WIDE_L: #ifdef HAVE_LOCALE_T if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return towlower_l((wint_t) c, pg_regex_locale->info.lt); + return towlower_l((wint_t) c, pg_regex_locale->info.libc.lt); #endif /* FALL THRU */ case PG_REGEX_LOCALE_1BYTE_L: #ifdef HAVE_LOCALE_T if (c <= (pg_wchar) UCHAR_MAX) - return tolower_l((unsigned char) c, pg_regex_locale->info.lt); + return tolower_l((unsigned char) c, pg_regex_locale->info.libc.lt); #endif return c; case PG_REGEX_LOCALE_ICU: diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c index f3f4db5ef6..f45e2043a1 100644 --- a/src/backend/utils/adt/formatting.c +++ b/src/backend/utils/adt/formatting.c @@ -88,6 +88,7 @@ #include "utils/memutils.h" #include "utils/numeric.h" #include "utils/pg_locale.h" +#include "utils/pg_locale_internal.h" #include "varatt.h" @@ -1577,7 +1578,7 @@ icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale, *buff_dest = palloc(len_dest * sizeof(**buff_dest)); status = U_ZERO_ERROR; len_dest = func(*buff_dest, len_dest, buff_source, len_source, - mylocale->info.icu.locale, &status); + mylocale->ctype, &status); if (status == U_BUFFER_OVERFLOW_ERROR) { /* try again with adjusted length */ @@ -1585,7 +1586,7 @@ icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale, *buff_dest = palloc(len_dest * sizeof(**buff_dest)); status = U_ZERO_ERROR; len_dest = func(*buff_dest, len_dest, buff_source, len_source, - 
mylocale->info.icu.locale, &status); + mylocale->ctype, &status); } if (U_FAILURE(status)) ereport(ERROR, @@ -1698,7 +1699,7 @@ str_tolower(const char *buff, size_t nbytes, Oid collid) { #ifdef HAVE_LOCALE_T if (mylocale) - workspace[curr_char] = towlower_l(workspace[curr_char], mylocale->info.lt); + workspace[curr_char] = towlower_l(workspace[curr_char], mylocale->info.libc.lt); else #endif workspace[curr_char] = towlower(workspace[curr_char]); @@ -1731,7 +1732,7 @@ str_tolower(const char *buff, size_t nbytes, Oid collid) { #ifdef HAVE_LOCALE_T if (mylocale) - *p = tolower_l((unsigned char) *p, mylocale->info.lt); + *p = tolower_l((unsigned char) *p, mylocale->info.libc.lt); else #endif *p = pg_tolower((unsigned char) *p); @@ -1820,7 +1821,7 @@ str_toupper(const char *buff, size_t nbytes, Oid collid) { #ifdef HAVE_LOCALE_T if (mylocale) - workspace[curr_char] = towupper_l(workspace[curr_char], mylocale->info.lt); + workspace[curr_char] = towupper_l(workspace[curr_char], mylocale->info.libc.lt); else #endif workspace[curr_char] = towupper(workspace[curr_char]); @@ -1853,7 +1854,7 @@ str_toupper(const char *buff, size_t nbytes, Oid collid) { #ifdef HAVE_LOCALE_T if (mylocale) - *p = toupper_l((unsigned char) *p, mylocale->info.lt); + *p = toupper_l((unsigned char) *p, mylocale->info.libc.lt); else #endif *p = pg_toupper((unsigned char) *p); @@ -1945,10 +1946,10 @@ str_initcap(const char *buff, size_t nbytes, Oid collid) if (mylocale) { if (wasalnum) - workspace[curr_char] = towlower_l(workspace[curr_char], mylocale->info.lt); + workspace[curr_char] = towlower_l(workspace[curr_char], mylocale->info.libc.lt); else - workspace[curr_char] = towupper_l(workspace[curr_char], mylocale->info.lt); - wasalnum = iswalnum_l(workspace[curr_char], mylocale->info.lt); + workspace[curr_char] = towupper_l(workspace[curr_char], mylocale->info.libc.lt); + wasalnum = iswalnum_l(workspace[curr_char], mylocale->info.libc.lt); } else #endif @@ -1990,10 +1991,10 @@ str_initcap(const char 
*buff, size_t nbytes, Oid collid) if (mylocale) { if (wasalnum) - *p = tolower_l((unsigned char) *p, mylocale->info.lt); + *p = tolower_l((unsigned char) *p, mylocale->info.libc.lt); else - *p = toupper_l((unsigned char) *p, mylocale->info.lt); - wasalnum = isalnum_l((unsigned char) *p, mylocale->info.lt); + *p = toupper_l((unsigned char) *p, mylocale->info.libc.lt); + wasalnum = isalnum_l((unsigned char) *p, mylocale->info.libc.lt); } else #endif diff --git a/src/backend/utils/adt/like.c b/src/backend/utils/adt/like.c index fc6cb7f5b7..6fdfa2cebd 100644 --- a/src/backend/utils/adt/like.c +++ b/src/backend/utils/adt/like.c @@ -24,6 +24,7 @@ #include "miscadmin.h" #include "utils/builtins.h" #include "utils/pg_locale.h" +#include "utils/pg_locale_internal.h" #include "varatt.h" @@ -97,7 +98,7 @@ SB_lower_char(unsigned char c, pg_locale_t locale, bool locale_is_c) return pg_ascii_tolower(c); #ifdef HAVE_LOCALE_T else if (locale) - return tolower_l(c, locale->info.lt); + return tolower_l(c, locale->info.libc.lt); #endif else return pg_tolower(c); diff --git a/src/backend/utils/adt/like_support.c b/src/backend/utils/adt/like_support.c index 9b603d42f3..43150741c8 100644 --- a/src/backend/utils/adt/like_support.c +++ b/src/backend/utils/adt/like_support.c @@ -52,6 +52,7 @@ #include "utils/datum.h" #include "utils/lsyscache.h" #include "utils/pg_locale.h" +#include "utils/pg_locale_internal.h" #include "utils/selfuncs.h" #include "utils/varlena.h" @@ -1511,7 +1512,7 @@ pattern_char_isalpha(char c, bool is_multibyte, (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); #ifdef HAVE_LOCALE_T else if (locale && locale->provider == COLLPROVIDER_LIBC) - return isalpha_l((unsigned char) c, locale->info.lt); + return isalpha_l((unsigned char) c, locale->info.libc.lt); #endif else return isalpha((unsigned char) c); diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index 509ef0784e..d3ca3c5011 100644 --- a/src/backend/utils/adt/pg_locale.c +++ 
b/src/backend/utils/adt/pg_locale.c @@ -65,6 +65,7 @@ #include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/pg_locale.h" +#include "utils/pg_locale_internal.h" #include "utils/syscache.h" #ifdef USE_ICU @@ -131,6 +132,11 @@ static HTAB *collation_cache = NULL; static char *IsoLocaleName(const char *); #endif +/* + * Database default locale. + */ +static pg_locale_t default_locale = NULL; + #ifdef USE_ICU /* * Converter object for converting between ICU's UChar strings and C strings @@ -1336,7 +1342,7 @@ lc_collate_is_c(Oid collation) static int result = -1; char *localeptr; - if (default_locale.provider == COLLPROVIDER_ICU) + if (default_locale->provider == COLLPROVIDER_ICU) return false; if (result >= 0) @@ -1389,7 +1395,7 @@ lc_ctype_is_c(Oid collation) static int result = -1; char *localeptr; - if (default_locale.provider == COLLPROVIDER_ICU) + if (default_locale->provider == COLLPROVIDER_ICU) return false; if (result >= 0) @@ -1420,38 +1426,6 @@ lc_ctype_is_c(Oid collation) return (lookup_collation_cache(collation, true))->ctype_is_c; } -struct pg_locale_struct default_locale; - -void -make_icu_collator(const char *iculocstr, - struct pg_locale_struct *resultp) -{ -#ifdef USE_ICU - UCollator *collator; - UErrorCode status; - - status = U_ZERO_ERROR; - collator = ucol_open(iculocstr, &status); - if (U_FAILURE(status)) - ereport(ERROR, - (errmsg("could not open collator for locale \"%s\": %s", - iculocstr, u_errorName(status)))); - - if (U_ICU_VERSION_MAJOR_NUM < 54) - icu_set_collation_attributes(collator, iculocstr); - - /* We will leak this string if the caller errors later :-( */ - resultp->info.icu.locale = MemoryContextStrdup(TopMemoryContext, iculocstr); - resultp->info.icu.ucol = collator; -#else /* not USE_ICU */ - /* could get here if a collation was created by a build with ICU */ - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("ICU is not supported in this build"))); -#endif /* not USE_ICU */ -} - - /* simple 
subroutine for reporting errors from newlocale() */ #ifdef HAVE_LOCALE_T static void @@ -1485,6 +1459,163 @@ report_newlocale_failure(const char *localename) #endif /* HAVE_LOCALE_T */ +/* + * Construct a new pg_locale_t object. + * + * Passing NULL for the version is allowed; and even if it is specified, the + * result may or may not have an exactly matching version. Other parameters + * are required. + * + * Ordinarily, collate and ctype should be the same. If the provider is ICU, + * this is a requirement, and the 'collate' and 'ctype' arguments should both + * come from colliculocale (or daticulocale). If the provider is libc, the + * arguments should come from collcollate and collctype (or datcollate and + * datctype), respectively. + * + * Structures are allocated in TopMemoryContext, and the libc locale_t or + * UCollator is not allocated in any memory context, so the result is + * effectively permanent. + */ +static pg_locale_t +pg_newlocale(char provider, bool deterministic, const char *collate, + const char *ctype, const char *version) +{ + pg_locale_t result = MemoryContextAlloc(TopMemoryContext, + sizeof(struct pg_locale_struct)); + + /* + * If COLLPROVIDER_DEFAULT, caller should use default_locale or NULL + * instead. 
+ */ + Assert(provider != COLLPROVIDER_DEFAULT); + + result->provider = provider; + result->deterministic = deterministic; + result->collate = MemoryContextStrdup(TopMemoryContext, collate); + result->ctype = MemoryContextStrdup(TopMemoryContext, ctype); + + if (provider == COLLPROVIDER_LIBC) + { +#ifdef HAVE_LOCALE_T + locale_t loc; + + /* newlocale's result may be leaked if we encounter an error */ + + if (strcmp(collate, ctype) == 0) + { + /* Normal case where they're the same */ + errno = 0; +#ifndef WIN32 + loc = newlocale(LC_COLLATE_MASK | LC_CTYPE_MASK, collate, + NULL); +#else + loc = _create_locale(LC_ALL, collate); +#endif + if (!loc) + report_newlocale_failure(collate); + } + else + { +#ifndef WIN32 + /* We need two newlocale() steps */ + locale_t loc1; + + errno = 0; + loc1 = newlocale(LC_COLLATE_MASK, collate, NULL); + if (!loc1) + report_newlocale_failure(collate); + errno = 0; + loc = newlocale(LC_CTYPE_MASK, ctype, loc1); + if (!loc) + report_newlocale_failure(ctype); +#else + + /* + * XXX The _create_locale() API doesn't appear to support + * this. Could perhaps be worked around by changing + * pg_locale_t to contain two separate fields. 
+ */ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("collations with different collate and ctype values are not supported on this platform"))); +#endif + } + + result->info.libc.lt = loc; +#else /* not HAVE_LOCALE_T */ + /* platform that doesn't support locale_t */ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("collation provider LIBC is not supported on this platform"))); +#endif /* not HAVE_LOCALE_T */ + } +#ifdef USE_ICU + else if (provider == COLLPROVIDER_ICU) + { + UCollator *collator; + UErrorCode status; + + /* collator may be leaked if we encounter an error */ + + status = U_ZERO_ERROR; + collator = ucol_open(collate, &status); + if (U_FAILURE(status)) + ereport(ERROR, + (errmsg("could not open collator for locale \"%s\": %s", + collate, u_errorName(status)))); + + if (U_ICU_VERSION_MAJOR_NUM < 54) + icu_set_collation_attributes(collator, collate); + + result->info.icu.ucol = collator; + } +#endif + else + /* shouldn't happen */ + elog(ERROR, "unsupported collprovider: %c", provider); + + return result; +} + +/* + * Accessor so that callers don't need to include pg_locale_internal.h. + */ +bool +pg_locale_deterministic(pg_locale_t locale) +{ + if (locale == NULL) + return true; + else + return locale->deterministic; +} + +/* + * Initialize default database locale. + */ +void +init_default_locale(char provider, const char *collate, const char *ctype, + const char *iculocale, const char *version) +{ + /* + * For the purposes of pg_locale_t, if the provider is ICU, we use + * iculocale for both collate and ctype. + */ + if (provider == COLLPROVIDER_ICU) + { + collate = iculocale; + ctype = iculocale; + } + else + Assert(iculocale == NULL); + + /* + * Default locale is currently always deterministic. Nondeterministic + * locales currently don't support pattern matching, which would break a + * lot of things if applied globally. 
+ */ + default_locale = pg_newlocale(provider, true, collate, ctype, version); +} + /* * Create a locale_t from a collation OID. Results are cached for the * lifetime of the backend. Thus, do not free the result with freelocale(). @@ -1509,8 +1640,8 @@ pg_newlocale_from_collation(Oid collid) if (collid == DEFAULT_COLLATION_OID) { - if (default_locale.provider == COLLPROVIDER_ICU) - return &default_locale; + if (default_locale->provider == COLLPROVIDER_ICU) + return default_locale; else return (pg_locale_t) 0; } @@ -1522,107 +1653,64 @@ pg_newlocale_from_collation(Oid collid) /* We haven't computed this yet in this session, so do it */ HeapTuple tp; Form_pg_collation collform; - struct pg_locale_struct result; - pg_locale_t resultp; + pg_locale_t locale; Datum datum; bool isnull; + char *collate; + char *ctype; + char *collversionstr; tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collid)); if (!HeapTupleIsValid(tp)) elog(ERROR, "cache lookup failed for collation %u", collid); collform = (Form_pg_collation) GETSTRUCT(tp); - /* We'll fill in the result struct locally before allocating memory */ - memset(&result, 0, sizeof(result)); - result.provider = collform->collprovider; - result.deterministic = collform->collisdeterministic; + datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collversion, + &isnull); + if (!isnull) + collversionstr = TextDatumGetCString(datum); + else + collversionstr = NULL; if (collform->collprovider == COLLPROVIDER_LIBC) { -#ifdef HAVE_LOCALE_T - const char *collcollate; - const char *collctype pg_attribute_unused(); - locale_t loc; - - datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collcollate, &isnull); + datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collcollate, + &isnull); Assert(!isnull); - collcollate = TextDatumGetCString(datum); - datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collctype, &isnull); + collate = TextDatumGetCString(datum); + datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collctype, + 
&isnull); Assert(!isnull); - collctype = TextDatumGetCString(datum); - - if (strcmp(collcollate, collctype) == 0) - { - /* Normal case where they're the same */ - errno = 0; -#ifndef WIN32 - loc = newlocale(LC_COLLATE_MASK | LC_CTYPE_MASK, collcollate, - NULL); -#else - loc = _create_locale(LC_ALL, collcollate); -#endif - if (!loc) - report_newlocale_failure(collcollate); - } - else - { -#ifndef WIN32 - /* We need two newlocale() steps */ - locale_t loc1; - - errno = 0; - loc1 = newlocale(LC_COLLATE_MASK, collcollate, NULL); - if (!loc1) - report_newlocale_failure(collcollate); - errno = 0; - loc = newlocale(LC_CTYPE_MASK, collctype, loc1); - if (!loc) - report_newlocale_failure(collctype); -#else - - /* - * XXX The _create_locale() API doesn't appear to support - * this. Could perhaps be worked around by changing - * pg_locale_t to contain two separate fields. - */ - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("collations with different collate and ctype values are not supported on this platform"))); -#endif - } - - result.info.lt = loc; -#else /* not HAVE_LOCALE_T */ - /* platform that doesn't support locale_t */ - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("collation provider LIBC is not supported on this platform"))); -#endif /* not HAVE_LOCALE_T */ + ctype = TextDatumGetCString(datum); } +#ifdef USE_ICU else if (collform->collprovider == COLLPROVIDER_ICU) { - const char *iculocstr; - - datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_colliculocale, &isnull); + datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_colliculocale, + &isnull); Assert(!isnull); - iculocstr = TextDatumGetCString(datum); - make_icu_collator(iculocstr, &result); + collate = TextDatumGetCString(datum); + + /* for ICU, collate and ctype are both set from iculocale */ + ctype = collate; } +#endif + else + /* shouldn't happen */ + elog(ERROR, "unsupported collprovider: %c", collform->collprovider); - datum = SysCacheGetAttr(COLLOID, 
tp, Anum_pg_collation_collversion, - &isnull); - if (!isnull) + locale = pg_newlocale(collform->collprovider, + collform->collisdeterministic, + collate, ctype, collversionstr); + + ReleaseSysCache(tp); + + if (collversionstr != NULL) { char *actual_versionstr; - char *collversionstr; - collversionstr = TextDatumGetCString(datum); - - datum = SysCacheGetAttr(COLLOID, tp, collform->collprovider == COLLPROVIDER_ICU ? Anum_pg_collation_colliculocale : Anum_pg_collation_collcollate, &isnull); - Assert(!isnull); + actual_versionstr = get_collation_actual_version(collform->collprovider, collate); - actual_versionstr = get_collation_actual_version(collform->collprovider, - TextDatumGetCString(datum)); if (!actual_versionstr) { /* @@ -1649,13 +1737,7 @@ pg_newlocale_from_collation(Oid collid) NameStr(collform->collname))))); } - ReleaseSysCache(tp); - - /* We'll keep the pg_locale_t structures in TopMemoryContext */ - resultp = MemoryContextAlloc(TopMemoryContext, sizeof(*resultp)); - *resultp = result; - - cache_entry->locale = resultp; + cache_entry->locale = locale; } return cache_entry->locale; @@ -1815,7 +1897,7 @@ pg_strncoll_libc_win32_utf8(const char *arg1, size_t len1, const char *arg2, errno = 0; #ifdef HAVE_LOCALE_T if (locale) - result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, locale->info.lt); + result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, locale->info.libc.lt); else #endif result = wcscoll((LPWSTR) a1p, (LPWSTR) a2p); @@ -1858,7 +1940,7 @@ pg_strcoll_libc(const char *arg1, const char *arg2, pg_locale_t locale) if (locale) { #ifdef HAVE_LOCALE_T - result = strcoll_l(arg1, arg2, locale->info.lt); + result = strcoll_l(arg1, arg2, locale->info.libc.lt); #else /* shouldn't happen */ elog(ERROR, "unsupported collprovider: %c", locale->provider); @@ -2102,7 +2184,7 @@ pg_strxfrm_libc(char *dest, const char *src, size_t destsize, #ifdef TRUST_STXFRM #ifdef HAVE_LOCALE_T if (locale) - return strxfrm_l(dest, src, destsize, locale->info.lt); + return strxfrm_l(dest, 
src, destsize, locale->info.libc.lt); else #endif return strxfrm(dest, src, destsize); @@ -2699,8 +2781,8 @@ void check_icu_locale(const char *icu_locale) { #ifdef USE_ICU - UCollator *collator; - UErrorCode status; + UCollator *collator; + UErrorCode status; status = U_ZERO_ERROR; collator = ucol_open(icu_locale, &status); @@ -2774,10 +2856,10 @@ wchar2char(char *to, const wchar_t *from, size_t tolen, pg_locale_t locale) #ifdef HAVE_LOCALE_T #ifdef HAVE_WCSTOMBS_L /* Use wcstombs_l for nondefault locales */ - result = wcstombs_l(to, from, tolen, locale->info.lt); + result = wcstombs_l(to, from, tolen, locale->info.libc.lt); #else /* !HAVE_WCSTOMBS_L */ /* We have to temporarily set the locale as current ... ugh */ - locale_t save_locale = uselocale(locale->info.lt); + locale_t save_locale = uselocale(locale->info.libc.lt); result = wcstombs(to, from, tolen); @@ -2851,10 +2933,10 @@ char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen, #ifdef HAVE_LOCALE_T #ifdef HAVE_MBSTOWCS_L /* Use mbstowcs_l for nondefault locales */ - result = mbstowcs_l(to, str, tolen, locale->info.lt); + result = mbstowcs_l(to, str, tolen, locale->info.libc.lt); #else /* !HAVE_MBSTOWCS_L */ /* We have to temporarily set the locale as current ... 
ugh */ - locale_t save_locale = uselocale(locale->info.lt); + locale_t save_locale = uselocale(locale->info.libc.lt); result = mbstowcs(to, str, tolen); diff --git a/src/backend/utils/adt/varchar.c b/src/backend/utils/adt/varchar.c index 5bbba94687..8dc1faaaf2 100644 --- a/src/backend/utils/adt/varchar.c +++ b/src/backend/utils/adt/varchar.c @@ -762,7 +762,7 @@ bpchareq(PG_FUNCTION_ARGS) else mylocale = pg_newlocale_from_collation(collid); - if (locale_is_c || !mylocale || mylocale->deterministic) + if (locale_is_c || pg_locale_deterministic(mylocale)) { /* * Since we only care about equality or not-equality, we can avoid all @@ -807,7 +807,7 @@ bpcharne(PG_FUNCTION_ARGS) else mylocale = pg_newlocale_from_collation(collid); - if (locale_is_c || !mylocale || mylocale->deterministic) + if (locale_is_c || pg_locale_deterministic(mylocale)) { /* * Since we only care about equality or not-equality, we can avoid all @@ -1015,33 +1015,25 @@ hashbpchar(PG_FUNCTION_ARGS) if (!lc_collate_is_c(collid)) mylocale = pg_newlocale_from_collation(collid); - if (!mylocale || mylocale->deterministic) + if (pg_locale_deterministic(mylocale)) { result = hash_any((unsigned char *) keydata, keylen); } else { -#ifdef USE_ICU - if (mylocale->provider == COLLPROVIDER_ICU) - { - Size bsize, rsize; - char *buf; + Size bsize, rsize; + char *buf; - bsize = pg_strnxfrm(NULL, 0, keydata, keylen, mylocale); - buf = palloc(bsize); + bsize = pg_strnxfrm(NULL, 0, keydata, keylen, mylocale); + buf = palloc(bsize); - rsize = pg_strnxfrm(buf, bsize, keydata, keylen, mylocale); - if (rsize != bsize) - elog(ERROR, "pg_strnxfrm() returned unexpected result"); + rsize = pg_strnxfrm(buf, bsize, keydata, keylen, mylocale); + if (rsize != bsize) + elog(ERROR, "pg_strnxfrm() returned unexpected result"); - result = hash_any((uint8_t *) buf, bsize); + result = hash_any((uint8_t *) buf, bsize); - pfree(buf); - } - else -#endif - /* shouldn't happen */ - elog(ERROR, "unsupported collprovider: %c", 
mylocale->provider); + pfree(buf); } /* Avoid leaking memory for toasted inputs */ @@ -1072,35 +1064,27 @@ hashbpcharextended(PG_FUNCTION_ARGS) if (!lc_collate_is_c(collid)) mylocale = pg_newlocale_from_collation(collid); - if (!mylocale || mylocale->deterministic) + if (pg_locale_deterministic(mylocale)) { result = hash_any_extended((unsigned char *) keydata, keylen, PG_GETARG_INT64(1)); } else { -#ifdef USE_ICU - if (mylocale->provider == COLLPROVIDER_ICU) - { - Size bsize, rsize; - char *buf; + Size bsize, rsize; + char *buf; - bsize = pg_strnxfrm(NULL, 0, keydata, keylen, mylocale); - buf = palloc(bsize); + bsize = pg_strnxfrm(NULL, 0, keydata, keylen, mylocale); + buf = palloc(bsize); - rsize = pg_strnxfrm(buf, bsize, keydata, keylen, mylocale); - if (rsize != bsize) - elog(ERROR, "pg_strnxfrm() returned unexpected result"); + rsize = pg_strnxfrm(buf, bsize, keydata, keylen, mylocale); + if (rsize != bsize) + elog(ERROR, "pg_strnxfrm() returned unexpected result"); - result = hash_any_extended((uint8_t *) buf, bsize, - PG_GETARG_INT64(1)); + result = hash_any_extended((uint8_t *) buf, bsize, + PG_GETARG_INT64(1)); - pfree(buf); - } - else -#endif - /* shouldn't happen */ - elog(ERROR, "unsupported collprovider: %c", mylocale->provider); + pfree(buf); } PG_FREE_IF_COPY(key, 0); diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index 5030328f31..3c4e7be687 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -1205,7 +1205,7 @@ text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state) if (!lc_collate_is_c(collid)) mylocale = pg_newlocale_from_collation(collid); - if (mylocale && !mylocale->deterministic) + if (!pg_locale_deterministic(mylocale)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("nondeterministic collations are not supported for substring searches"))); @@ -1556,8 +1556,7 @@ varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid) 
result = pg_strncoll(arg1, len1, arg2, len2, mylocale); /* Break tie if necessary. */ - if (result == 0 && - (!mylocale || mylocale->deterministic)) + if (result == 0 && pg_locale_deterministic(mylocale)) { result = memcmp(arg1, arg2, Min(len1, len2)); if ((result == 0) && (len1 != len2)) @@ -1612,7 +1611,7 @@ texteq(PG_FUNCTION_ARGS) else mylocale = pg_newlocale_from_collation(collid); - if (locale_is_c || !mylocale || mylocale->deterministic) + if (locale_is_c || pg_locale_deterministic(mylocale)) { Datum arg1 = PG_GETARG_DATUM(0); Datum arg2 = PG_GETARG_DATUM(1); @@ -1671,7 +1670,7 @@ textne(PG_FUNCTION_ARGS) else mylocale = pg_newlocale_from_collation(collid); - if (locale_is_c || !mylocale || mylocale->deterministic) + if (locale_is_c || pg_locale_deterministic(mylocale)) { Datum arg1 = PG_GETARG_DATUM(0); Datum arg2 = PG_GETARG_DATUM(1); @@ -1785,7 +1784,7 @@ text_starts_with(PG_FUNCTION_ARGS) if (!lc_collate_is_c(collid)) mylocale = pg_newlocale_from_collation(collid); - if (mylocale && !mylocale->deterministic) + if (!pg_locale_deterministic(mylocale)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("nondeterministic collations are not supported for substring searches"))); @@ -2201,8 +2200,7 @@ varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup) result = pg_strcoll(sss->buf1, sss->buf2, sss->locale); /* Break tie if necessary. 
*/ - if (result == 0 && - (!sss->locale || sss->locale->deterministic)) + if (result == 0 && pg_locale_deterministic(sss->locale)) result = strcmp(sss->buf1, sss->buf2); /* Cache result, perhaps saving an expensive strcoll() call next time */ diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index ae5a85ed65..c26dfb6104 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -317,6 +317,7 @@ CheckMyDatabase(const char *name, bool am_superuser, bool override_allow_connect char *collate; char *ctype; char *iculocale; + char *collversionstr; /* Fetch our pg_database row normally, via syscache */ tup = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(MyDatabaseId)); @@ -424,35 +425,33 @@ CheckMyDatabase(const char *name, bool am_superuser, bool override_allow_connect datum = SysCacheGetAttr(DATABASEOID, tup, Anum_pg_database_daticulocale, &isnull); Assert(!isnull); iculocale = TextDatumGetCString(datum); - make_icu_collator(iculocale, &default_locale); } else iculocale = NULL; - default_locale.provider = dbform->datlocprovider; + datum = SysCacheGetAttr(DATABASEOID, tup, Anum_pg_database_datcollversion, + &isnull); + if (!isnull) + collversionstr = TextDatumGetCString(datum); + else + collversionstr = NULL; - /* - * Default locale is currently always deterministic. Nondeterministic - * locales currently don't support pattern matching, which would break a - * lot of things if applied globally. - */ - default_locale.deterministic = true; + init_default_locale(dbform->datlocprovider, collate, ctype, iculocale, + collversionstr); /* * Check collation version. See similar code in * pg_newlocale_from_collation(). Note that here we warn instead of error * in any case, so that we don't prevent connecting. 
*/ - datum = SysCacheGetAttr(DATABASEOID, tup, Anum_pg_database_datcollversion, - &isnull); - if (!isnull) + if (collversionstr != NULL) { char *actual_versionstr; - char *collversionstr; - collversionstr = TextDatumGetCString(datum); + actual_versionstr = get_collation_actual_version( + dbform->datlocprovider, + dbform->datlocprovider == COLLPROVIDER_ICU ? iculocale : collate); - actual_versionstr = get_collation_actual_version(dbform->datlocprovider, dbform->datlocprovider == COLLPROVIDER_ICU ? iculocale : collate); if (!actual_versionstr) /* should not happen */ elog(WARNING, @@ -470,6 +469,8 @@ CheckMyDatabase(const char *name, bool am_superuser, bool override_allow_connect "or build PostgreSQL with the right library version.", quote_identifier(name)))); } + else + collversionstr = NULL; /* Make the locale settings visible as GUC variables, too */ SetConfigOption("lc_collate", collate, PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT); diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h index c08e6ce07f..7be42fc21e 100644 --- a/src/include/utils/pg_locale.h +++ b/src/include/utils/pg_locale.h @@ -15,22 +15,6 @@ #if defined(LOCALE_T_IN_XLOCALE) || defined(WCSTOMBS_L_IN_XLOCALE) #include <xlocale.h> #endif -#ifdef USE_ICU -#include <unicode/ucol.h> -#endif - -#ifdef USE_ICU -/* - * ucol_strcollUTF8() was introduced in ICU 50, but it is buggy before ICU 53. 
- * (see - * <https://www.postgresql.org/message-id/flat/f1438ec6-22aa-4029-9a3b-26f79d330e72%40manitou-mail.org>) - */ -#if U_ICU_VERSION_MAJOR_NUM >= 53 -#define HAVE_UCOL_STRCOLLUTF8 1 -#else -#undef HAVE_UCOL_STRCOLLUTF8 -#endif -#endif /* use for libc locale names */ #define LOCALE_NAME_BUFLEN 128 @@ -65,39 +49,12 @@ extern struct lconv *PGLC_localeconv(void); extern void cache_locale_time(void); -/* - * We define our own wrapper around locale_t so we can keep the same - * function signatures for all builds, while not having to create a - * fake version of the standard type locale_t in the global namespace. - * pg_locale_t is occasionally checked for truth, so make it a pointer. - */ -struct pg_locale_struct -{ - char provider; - bool deterministic; - union - { -#ifdef HAVE_LOCALE_T - locale_t lt; -#endif -#ifdef USE_ICU - struct - { - const char *locale; - UCollator *ucol; - } icu; -#endif - int dummy; /* in case we have neither LOCALE_T nor ICU */ - } info; -}; - typedef struct pg_locale_struct *pg_locale_t; -extern PGDLLIMPORT struct pg_locale_struct default_locale; - -extern void make_icu_collator(const char *iculocstr, - struct pg_locale_struct *resultp); - +extern void init_default_locale(char provider, const char *collate, + const char *ctype, const char *iculocale, + const char *version); +extern bool pg_locale_deterministic(pg_locale_t locale); extern pg_locale_t pg_newlocale_from_collation(Oid collid); extern char *get_collation_actual_version(char collprovider, const char *collcollate); @@ -115,10 +72,6 @@ extern size_t pg_strxfrm_prefix(char *dest, const char *src, size_t destsize, extern size_t pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src, size_t srclen, pg_locale_t locale); -#ifdef USE_ICU -extern int32_t icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes); -extern int32_t icu_from_uchar(char **result, const UChar *buff_uchar, int32_t len_uchar); -#endif extern void check_icu_locale(const char *icu_locale); /* 
These functions convert from/to libc's wchar_t, *not* pg_wchar_t */ diff --git a/src/include/utils/pg_locale_internal.h b/src/include/utils/pg_locale_internal.h new file mode 100644 index 0000000000..33465ad92d --- /dev/null +++ b/src/include/utils/pg_locale_internal.h @@ -0,0 +1,68 @@ +/*----------------------------------------------------------------------- + * + * PostgreSQL locale utilities + * + * src/include/utils/pg_locale_internal.h + * + * Copyright (c) 2002-2022, PostgreSQL Global Development Group + * + *----------------------------------------------------------------------- + */ + + +#ifndef _PG_LOCALE_INTERNAL_ +#define _PG_LOCALE_INTERNAL_ + +#ifdef USE_ICU +#include <unicode/ucol.h> +#endif + +#ifdef USE_ICU +/* + * ucol_strcollUTF8() was introduced in ICU 50, but it is buggy before ICU 53. + * (see + * <https://www.postgresql.org/message-id/flat/f1438ec6-22aa-4029-9a3b-26f79d330e72%40manitou-mail.org>) + */ +#if U_ICU_VERSION_MAJOR_NUM >= 53 +#define HAVE_UCOL_STRCOLLUTF8 1 +#else +#undef HAVE_UCOL_STRCOLLUTF8 +#endif +#endif + +/* + * We define our own wrapper around locale_t so we can keep the same + * function signatures for all builds, while not having to create a + * fake version of the standard type locale_t in the global namespace. + * pg_locale_t is occasionally checked for truth, so make it a pointer. + */ +struct pg_locale_struct +{ + char provider; + bool deterministic; + char *collate; + char *ctype; + union + { +#ifdef HAVE_LOCALE_T + struct + { + locale_t lt; + } libc; +#endif +#ifdef USE_ICU + struct + { + UCollator *ucol; + } icu; +#endif + int dummy; /* in case we have neither LOCALE_T nor ICU */ + } info; +}; + +#ifdef USE_ICU +extern int32_t icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes); +extern int32_t icu_from_uchar(char **result, const UChar *buff_uchar, int32_t len_uchar); +#endif + +#endif /* _PG_LOCALE_INTERNAL_ */ -- 2.34.1
From 954ddd0f8b3adb3d66e9b12adc6fc021393e2039 Mon Sep 17 00:00:00 2001 From: Jeff Davis <j...@j-davis.com> Date: Thu, 1 Dec 2022 14:45:15 -0800 Subject: [PATCH v8 1/2] Add pg_strcoll(), pg_strxfrm(), and variants. In preparation for multi-lib ICU, which should be based on a clean separation of the routines required for collation providers. Also offers a generally better separation of responsibilities. Callers with NUL-terminated strings should call pg_strcoll() or pg_strxfrm(); callers with strings and their length should call the variants pg_strncoll() or pg_strnxfrm(). Also remove the TRUST_STRXFRM define, and replace with a developer GUC for easier testing. --- src/backend/access/hash/hashfunc.c | 45 +- src/backend/utils/adt/pg_locale.c | 783 +++++++++++++++++++++++++++- src/backend/utils/adt/varchar.c | 41 +- src/backend/utils/adt/varlena.c | 368 ++----------- src/backend/utils/misc/guc_tables.c | 11 + src/include/utils/pg_locale.h | 14 + 6 files changed, 874 insertions(+), 388 deletions(-) diff --git a/src/backend/access/hash/hashfunc.c b/src/backend/access/hash/hashfunc.c index e3e40d6c21..c0ed995919 100644 --- a/src/backend/access/hash/hashfunc.c +++ b/src/backend/access/hash/hashfunc.c @@ -292,21 +292,19 @@ hashtext(PG_FUNCTION_ARGS) #ifdef USE_ICU if (mylocale->provider == COLLPROVIDER_ICU) { - int32_t ulen = -1; - UChar *uchar = NULL; - Size bsize; - uint8_t *buf; + Size bsize, rsize; + char *buf; + const char *keydata = VARDATA_ANY(key); + size_t keylen = VARSIZE_ANY_EXHDR(key); - ulen = icu_to_uchar(&uchar, VARDATA_ANY(key), VARSIZE_ANY_EXHDR(key)); - - bsize = ucol_getSortKey(mylocale->info.icu.ucol, - uchar, ulen, NULL, 0); + bsize = pg_strnxfrm(NULL, 0, keydata, keylen, mylocale); buf = palloc(bsize); - ucol_getSortKey(mylocale->info.icu.ucol, - uchar, ulen, buf, bsize); - pfree(uchar); - result = hash_any(buf, bsize); + rsize = pg_strnxfrm(buf, bsize, keydata, keylen, mylocale); + if (rsize != bsize) + elog(ERROR, "pg_strnxfrm() returned unexpected 
result"); + + result = hash_any((uint8_t *) buf, bsize); pfree(buf); } @@ -350,21 +348,20 @@ hashtextextended(PG_FUNCTION_ARGS) #ifdef USE_ICU if (mylocale->provider == COLLPROVIDER_ICU) { - int32_t ulen = -1; - UChar *uchar = NULL; - Size bsize; - uint8_t *buf; + Size bsize, rsize; + char *buf; + const char *keydata = VARDATA_ANY(key); + size_t keylen = VARSIZE_ANY_EXHDR(key); - ulen = icu_to_uchar(&uchar, VARDATA_ANY(key), VARSIZE_ANY_EXHDR(key)); - - bsize = ucol_getSortKey(mylocale->info.icu.ucol, - uchar, ulen, NULL, 0); + bsize = pg_strnxfrm(NULL, 0, keydata, keylen, mylocale); buf = palloc(bsize); - ucol_getSortKey(mylocale->info.icu.ucol, - uchar, ulen, buf, bsize); - pfree(uchar); - result = hash_any_extended(buf, bsize, PG_GETARG_INT64(1)); + rsize = pg_strnxfrm(buf, bsize, keydata, keylen, mylocale); + if (rsize != bsize) + elog(ERROR, "pg_strnxfrm() returned unexpected result"); + + result = hash_any_extended((uint8_t *) buf, bsize, + PG_GETARG_INT64(1)); pfree(buf); } diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index 059e4fd79f..509ef0784e 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -79,6 +79,12 @@ #include <shlwapi.h> #endif +/* + * This should be large enough that most strings will fit, but small enough + * that we feel comfortable putting it on the stack + */ +#define TEXTBUFLEN 1024 + #define MAX_L10N_DATA 80 @@ -88,6 +94,9 @@ char *locale_monetary; char *locale_numeric; char *locale_time; +/* GUC to enable use of strxfrm() for abbreviated keys */ +bool trust_strxfrm = false; + /* * lc_time localization cache. * @@ -123,6 +132,19 @@ static char *IsoLocaleName(const char *); #endif #ifdef USE_ICU +/* + * Converter object for converting between ICU's UChar strings and C strings + * in database encoding. Since the database encoding doesn't change, we only + * need one of these per session. 
+ */ +static UConverter *icu_converter = NULL; + +static void init_icu_converter(void); +static size_t uchar_length(UConverter *converter, + const char *str, size_t len); +static int32_t uchar_convert(UConverter *converter, + UChar *dest, int32_t destlen, + const char *str, size_t srclen); static void icu_set_collation_attributes(UCollator *collator, const char *loc); #endif @@ -1731,15 +1753,716 @@ get_collation_actual_version(char collprovider, const char *collcollate) return collversion; } +/* + * pg_strncoll_libc_win32_utf8 + * + * Win32 does not have UTF-8. Convert UTF8 arguments to wide characters and + * invoke wcscoll() or wcscoll_l(). + */ +#ifdef WIN32 +static int +pg_strncoll_libc_win32_utf8(const char *arg1, size_t len1, const char *arg2, + size_t len2, pg_locale_t locale) +{ + char sbuf[TEXTBUFLEN]; + char *buf = sbuf; + char *a1p, + *a2p; + int a1len = len1 * 2 + 2; + int a2len = len2 * 2 + 2; + int r; + int result; + + Assert(!locale || locale->provider == COLLPROVIDER_LIBC); + Assert(GetDatabaseEncoding() == PG_UTF8); +#ifndef WIN32 + Assert(false); +#endif + + if (a1len + a2len > TEXTBUFLEN) + buf = palloc(a1len + a2len); + + a1p = buf; + a2p = buf + a1len; + + /* API does not work for zero-length input */ + if (len1 == 0) + r = 0; + else + { + r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1, + (LPWSTR) a1p, a1len / 2); + if (!r) + ereport(ERROR, + (errmsg("could not convert string to UTF-16: error code %lu", + GetLastError()))); + } + ((LPWSTR) a1p)[r] = 0; + + if (len2 == 0) + r = 0; + else + { + r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2, + (LPWSTR) a2p, a2len / 2); + if (!r) + ereport(ERROR, + (errmsg("could not convert string to UTF-16: error code %lu", + GetLastError()))); + } + ((LPWSTR) a2p)[r] = 0; + + errno = 0; +#ifdef HAVE_LOCALE_T + if (locale) + result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, locale->info.lt); + else +#endif + result = wcscoll((LPWSTR) a1p, (LPWSTR) a2p); + if (result == 2147483647) /* _NLSCMPERROR; missing from 
mingw + * headers */ + ereport(ERROR, + (errmsg("could not compare Unicode strings: %m"))); + + if (buf != sbuf) + pfree(buf); + + return result; +} +#endif /* WIN32 */ + +/* + * pg_strcoll_libc + * + * Call strcoll(), strcoll_l(), wcscoll(), or wcscoll_l() as appropriate for + * the given locale, platform, and database encoding. If the locale is NULL, + * use the database collation. + * + * Arguments must be encoded in the database encoding and nul-terminated. + */ +static int +pg_strcoll_libc(const char *arg1, const char *arg2, pg_locale_t locale) +{ + int result; + + Assert(!locale || locale->provider == COLLPROVIDER_LIBC); +#ifdef WIN32 + if (GetDatabaseEncoding() == PG_UTF8) + { + size_t len1 = strlen(arg1); + size_t len2 = strlen(arg2); + result = pg_strncoll_libc_win32_utf8(arg1, len1, arg2, len2, locale); + } + else +#endif /* WIN32 */ + if (locale) + { +#ifdef HAVE_LOCALE_T + result = strcoll_l(arg1, arg2, locale->info.lt); +#else + /* shouldn't happen */ + elog(ERROR, "unsupported collprovider: %c", locale->provider); +#endif + } + else + result = strcoll(arg1, arg2); + + return result; +} + +/* + * pg_strncoll_libc + * + * Null-terminate the arguments and call pg_strcoll_libc(). 
+ */ +static int +pg_strncoll_libc(const char *arg1, size_t len1, const char *arg2, size_t len2, + pg_locale_t locale) +{ + char sbuf[TEXTBUFLEN]; + char *buf = sbuf; + size_t bufsize1 = len1 + 1; + size_t bufsize2 = len2 + 1; + char *arg1n; + char *arg2n; + int result; + + Assert(!locale || locale->provider == COLLPROVIDER_LIBC); + +#ifdef WIN32 + /* check for this case before doing the work for nul-termination */ + if (GetDatabaseEncoding() == PG_UTF8) + return pg_strncoll_libc_win32_utf8(arg1, len1, arg2, len2, locale); +#endif /* WIN32 */ + + if (bufsize1 + bufsize2 > TEXTBUFLEN) + buf = palloc(bufsize1 + bufsize2); + + arg1n = buf; + arg2n = buf + bufsize1; + + /* nul-terminate arguments */ + memcpy(arg1n, arg1, len1); + arg1n[len1] = '\0'; + memcpy(arg2n, arg2, len2); + arg2n[len2] = '\0'; + + result = pg_strcoll_libc(arg1n, arg2n, locale); + + if (buf != sbuf) + pfree(buf); + + return result; +} #ifdef USE_ICU + /* - * Converter object for converting between ICU's UChar strings and C strings - * in database encoding. Since the database encoding doesn't change, we only - * need one of these per session. + * pg_strncoll_icu_no_utf8 + * + * Convert the arguments from the database encoding to UChar strings, then + * call ucol_strcoll(). + * + * When the database encoding is UTF-8, and ICU supports ucol_strcollUTF8(), + * caller should call that instead. 
*/ -static UConverter *icu_converter = NULL; +static int +pg_strncoll_icu_no_utf8(const char *arg1, size_t len1, + const char *arg2, size_t len2, pg_locale_t locale) +{ + char sbuf[TEXTBUFLEN]; + char *buf = sbuf; + int32_t ulen1; + int32_t ulen2; + size_t bufsize1; + size_t bufsize2; + UChar *uchar1, + *uchar2; + int result; + + Assert(locale->provider == COLLPROVIDER_ICU); +#ifdef HAVE_UCOL_STRCOLLUTF8 + Assert(GetDatabaseEncoding() != PG_UTF8); +#endif + + init_icu_converter(); + + ulen1 = uchar_length(icu_converter, arg1, len1); + ulen2 = uchar_length(icu_converter, arg2, len2); + + bufsize1 = (ulen1 + 1) * sizeof(UChar); + bufsize2 = (ulen2 + 1) * sizeof(UChar); + + if (bufsize1 + bufsize2 > TEXTBUFLEN) + buf = palloc(bufsize1 + bufsize2); + + uchar1 = (UChar *) buf; + uchar2 = (UChar *) (buf + bufsize1); + + ulen1 = uchar_convert(icu_converter, uchar1, ulen1 + 1, arg1, len1); + ulen2 = uchar_convert(icu_converter, uchar2, ulen2 + 1, arg2, len2); + + result = ucol_strcoll(locale->info.icu.ucol, + uchar1, ulen1, + uchar2, ulen2); + + if (buf != sbuf) + pfree(buf); + + return result; +} + +/* + * pg_strncoll_icu + * + * Call ucol_strcollUTF8() or ucol_strcoll() as appropriate for the given + * database encoding. + * + * Arguments must be encoded in the database encoding. 
+ */ +static int +pg_strncoll_icu(const char *arg1, size_t len1, const char *arg2, size_t len2, + pg_locale_t locale) +{ + int result; + + Assert(locale->provider == COLLPROVIDER_ICU); + +#ifdef HAVE_UCOL_STRCOLLUTF8 + if (GetDatabaseEncoding() == PG_UTF8) + { + UErrorCode status; + + status = U_ZERO_ERROR; + result = ucol_strcollUTF8(locale->info.icu.ucol, + arg1, len1, + arg2, len2, + &status); + if (U_FAILURE(status)) + ereport(ERROR, + (errmsg("collation failed: %s", u_errorName(status)))); + } + else +#endif + { + result = pg_strncoll_icu_no_utf8(arg1, len1, arg2, len2, locale); + } + + return result; +} + +/* + * pg_strcoll_icu + * + * Calculate the string lengths and call pg_strncoll_icu(). + */ +static int +pg_strcoll_icu(const char *arg1, const char *arg2, pg_locale_t locale) +{ + Assert(locale->provider == COLLPROVIDER_ICU); + return pg_strncoll_icu(arg1, -1, arg2, -1, locale); +} + +#endif /* USE_ICU */ + +/* + * pg_strcoll + * + * Call ucol_strcollUTF8(), ucol_strcoll(), strcoll(), strcoll_l(), wcscoll(), + * or wcscoll_l() as appropriate for the given locale, platform, and database + * encoding. If the locale is not specified, use the database collation. + * + * Arguments must be encoded in the database encoding and nul-terminated. + * + * The caller is responsible for breaking ties if the collation is + * deterministic; this maintains consistency with pg_strxfrm(), which cannot + * easily account for deterministic collations. 
+ */ +int +pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale) +{ + int result; + + if (!locale || locale->provider == COLLPROVIDER_LIBC) + result = pg_strcoll_libc(arg1, arg2, locale); +#ifdef USE_ICU + else if (locale->provider == COLLPROVIDER_ICU) + result = pg_strcoll_icu(arg1, arg2, locale); +#endif + else + /* shouldn't happen */ + elog(ERROR, "unsupported collprovider: %c", locale->provider); + + return result; +} + +/* + * pg_strncoll + * + * Call ucol_strcollUTF8(), ucol_strcoll(), strcoll(), strcoll_l(), wcscoll(), + * or wcscoll_l() as appropriate for the given locale, platform, and database + * encoding. If the locale is not specified, use the database collation. + * + * Arguments must be encoded in the database encoding. + * + * This function may need to nul-terminate the arguments for libc functions; + * so if the caller already has nul-terminated strings, it should call + * pg_strcoll() instead. + * + * The caller is responsible for breaking ties if the collation is + * deterministic; this maintains consistency with pg_strnxfrm(), which cannot + * easily account for deterministic collations. 
+ */ +int +pg_strncoll(const char *arg1, size_t len1, const char *arg2, size_t len2, + pg_locale_t locale) +{ + int result; + + if (!locale || locale->provider == COLLPROVIDER_LIBC) + result = pg_strncoll_libc(arg1, len1, arg2, len2, locale); +#ifdef USE_ICU + else if (locale->provider == COLLPROVIDER_ICU) + result = pg_strncoll_icu(arg1, len1, arg2, len2, locale); +#endif + else + /* shouldn't happen */ + elog(ERROR, "unsupported collprovider: %c", locale->provider); + + return result; +} + + +static size_t +pg_strxfrm_libc(char *dest, const char *src, size_t destsize, + pg_locale_t locale) +{ + Assert(!locale || locale->provider == COLLPROVIDER_LIBC); + +#ifdef TRUST_STXFRM +#ifdef HAVE_LOCALE_T + if (locale) + return strxfrm_l(dest, src, destsize, locale->info.lt); + else +#endif + return strxfrm(dest, src, destsize); +#else + /* shouldn't happen */ + elog(ERROR, "unsupported collprovider: %c", locale->provider); +#endif +} + +static size_t +pg_strnxfrm_libc(char *dest, const char *src, size_t srclen, size_t destsize, + pg_locale_t locale) +{ + char sbuf[TEXTBUFLEN]; + char *buf = sbuf; + size_t bufsize = srclen + 1; + size_t result; + Assert(!locale || locale->provider == COLLPROVIDER_LIBC); + + if (bufsize > TEXTBUFLEN) + buf = palloc(bufsize); + + /* nul-terminate arguments */ + memcpy(buf, src, srclen); + buf[srclen] = '\0'; + + result = pg_strxfrm_libc(dest, buf, destsize, locale); + + if (buf != sbuf) + pfree(buf); + + return result; +} + +static size_t +pg_strnxfrm_prefix_libc(char *dest, const char *src, size_t srclen, + size_t destsize, pg_locale_t locale) +{ + Assert(!locale || locale->provider == COLLPROVIDER_LIBC); + /* unsupported; shouldn't happen */ + elog(ERROR, "collprovider '%c' does not support pg_strnxfrm_prefix()", + locale->provider); +} + +#ifdef USE_ICU + +static size_t +pg_strnxfrm_icu(char *dest, const char *src, size_t srclen, size_t destsize, + pg_locale_t locale) +{ + char sbuf[TEXTBUFLEN]; + char *buf = sbuf; + UChar *uchar; + 
int32_t ulen; + size_t uchar_bsize; + Size result_bsize; + + Assert(locale->provider == COLLPROVIDER_ICU); + + init_icu_converter(); + + ulen = uchar_length(icu_converter, src, srclen); + + uchar_bsize = (ulen + 1) * sizeof(UChar); + + if (uchar_bsize > TEXTBUFLEN) + buf = palloc(uchar_bsize); + + uchar = (UChar *) buf; + + ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen); + + result_bsize = ucol_getSortKey(locale->info.icu.ucol, + uchar, ulen, + (uint8_t *) dest, destsize); + + if (buf != sbuf) + pfree(buf); + + return result_bsize; +} + +static size_t +pg_strxfrm_icu(char *dest, const char *src, size_t destsize, + pg_locale_t locale) +{ + Assert(locale->provider == COLLPROVIDER_ICU); + return pg_strnxfrm_icu(dest, src, -1, destsize, locale); +} + +static size_t +pg_strnxfrm_prefix_icu_no_utf8(char *dest, const char *src, size_t srclen, + size_t destsize, pg_locale_t locale) +{ + char sbuf[TEXTBUFLEN]; + char *buf = sbuf; + UCharIterator iter; + uint32_t state[2]; + UErrorCode status; + int32_t ulen = -1; + UChar *uchar = NULL; + size_t uchar_bsize; + Size result_bsize; + + Assert(locale->provider == COLLPROVIDER_ICU); + Assert(GetDatabaseEncoding() != PG_UTF8); + + init_icu_converter(); + + ulen = uchar_length(icu_converter, src, srclen); + + uchar_bsize = (ulen + 1) * sizeof(UChar); + + if (uchar_bsize > TEXTBUFLEN) + buf = palloc(uchar_bsize); + + uchar = (UChar *) buf; + + ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen); + + uiter_setString(&iter, uchar, ulen); + state[0] = state[1] = 0; /* won't need that again */ + status = U_ZERO_ERROR; + result_bsize = ucol_nextSortKeyPart(locale->info.icu.ucol, + &iter, + state, + (uint8_t *) dest, + destsize, + &status); + if (U_FAILURE(status)) + ereport(ERROR, + (errmsg("sort key generation failed: %s", + u_errorName(status)))); + + return result_bsize; +} + +static size_t +pg_strnxfrm_prefix_icu(char *dest, const char *src, size_t srclen, + size_t destsize, pg_locale_t locale) +{ + 
size_t result; + + Assert(locale->provider == COLLPROVIDER_ICU); + + if (GetDatabaseEncoding() == PG_UTF8) + { + UCharIterator iter; + uint32_t state[2]; + UErrorCode status; + + uiter_setUTF8(&iter, src, srclen); + state[0] = state[1] = 0; /* won't need that again */ + status = U_ZERO_ERROR; + result = ucol_nextSortKeyPart(locale->info.icu.ucol, + &iter, + state, + (uint8_t *) dest, + destsize, + &status); + if (U_FAILURE(status)) + ereport(ERROR, + (errmsg("sort key generation failed: %s", + u_errorName(status)))); + } + else + result = pg_strnxfrm_prefix_icu_no_utf8(dest, src, srclen, destsize, + locale); + + return result; +} + +static size_t +pg_strxfrm_prefix_icu(char *dest, const char *src, size_t destsize, + pg_locale_t locale) +{ + Assert(locale->provider == COLLPROVIDER_ICU); + return pg_strnxfrm_prefix_icu(dest, src, -1, destsize, locale); +} + +#endif + +/* + * Return true if the collation provider supports pg_strxfrm() and + * pg_strnxfrm(); otherwise false. + * + * Unfortunately, it seems that strxfrm() for non-C collations is broken on + * many common platforms; testing of multiple versions of glibc reveals that, + * for many locales, strcoll() and strxfrm() do not return consistent + * results. While no other libc other than Cygwin has so far been shown to + * have a problem, we take the conservative course of action for right now and + * disable this categorically. (Users who are certain this isn't a problem on + * their system can set the developer GUC "trust_strxfrm".) + * + * No similar problem is known for the ICU provider. 
+ */ +bool +pg_strxfrm_enabled(pg_locale_t locale) +{ + if (!locale || locale->provider == COLLPROVIDER_LIBC) + return trust_strxfrm; + else if (locale->provider == COLLPROVIDER_ICU) + return true; + else + /* shouldn't happen */ + elog(ERROR, "unsupported collprovider: %c", locale->provider); +} + +/* + * pg_strxfrm + * + * Transforms 'src' to a nul-terminated string stored in 'dest' such that + * ordinary strcmp() on transformed strings is equivalent to pg_strcoll() on + * untransformed strings. + * + * The provided 'src' must be nul-terminated. + * + * If destsize is large enough to hold the result, returns the number of bytes + * copied to 'dest'; otherwise, returns the number of bytes needed to hold the + * result and leaves the contents of 'dest' undefined. If destsize is zero, + * 'dest' may be NULL. + */ +size_t +pg_strxfrm(char *dest, const char *src, size_t destsize, pg_locale_t locale) +{ + size_t result; + + if (!locale || locale->provider == COLLPROVIDER_LIBC) + result = pg_strxfrm_libc(dest, src, destsize, locale); +#ifdef USE_ICU + else if (locale->provider == COLLPROVIDER_ICU) + result = pg_strxfrm_icu(dest, src, destsize, locale); +#endif + else + /* shouldn't happen */ + elog(ERROR, "unsupported collprovider: %c", locale->provider); + + return result; +} + +/* + * pg_strnxfrm + * + * Transforms 'src' to a nul-terminated string stored in 'dest' such that + * ordinary strcmp() on transformed strings is equivalent to pg_strcoll() on + * untransformed strings. + * + * If destsize is large enough to hold the result, returns the number of bytes + * copied to 'dest'; otherwise, returns the number of bytes needed to hold the + * result and leaves the contents of 'dest' undefined. If destsize is zero, + * 'dest' may be NULL. + * + * This function may need to nul-terminate the argument for libc functions; + * so if the caller already has a nul-terminated string, it should call + * pg_strxfrm() instead. 
+ */ +size_t +pg_strnxfrm(char *dest, size_t destsize, const char *src, size_t srclen, + pg_locale_t locale) +{ + size_t result; + + if (!locale || locale->provider == COLLPROVIDER_LIBC) + result = pg_strnxfrm_libc(dest, src, srclen, destsize, locale); +#ifdef USE_ICU + else if (locale->provider == COLLPROVIDER_ICU) + result = pg_strnxfrm_icu(dest, src, srclen, destsize, locale); +#endif + else + /* shouldn't happen */ + elog(ERROR, "unsupported collprovider: %c", locale->provider); + + return result; +} + +/* + * Return true if the collation provider supports pg_strxfrm_prefix() and + * pg_strnxfrm_prefix(); otherwise false. + */ +bool +pg_strxfrm_prefix_enabled(pg_locale_t locale) +{ + if (!locale || locale->provider == COLLPROVIDER_LIBC) + return false; + else if (locale->provider == COLLPROVIDER_ICU) + return true; + else + /* shouldn't happen */ + elog(ERROR, "unsupported collprovider: %c", locale->provider); +} + +/* + * pg_strxfrm_prefix + * + * Transforms 'src' to a byte sequence stored in 'dest' such that ordinary + * memcmp() on the byte sequence is equivalent to pg_strcoll() on + * untransformed strings. The result is not nul-terminated. + * + * The provided 'src' must be nul-terminated. + * + * If destsize is not large enough to hold the entire result, stores just the + * prefix in 'dest'. Returns the number of bytes actually copied to 'dest'. 
+ */ +size_t +pg_strxfrm_prefix(char *dest, const char *src, size_t destsize, + pg_locale_t locale) +{ + size_t result; + + if (!locale || locale->provider == COLLPROVIDER_LIBC) + elog(ERROR, "collprovider '%c' does not support pg_strxfrm_prefix()", + locale->provider); +#ifdef USE_ICU + else if (locale->provider == COLLPROVIDER_ICU) + result = pg_strxfrm_prefix_icu(dest, src, destsize, locale); +#endif + else + /* shouldn't happen */ + elog(ERROR, "unsupported collprovider: %c", locale->provider); + + return result; +} + +/* + * pg_strnxfrm_prefix + * + * Transforms 'src' to a byte sequence stored in 'dest' such that ordinary + * memcmp() on the byte sequence is equivalent to pg_strcoll() on + * untransformed strings. The result is not nul-terminated. + * + * The provided 'src' need not be nul-terminated; its length is 'srclen'. + * + * If destsize is not large enough to hold the entire result, stores just the + * prefix in 'dest'. Returns the number of bytes actually copied to 'dest'. + * + * This function may need to nul-terminate the argument for libc functions; + * so if the caller already has a nul-terminated string, it should call + * pg_strxfrm_prefix() instead. + */ +size_t +pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src, + size_t srclen, pg_locale_t locale) +{ + size_t result; + + if (!locale || locale->provider == COLLPROVIDER_LIBC) + result = pg_strnxfrm_prefix_libc(dest, src, srclen, destsize, locale); +#ifdef USE_ICU + else if (locale->provider == COLLPROVIDER_ICU) + result = pg_strnxfrm_prefix_icu(dest, src, srclen, destsize, locale); +#endif + else + /* shouldn't happen */ + elog(ERROR, "unsupported collprovider: %c", locale->provider); + + return result; +} + +#ifdef USE_ICU static void init_icu_converter(void) { @@ -1767,6 +2490,39 @@ init_icu_converter(void) icu_converter = conv; } +/* + * Find length, in UChars, of given string if converted to UChar string.
+ */
+static size_t
+uchar_length(UConverter *converter, const char *str, size_t len)
+{
+	UErrorCode	status = U_ZERO_ERROR;
+	int32_t		ulen;
+
+	/*
+	 * No destination buffer is supplied (NULL, capacity 0), so only the
+	 * length is obtained; U_BUFFER_OVERFLOW_ERROR is therefore tolerated
+	 * below rather than treated as a failure.
+	 */
+	ulen = ucnv_toUChars(converter, NULL, 0, str, len, &status);
+	if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
+		ereport(ERROR,
+				(errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
+	return ulen;
+}
+
+/*
+ * Convert the given source string into a UChar string, stored in dest, and
+ * return the length (in UChars).
+ *
+ * 'destlen' is the capacity of 'dest' passed through to ucnv_toUChars().
+ * Any ICU failure status, including a buffer overflow, is reported as an
+ * error.
+ */
+static int32_t
+uchar_convert(UConverter *converter, UChar *dest, int32_t destlen,
+			  const char *src, size_t srclen)
+{
+	UErrorCode	status = U_ZERO_ERROR;
+	int32_t		ulen;
+
+	ulen = ucnv_toUChars(converter, dest, destlen, src, srclen, &status);
+	if (U_FAILURE(status))
+		ereport(ERROR,
+				(errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
+	return ulen;
+}
+
 /*
  * Convert a string in the database encoding into a string of UChars.
  *
@@ -1782,26 +2538,15 @@ init_icu_converter(void)
 int32_t
 icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes)
 {
-	UErrorCode	status;
-	int32_t		len_uchar;
+	int32_t len_uchar;
 
 	init_icu_converter();
 
-	status = U_ZERO_ERROR;
-	len_uchar = ucnv_toUChars(icu_converter, NULL, 0,
-							  buff, nbytes, &status);
-	if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
-		ereport(ERROR,
-				(errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
+	len_uchar = uchar_length(icu_converter, buff, nbytes);
 
 	*buff_uchar = palloc((len_uchar + 1) * sizeof(**buff_uchar));
-
-	status = U_ZERO_ERROR;
-	len_uchar = ucnv_toUChars(icu_converter, *buff_uchar, len_uchar + 1,
-							  buff, nbytes, &status);
-	if (U_FAILURE(status))
-		ereport(ERROR,
-				(errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
+	len_uchar = uchar_convert(icu_converter,
+							  *buff_uchar, len_uchar + 1, buff, nbytes);
 
 	return len_uchar;
 }
diff --git a/src/backend/utils/adt/varchar.c b/src/backend/utils/adt/varchar.c
index 8ddbae8f51..5bbba94687
100644 --- a/src/backend/utils/adt/varchar.c +++ b/src/backend/utils/adt/varchar.c @@ -1024,21 +1024,17 @@ hashbpchar(PG_FUNCTION_ARGS) #ifdef USE_ICU if (mylocale->provider == COLLPROVIDER_ICU) { - int32_t ulen = -1; - UChar *uchar = NULL; - Size bsize; - uint8_t *buf; + Size bsize, rsize; + char *buf; - ulen = icu_to_uchar(&uchar, keydata, keylen); - - bsize = ucol_getSortKey(mylocale->info.icu.ucol, - uchar, ulen, NULL, 0); + bsize = pg_strnxfrm(NULL, 0, keydata, keylen, mylocale); buf = palloc(bsize); - ucol_getSortKey(mylocale->info.icu.ucol, - uchar, ulen, buf, bsize); - pfree(uchar); - result = hash_any(buf, bsize); + rsize = pg_strnxfrm(buf, bsize, keydata, keylen, mylocale); + if (rsize != bsize) + elog(ERROR, "pg_strnxfrm() returned unexpected result"); + + result = hash_any((uint8_t *) buf, bsize); pfree(buf); } @@ -1086,21 +1082,18 @@ hashbpcharextended(PG_FUNCTION_ARGS) #ifdef USE_ICU if (mylocale->provider == COLLPROVIDER_ICU) { - int32_t ulen = -1; - UChar *uchar = NULL; - Size bsize; - uint8_t *buf; + Size bsize, rsize; + char *buf; - ulen = icu_to_uchar(&uchar, keydata, keylen); - - bsize = ucol_getSortKey(mylocale->info.icu.ucol, - uchar, ulen, NULL, 0); + bsize = pg_strnxfrm(NULL, 0, keydata, keylen, mylocale); buf = palloc(bsize); - ucol_getSortKey(mylocale->info.icu.ucol, - uchar, ulen, buf, bsize); - pfree(uchar); - result = hash_any_extended(buf, bsize, PG_GETARG_INT64(1)); + rsize = pg_strnxfrm(buf, bsize, keydata, keylen, mylocale); + if (rsize != bsize) + elog(ERROR, "pg_strnxfrm() returned unexpected result"); + + result = hash_any_extended((uint8_t *) buf, bsize, + PG_GETARG_INT64(1)); pfree(buf); } diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index 33ffdb013a..5030328f31 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -1537,10 +1537,6 @@ varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid) } else { - char a1buf[TEXTBUFLEN]; - char 
a2buf[TEXTBUFLEN]; - char *a1p, - *a2p; pg_locale_t mylocale; mylocale = pg_newlocale_from_collation(collid); @@ -1557,171 +1553,16 @@ varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid) if (len1 == len2 && memcmp(arg1, arg2, len1) == 0) return 0; -#ifdef WIN32 - /* Win32 does not have UTF-8, so we need to map to UTF-16 */ - if (GetDatabaseEncoding() == PG_UTF8 - && (!mylocale || mylocale->provider == COLLPROVIDER_LIBC)) - { - int a1len; - int a2len; - int r; - - if (len1 >= TEXTBUFLEN / 2) - { - a1len = len1 * 2 + 2; - a1p = palloc(a1len); - } - else - { - a1len = TEXTBUFLEN; - a1p = a1buf; - } - if (len2 >= TEXTBUFLEN / 2) - { - a2len = len2 * 2 + 2; - a2p = palloc(a2len); - } - else - { - a2len = TEXTBUFLEN; - a2p = a2buf; - } - - /* stupid Microsloth API does not work for zero-length input */ - if (len1 == 0) - r = 0; - else - { - r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1, - (LPWSTR) a1p, a1len / 2); - if (!r) - ereport(ERROR, - (errmsg("could not convert string to UTF-16: error code %lu", - GetLastError()))); - } - ((LPWSTR) a1p)[r] = 0; - - if (len2 == 0) - r = 0; - else - { - r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2, - (LPWSTR) a2p, a2len / 2); - if (!r) - ereport(ERROR, - (errmsg("could not convert string to UTF-16: error code %lu", - GetLastError()))); - } - ((LPWSTR) a2p)[r] = 0; - - errno = 0; -#ifdef HAVE_LOCALE_T - if (mylocale) - result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, mylocale->info.lt); - else -#endif - result = wcscoll((LPWSTR) a1p, (LPWSTR) a2p); - if (result == 2147483647) /* _NLSCMPERROR; missing from mingw - * headers */ - ereport(ERROR, - (errmsg("could not compare Unicode strings: %m"))); - - /* Break tie if necessary. */ - if (result == 0 && - (!mylocale || mylocale->deterministic)) - { - result = memcmp(arg1, arg2, Min(len1, len2)); - if ((result == 0) && (len1 != len2)) - result = (len1 < len2) ? 
-1 : 1; - } - - if (a1p != a1buf) - pfree(a1p); - if (a2p != a2buf) - pfree(a2p); - - return result; - } -#endif /* WIN32 */ - - if (len1 >= TEXTBUFLEN) - a1p = (char *) palloc(len1 + 1); - else - a1p = a1buf; - if (len2 >= TEXTBUFLEN) - a2p = (char *) palloc(len2 + 1); - else - a2p = a2buf; - - memcpy(a1p, arg1, len1); - a1p[len1] = '\0'; - memcpy(a2p, arg2, len2); - a2p[len2] = '\0'; - - if (mylocale) - { - if (mylocale->provider == COLLPROVIDER_ICU) - { -#ifdef USE_ICU -#ifdef HAVE_UCOL_STRCOLLUTF8 - if (GetDatabaseEncoding() == PG_UTF8) - { - UErrorCode status; - - status = U_ZERO_ERROR; - result = ucol_strcollUTF8(mylocale->info.icu.ucol, - arg1, len1, - arg2, len2, - &status); - if (U_FAILURE(status)) - ereport(ERROR, - (errmsg("collation failed: %s", u_errorName(status)))); - } - else -#endif - { - int32_t ulen1, - ulen2; - UChar *uchar1, - *uchar2; - - ulen1 = icu_to_uchar(&uchar1, arg1, len1); - ulen2 = icu_to_uchar(&uchar2, arg2, len2); - - result = ucol_strcoll(mylocale->info.icu.ucol, - uchar1, ulen1, - uchar2, ulen2); - - pfree(uchar1); - pfree(uchar2); - } -#else /* not USE_ICU */ - /* shouldn't happen */ - elog(ERROR, "unsupported collprovider: %c", mylocale->provider); -#endif /* not USE_ICU */ - } - else - { -#ifdef HAVE_LOCALE_T - result = strcoll_l(a1p, a2p, mylocale->info.lt); -#else - /* shouldn't happen */ - elog(ERROR, "unsupported collprovider: %c", mylocale->provider); -#endif - } - } - else - result = strcoll(a1p, a2p); + result = pg_strncoll(arg1, len1, arg2, len2, mylocale); /* Break tie if necessary. */ if (result == 0 && (!mylocale || mylocale->deterministic)) - result = strcmp(a1p, a2p); - - if (a1p != a1buf) - pfree(a1p); - if (a2p != a2buf) - pfree(a2p); + { + result = memcmp(arg1, arg2, Min(len1, len2)); + if ((result == 0) && (len1 != len2)) + result = (len1 < len2) ? 
-1 : 1; + } } return result; @@ -2057,20 +1898,6 @@ varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid) */ locale = pg_newlocale_from_collation(collid); - /* - * There is a further exception on Windows. When the database - * encoding is UTF-8 and we are not using the C collation, complex - * hacks are required. We don't currently have a comparator that - * handles that case, so we fall back on the slow method of having the - * sort code invoke bttextcmp() (in the case of text) via the fmgr - * trampoline. ICU locales work just the same on Windows, however. - */ -#ifdef WIN32 - if (GetDatabaseEncoding() == PG_UTF8 && - !(locale && locale->provider == COLLPROVIDER_ICU)) - return; -#endif - /* * We use varlenafastcmp_locale except for type NAME. */ @@ -2086,13 +1913,7 @@ varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid) /* * Unfortunately, it seems that abbreviation for non-C collations is - * broken on many common platforms; testing of multiple versions of glibc - * reveals that, for many locales, strcoll() and strxfrm() do not return - * consistent results, which is fatal to this optimization. While no - * other libc other than Cygwin has so far been shown to have a problem, - * we take the conservative course of action for right now and disable - * this categorically. (Users who are certain this isn't a problem on - * their system can define TRUST_STRXFRM.) + * broken on many common platforms; see pg_strxfrm_enabled(). * * Even apart from the risk of broken locales, it's possible that there * are platforms where the use of abbreviated keys should be disabled at @@ -2105,10 +1926,8 @@ varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid) * categorically, we may still want or need to disable it for particular * platforms. 
*/ -#ifndef TRUST_STRXFRM - if (!collate_c && !(locale && locale->provider == COLLPROVIDER_ICU)) + if (!collate_c && !pg_strxfrm_enabled(locale)) abbreviate = false; -#endif /* * If we're using abbreviated keys, or if we're using a locale-aware @@ -2379,60 +2198,7 @@ varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup) return sss->last_returned; } - if (sss->locale) - { - if (sss->locale->provider == COLLPROVIDER_ICU) - { -#ifdef USE_ICU -#ifdef HAVE_UCOL_STRCOLLUTF8 - if (GetDatabaseEncoding() == PG_UTF8) - { - UErrorCode status; - - status = U_ZERO_ERROR; - result = ucol_strcollUTF8(sss->locale->info.icu.ucol, - a1p, len1, - a2p, len2, - &status); - if (U_FAILURE(status)) - ereport(ERROR, - (errmsg("collation failed: %s", u_errorName(status)))); - } - else -#endif - { - int32_t ulen1, - ulen2; - UChar *uchar1, - *uchar2; - - ulen1 = icu_to_uchar(&uchar1, a1p, len1); - ulen2 = icu_to_uchar(&uchar2, a2p, len2); - - result = ucol_strcoll(sss->locale->info.icu.ucol, - uchar1, ulen1, - uchar2, ulen2); - - pfree(uchar1); - pfree(uchar2); - } -#else /* not USE_ICU */ - /* shouldn't happen */ - elog(ERROR, "unsupported collprovider: %c", sss->locale->provider); -#endif /* not USE_ICU */ - } - else - { -#ifdef HAVE_LOCALE_T - result = strcoll_l(sss->buf1, sss->buf2, sss->locale->info.lt); -#else - /* shouldn't happen */ - elog(ERROR, "unsupported collprovider: %c", sss->locale->provider); -#endif - } - } - else - result = strcoll(sss->buf1, sss->buf2); + result = pg_strcoll(sss->buf1, sss->buf2, sss->locale); /* Break tie if necessary. 
*/ if (result == 0 && @@ -2455,6 +2221,7 @@ varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup) static Datum varstr_abbrev_convert(Datum original, SortSupport ssup) { + const size_t max_prefix_bytes = sizeof(Datum); VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra; VarString *authoritative = DatumGetVarStringPP(original); char *authoritative_data = VARDATA_ANY(authoritative); @@ -2467,7 +2234,7 @@ varstr_abbrev_convert(Datum original, SortSupport ssup) pres = (char *) &res; /* memset(), so any non-overwritten bytes are NUL */ - memset(pres, 0, sizeof(Datum)); + memset(pres, 0, max_prefix_bytes); len = VARSIZE_ANY_EXHDR(authoritative); /* Get number of bytes, ignoring trailing spaces */ @@ -2502,14 +2269,10 @@ varstr_abbrev_convert(Datum original, SortSupport ssup) * thing: explicitly consider string length. */ if (sss->collate_c) - memcpy(pres, authoritative_data, Min(len, sizeof(Datum))); + memcpy(pres, authoritative_data, Min(len, max_prefix_bytes)); else { Size bsize; -#ifdef USE_ICU - int32_t ulen = -1; - UChar *uchar = NULL; -#endif /* * We're not using the C collation, so fall back on strxfrm or ICU @@ -2527,7 +2290,7 @@ varstr_abbrev_convert(Datum original, SortSupport ssup) if (sss->last_len1 == len && sss->cache_blob && memcmp(sss->buf1, authoritative_data, len) == 0) { - memcpy(pres, sss->buf2, Min(sizeof(Datum), sss->last_len2)); + memcpy(pres, sss->buf2, Min(max_prefix_bytes, sss->last_len2)); /* No change affecting cardinality, so no hashing required */ goto done; } @@ -2535,81 +2298,49 @@ varstr_abbrev_convert(Datum original, SortSupport ssup) memcpy(sss->buf1, authoritative_data, len); /* - * Just like strcoll(), strxfrm() expects a NUL-terminated string. Not - * necessary for ICU, but doesn't hurt. + * pg_strxfrm() and pg_strxfrm_prefix expect NUL-terminated + * strings. */ sss->buf1[len] = '\0'; sss->last_len1 = len; -#ifdef USE_ICU - /* When using ICU and not UTF8, convert string to UChar. 
*/ - if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU && - GetDatabaseEncoding() != PG_UTF8) - ulen = icu_to_uchar(&uchar, sss->buf1, len); -#endif - - /* - * Loop: Call strxfrm() or ucol_getSortKey(), possibly enlarge buffer, - * and try again. Both of these functions have the result buffer - * content undefined if the result did not fit, so we need to retry - * until everything fits, even though we only need the first few bytes - * in the end. When using ucol_nextSortKeyPart(), however, we only - * ask for as many bytes as we actually need. - */ - for (;;) + if (pg_strxfrm_prefix_enabled(sss->locale)) { -#ifdef USE_ICU - if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU) + if (sss->buflen2 < max_prefix_bytes) { - /* - * When using UTF8, use the iteration interface so we only - * need to produce as many bytes as we actually need. - */ - if (GetDatabaseEncoding() == PG_UTF8) - { - UCharIterator iter; - uint32_t state[2]; - UErrorCode status; - - uiter_setUTF8(&iter, sss->buf1, len); - state[0] = state[1] = 0; /* won't need that again */ - status = U_ZERO_ERROR; - bsize = ucol_nextSortKeyPart(sss->locale->info.icu.ucol, - &iter, - state, - (uint8_t *) sss->buf2, - Min(sizeof(Datum), sss->buflen2), - &status); - if (U_FAILURE(status)) - ereport(ERROR, - (errmsg("sort key generation failed: %s", - u_errorName(status)))); - } - else - bsize = ucol_getSortKey(sss->locale->info.icu.ucol, - uchar, ulen, - (uint8_t *) sss->buf2, sss->buflen2); + sss->buflen2 = Max(max_prefix_bytes, + Min(sss->buflen2 * 2, MaxAllocSize)); + sss->buf2 = repalloc(sss->buf2, sss->buflen2); } - else -#endif -#ifdef HAVE_LOCALE_T - if (sss->locale && sss->locale->provider == COLLPROVIDER_LIBC) - bsize = strxfrm_l(sss->buf2, sss->buf1, - sss->buflen2, sss->locale->info.lt); - else -#endif - bsize = strxfrm(sss->buf2, sss->buf1, sss->buflen2); - - sss->last_len2 = bsize; - if (bsize < sss->buflen2) - break; + bsize = pg_strxfrm_prefix(sss->buf2, sss->buf1, + 
max_prefix_bytes, sss->locale); + } + else + { /* - * Grow buffer and retry. + * Loop: Call pg_strxfrm(), possibly enlarge buffer, and try + * again. The pg_strxfrm() function leaves the result buffer + * content undefined if the result did not fit, so we need to + * retry until everything fits, even though we only need the first + * few bytes in the end. */ - sss->buflen2 = Max(bsize + 1, - Min(sss->buflen2 * 2, MaxAllocSize)); - sss->buf2 = repalloc(sss->buf2, sss->buflen2); + for (;;) + { + bsize = pg_strxfrm(sss->buf2, sss->buf1, sss->buflen2, + sss->locale); + + sss->last_len2 = bsize; + if (bsize < sss->buflen2) + break; + + /* + * Grow buffer and retry. + */ + sss->buflen2 = Max(bsize + 1, + Min(sss->buflen2 * 2, MaxAllocSize)); + sss->buf2 = repalloc(sss->buf2, sss->buflen2); + } } /* @@ -2621,12 +2352,7 @@ varstr_abbrev_convert(Datum original, SortSupport ssup) * (Actually, even if there were NUL bytes in the blob it would be * okay. See remarks on bytea case above.) */ - memcpy(pres, sss->buf2, Min(sizeof(Datum), bsize)); - -#ifdef USE_ICU - if (uchar) - pfree(uchar); -#endif + memcpy(pres, sss->buf2, Min(max_prefix_bytes, bsize)); } /* diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index cd0fc2cb8f..4562b028c7 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -1954,6 +1954,17 @@ struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, + { + {"trust_strxfrm", PGC_USERSET, DEVELOPER_OPTIONS, + gettext_noop("Allow use of strxfrm() for abbreviated keys optimization for libc provider."), + NULL, + GUC_NOT_IN_SAMPLE + }, + &trust_strxfrm, + false, + NULL, NULL, NULL + }, + { {"data_sync_retry", PGC_POSTMASTER, ERROR_HANDLING_OPTIONS, gettext_noop("Whether to continue running after a failure to sync data files."), diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h index cede43440b..c08e6ce07f 100644 --- a/src/include/utils/pg_locale.h +++ 
b/src/include/utils/pg_locale.h @@ -40,6 +40,7 @@ extern PGDLLIMPORT char *locale_messages; extern PGDLLIMPORT char *locale_monetary; extern PGDLLIMPORT char *locale_numeric; extern PGDLLIMPORT char *locale_time; +extern PGDLLIMPORT bool trust_strxfrm; /* lc_time localization cache */ extern PGDLLIMPORT char *localized_abbrev_days[]; @@ -100,6 +101,19 @@ extern void make_icu_collator(const char *iculocstr, extern pg_locale_t pg_newlocale_from_collation(Oid collid); extern char *get_collation_actual_version(char collprovider, const char *collcollate); +extern int pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale); +extern int pg_strncoll(const char *arg1, size_t len1, + const char *arg2, size_t len2, pg_locale_t locale); +extern bool pg_strxfrm_enabled(pg_locale_t locale); +extern size_t pg_strxfrm(char *dest, const char *src, size_t destsize, + pg_locale_t locale); +extern size_t pg_strnxfrm(char *dest, size_t destsize, const char *src, + size_t srclen, pg_locale_t locale); +extern bool pg_strxfrm_prefix_enabled(pg_locale_t locale); +extern size_t pg_strxfrm_prefix(char *dest, const char *src, size_t destsize, + pg_locale_t locale); +extern size_t pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src, + size_t srclen, pg_locale_t locale); #ifdef USE_ICU extern int32_t icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes); -- 2.34.1