Hi, I'm bumping this thread on pgsql-hacker, hopefully it will drag some more opinions/discussions.
Should we try to fix this issue or not? This is clearly an upstream bug. It has been reported, including regression tests, but this doesn't move since 2 years now. If we choose not to fix it on our side using eg a workaround (see patch), I suppose this small bug should be documented somewhere so people are not lost alone in the wild. Opinions? Regards, Begin forwarded message: Date: Sat, 13 Jun 2020 00:43:22 +0200 From: Jehan-Guillaume de Rorthais <j...@dalibo.com> To: Thomas Munro <thomas.mu...@gmail.com>, Peter Geoghegan <p...@bowt.ie> Cc: Роман Литовченко <roman.lytovche...@gmail.com>, PostgreSQL mailing lists <pgsql-b...@lists.postgresql.org> Subject: Re: BUG #15285: Query used index over field with ICU collation in some cases wrongly return 0 rows On Fri, 12 Jun 2020 18:40:55 +0200 Jehan-Guillaume de Rorthais <j...@dalibo.com> wrote: > On Wed, 10 Jun 2020 00:29:33 +0200 > Jehan-Guillaume de Rorthais <j...@dalibo.com> wrote: > [...] > > After playing with ICU regression tests, I found functions ucol_strcollIter > > and ucol_nextSortKeyPart are safe. I'll do some performance tests and report > > here. > > I did some benchmarks. See attachment for the script and its header to > reproduce. > > It sorts 935895 french phrases from 0 to 122 chars with an average of 49. > Performance tests were done on current master HEAD (buggy) and using the patch > in attachment, relying on ucol_strcollIter. > > My preliminary test with ucol_getSortKey was catastrophic, as we might > expect. 15-17x slower than the current HEAD. So I removed it from actual > tests. I didn't try with ucol_nextSortKeyPart though. > > Using ucol_strcollIter performs ~20% slower than HEAD on UTF8 databases, but > this might be acceptable. Here are the numbers: > > DB Encoding HEAD strcollIter ratio > UTF8 2.74 3.27 1.19x > LATIN1 5.34 5.40 1.01x > > I plan to add a regression test soon. Please, find in attachment the second version of the patch, with a regression test. Regards, -- Jehan-Guillaume de Rorthais Dalibo
>From bb428135490caafe23562e3dcd9925d7d7c5a7be Mon Sep 17 00:00:00 2001 From: Jehan-Guillaume de Rorthais <j...@dalibo.com> Date: Thu, 11 Jun 2020 18:08:31 +0200 Subject: [PATCH] Replace buggy ucol_strcoll* funcs with ucol_strcollIter Functions ucol_strcoll and ucol_strcollUTF8 are breaking some collation rules. This leads to wrong sort order or wrong result from index using such collations. See bug report #15285 for details: http://postgr.es/m/153201618542.1404.3611626898935613264%40wrigleys.postgresql.org This fix replace ucol_strcoll* functions with ucol_strcollIter() which is not affected by this bug. Reported-by: Roman Lytovchenko Analysed-by: Peter Geoghegan Author: Jehan-Guillaume de Rorthais --- src/backend/utils/adt/varlena.c | 54 ++++++++++++------- src/include/utils/pg_locale.h | 14 ----- .../regress/expected/collate.icu.utf8.out | 12 +++++ src/test/regress/sql/collate.icu.utf8.sql | 8 +++ 4 files changed, 56 insertions(+), 32 deletions(-) diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index 2eaabd6231..f156c00a1a 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -1638,34 +1638,43 @@ varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid) if (mylocale->provider == COLLPROVIDER_ICU) { #ifdef USE_ICU -#ifdef HAVE_UCOL_STRCOLLUTF8 if (GetDatabaseEncoding() == PG_UTF8) { UErrorCode status; + UCharIterator iter1, + iter2; status = U_ZERO_ERROR; - result = ucol_strcollUTF8(mylocale->info.icu.ucol, - arg1, len1, - arg2, len2, - &status); + + uiter_setUTF8(&iter1, arg1, len1); + uiter_setUTF8(&iter2, arg2, len2); + + result = ucol_strcollIter(mylocale->info.icu.ucol, + &iter1, &iter2, &status); if (U_FAILURE(status)) ereport(ERROR, (errmsg("collation failed: %s", u_errorName(status)))); } else -#endif { + UErrorCode status; + UCharIterator iter1, + iter2; int32_t ulen1, ulen2; UChar *uchar1, *uchar2; + status = U_ZERO_ERROR; + ulen1 = icu_to_uchar(&uchar1, arg1, len1); ulen2 = icu_to_uchar(&uchar2, arg2, len2); - result = ucol_strcoll(mylocale->info.icu.ucol, - uchar1, ulen1, - uchar2, ulen2); + uiter_setString(&iter1, uchar1, ulen1); + uiter_setString(&iter2, uchar2, ulen2); + + result = ucol_strcollIter(mylocale->info.icu.ucol, + &iter1, &iter2, &status); pfree(uchar1); pfree(uchar2); @@ -2352,34 +2361,43 @@ varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup) if (sss->locale->provider == COLLPROVIDER_ICU) { #ifdef USE_ICU -#ifdef HAVE_UCOL_STRCOLLUTF8 if (GetDatabaseEncoding() == PG_UTF8) { UErrorCode status; + UCharIterator iter1, + iter2; status = U_ZERO_ERROR; - result = ucol_strcollUTF8(sss->locale->info.icu.ucol, - a1p, len1, - a2p, len2, - &status); + + uiter_setUTF8(&iter1, a1p, len1); + uiter_setUTF8(&iter2, a2p, len2); + + result = ucol_strcollIter(sss->locale->info.icu.ucol, + &iter1, &iter2, &status); if (U_FAILURE(status)) ereport(ERROR, (errmsg("collation failed: %s", u_errorName(status)))); } else -#endif { + UErrorCode status; + UCharIterator iter1, + iter2; int32_t ulen1, ulen2; UChar *uchar1, *uchar2; + status = U_ZERO_ERROR; + ulen1 = icu_to_uchar(&uchar1, a1p, len1); ulen2 = icu_to_uchar(&uchar2, a2p, len2); - result = ucol_strcoll(sss->locale->info.icu.ucol, - uchar1, ulen1, - uchar2, ulen2); + uiter_setString(&iter1, uchar1, ulen1); + uiter_setString(&iter2, uchar2, ulen2); + + result = ucol_strcollIter(sss->locale->info.icu.ucol, + &iter1, &iter2, &status); pfree(uchar1); pfree(uchar2); diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h index 9cb7d91ddf..2b3ec2c597 100644 --- a/src/include/utils/pg_locale.h +++ b/src/include/utils/pg_locale.h @@ -21,20 +21,6 @@ #include "utils/guc.h" -#ifdef USE_ICU -/* - * ucol_strcollUTF8() was introduced in ICU 50, but it is buggy before ICU 53. - * (see - * <https://www.postgresql.org/message-id/flat/f1438ec6-22aa-4029-9a3b-26f79d330e72%40manitou-mail.org>) - */ -#if U_ICU_VERSION_MAJOR_NUM >= 53 -#define HAVE_UCOL_STRCOLLUTF8 1 -#else -#undef HAVE_UCOL_STRCOLLUTF8 -#endif -#endif - - /* GUC settings */ extern char *locale_messages; extern char *locale_monetary; diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out index 2b86ce9028..06cdb948eb 100644 --- a/src/test/regress/expected/collate.icu.utf8.out +++ b/src/test/regress/expected/collate.icu.utf8.out @@ -1900,6 +1900,18 @@ SELECT (SELECT count(*) FROM test33_0) <> (SELECT count(*) FROM test33_1); t (1 row) +CREATE COLLATION digitslast (provider = icu, locale = '@colReorder=latn-digit'); +CREATE TABLE test34 (b CHAR(4) NOT NULL COLLATE digitslast); +INSERT INTO test34 VALUES ('0000'), ('0001'), ('ABCD'); +CREATE INDEX ON test34(b); +SET enable_seqscan TO off; +SELECT * FROM test34 WHERE b = '0000' ; + b +------ + 0000 +(1 row) + +RESET enable_seqscan; -- cleanup RESET search_path; SET client_min_messages TO warning; diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql index 67de7d9794..50d0be70a8 100644 --- a/src/test/regress/sql/collate.icu.utf8.sql +++ b/src/test/regress/sql/collate.icu.utf8.sql @@ -722,6 +722,14 @@ INSERT INTO test33 VALUES (2, 'DEF'); SELECT (SELECT count(*) FROM test33_0) <> (SELECT count(*) FROM test33_1); +CREATE COLLATION digitslast (provider = icu, locale = '@colReorder=latn-digit'); +CREATE TABLE test34 (b CHAR(4) NOT NULL COLLATE digitslast); +INSERT INTO test34 VALUES ('0000'), ('0001'), ('ABCD'); +CREATE INDEX ON test34(b); +SET enable_seqscan TO off; +SELECT * FROM test34 WHERE b = '0000' ; +RESET enable_seqscan; + -- cleanup RESET search_path; SET client_min_messages TO warning; -- 2.20.1