On Tue, 2023-03-14 at 10:10 -0700, Jeff Davis wrote:
> One loose end is that we really should support language tags like
> "und"
> in those older versions (54 and earlier). Your commit d72900bded
> avoided the problem, but perhaps we should fix it by looking for
> "und"
> and replacing it with "root" while opening, or something.

Attached are a few patches to implement this idea.

-- 
Jeff Davis
PostgreSQL Contributor Team - AWS


From fa789a446b425cd491a0452acb2b2f135ce06f5a Mon Sep 17 00:00:00 2001
From: Jeff Davis <j...@j-davis.com>
Date: Tue, 14 Mar 2023 09:58:29 -0700
Subject: [PATCH v4 1/3] Support language tags in older ICU versions (53 and
 earlier).

By calling uloc_canonicalize() before parsing the attributes, the
existing locale attribute parsing logic works on language tags as
well.

Fix a small memory leak, too.
---
 src/backend/commands/collationcmds.c          |  8 +++---
 src/backend/utils/adt/pg_locale.c             | 26 ++++++++++++++++---
 .../regress/expected/collate.icu.utf8.out     |  8 ++++++
 src/test/regress/sql/collate.icu.utf8.sql     |  4 +++
 4 files changed, 38 insertions(+), 8 deletions(-)

diff --git a/src/backend/commands/collationcmds.c b/src/backend/commands/collationcmds.c
index 8949684afe..b8f2e7059f 100644
--- a/src/backend/commands/collationcmds.c
+++ b/src/backend/commands/collationcmds.c
@@ -950,7 +950,6 @@ pg_import_system_collations(PG_FUNCTION_ARGS)
 			const char *name;
 			char	   *langtag;
 			char	   *icucomment;
-			const char *iculocstr;
 			Oid			collid;
 
 			if (i == -1)
@@ -959,20 +958,19 @@ pg_import_system_collations(PG_FUNCTION_ARGS)
 				name = uloc_getAvailable(i);
 
 			langtag = get_icu_language_tag(name);
-			iculocstr = U_ICU_VERSION_MAJOR_NUM >= 54 ? langtag : name;
 
 			/*
 			 * Be paranoid about not allowing any non-ASCII strings into
 			 * pg_collation
 			 */
-			if (!pg_is_ascii(langtag) || !pg_is_ascii(iculocstr))
+			if (!pg_is_ascii(langtag) || !pg_is_ascii(langtag))
 				continue;
 
 			collid = CollationCreate(psprintf("%s-x-icu", langtag),
 									 nspid, GetUserId(),
 									 COLLPROVIDER_ICU, true, -1,
-									 NULL, NULL, iculocstr, NULL,
-									 get_collation_actual_version(COLLPROVIDER_ICU, iculocstr),
+									 NULL, NULL, langtag, NULL,
+									 get_collation_actual_version(COLLPROVIDER_ICU, langtag),
 									 true, true);
 			if (OidIsValid(collid))
 			{
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 1d3d4d86d3..b9c7fbd511 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -2643,9 +2643,28 @@ pg_attribute_unused()
 static void
 icu_set_collation_attributes(UCollator *collator, const char *loc)
 {
-	char	   *str = asc_tolower(loc, strlen(loc));
+	UErrorCode	status;
+	int32_t		len;
+	char	   *icu_locale_id;
+	char	   *lower_str;
+	char	   *str;
 
-	str = strchr(str, '@');
+	/* first, make sure the string is an ICU format locale ID */
+	status = U_ZERO_ERROR;
+	len = uloc_canonicalize(loc, NULL, 0, &status);
+	icu_locale_id = palloc(len + 1);
+	status = U_ZERO_ERROR;
+	len = uloc_canonicalize(loc, icu_locale_id, len + 1, &status);
+	if (U_FAILURE(status))
+		ereport(ERROR,
+				(errmsg("canonicalization failed for locale string \"%s\": %s",
+						loc, u_errorName(status))));
+
+	lower_str = asc_tolower(icu_locale_id, strlen(icu_locale_id));
+
+	pfree(icu_locale_id);
+
+	str = strchr(lower_str, '@');
 	if (!str)
 		return;
 	str++;
@@ -2660,7 +2679,6 @@ icu_set_collation_attributes(UCollator *collator, const char *loc)
 			char	   *value;
 			UColAttribute uattr;
 			UColAttributeValue uvalue;
-			UErrorCode	status;
 
 			status = U_ZERO_ERROR;
 
@@ -2727,6 +2745,8 @@ icu_set_collation_attributes(UCollator *collator, const char *loc)
 								loc, u_errorName(status))));
 		}
 	}
+
+	pfree(lower_str);
 }
 
 #endif							/* USE_ICU */
diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out
index 9a3e12e42d..6225b575ce 100644
--- a/src/test/regress/expected/collate.icu.utf8.out
+++ b/src/test/regress/expected/collate.icu.utf8.out
@@ -1304,6 +1304,14 @@ SELECT 'abc' <= 'ABC' COLLATE case_insensitive, 'abc' >= 'ABC' COLLATE case_inse
  t        | t
 (1 row)
 
+-- test language tags
+CREATE COLLATION lt_insensitive (provider = icu, locale = 'en-u-ks-level1', deterministic = false);
+SELECT 'aBcD' COLLATE lt_insensitive = 'AbCd' COLLATE lt_insensitive;
+ ?column? 
+----------
+ t
+(1 row)
+
 CREATE TABLE test1cs (x text COLLATE case_sensitive);
 CREATE TABLE test2cs (x text COLLATE case_sensitive);
 CREATE TABLE test3cs (x text COLLATE case_sensitive);
diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql
index 0790068f31..64cbfd0a5b 100644
--- a/src/test/regress/sql/collate.icu.utf8.sql
+++ b/src/test/regress/sql/collate.icu.utf8.sql
@@ -518,6 +518,10 @@ CREATE COLLATION case_insensitive (provider = icu, locale = '@colStrength=second
 SELECT 'abc' <= 'ABC' COLLATE case_sensitive, 'abc' >= 'ABC' COLLATE case_sensitive;
 SELECT 'abc' <= 'ABC' COLLATE case_insensitive, 'abc' >= 'ABC' COLLATE case_insensitive;
 
+-- test language tags
+CREATE COLLATION lt_insensitive (provider = icu, locale = 'en-u-ks-level1', deterministic = false);
+SELECT 'aBcD' COLLATE lt_insensitive = 'AbCd' COLLATE lt_insensitive;
+
 CREATE TABLE test1cs (x text COLLATE case_sensitive);
 CREATE TABLE test2cs (x text COLLATE case_sensitive);
 CREATE TABLE test3cs (x text COLLATE case_sensitive);
-- 
2.34.1

From c9f089eeedb6c145ad36b8618da7a71d0e69fd8c Mon Sep 17 00:00:00 2001
From: Jeff Davis <j...@j-davis.com>
Date: Tue, 14 Mar 2023 21:21:17 -0700
Subject: [PATCH v4 2/3] Wrap ICU ucol_open().

Hide details of supporting older ICU versions in a wrapper
function. The current code only needs to handle
icu_set_collation_attributes(), but a subsequent commit will add
additional version-specific code.
---
 src/backend/utils/adt/pg_locale.c | 52 +++++++++++++++----------------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index b9c7fbd511..04d1fa72a8 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -1420,24 +1420,35 @@ lc_ctype_is_c(Oid collation)
 
 struct pg_locale_struct default_locale;
 
-void
-make_icu_collator(const char *iculocstr,
-				  const char *icurules,
-				  struct pg_locale_struct *resultp)
+static UCollator *
+pg_ucol_open(const char *locale_str)
 {
-#ifdef USE_ICU
 	UCollator  *collator;
 	UErrorCode	status;
 
 	status = U_ZERO_ERROR;
-	collator = ucol_open(iculocstr, &status);
+	collator = ucol_open(locale_str, &status);
 	if (U_FAILURE(status))
 		ereport(ERROR,
 				(errmsg("could not open collator for locale \"%s\": %s",
-						iculocstr, u_errorName(status))));
+						locale_str, u_errorName(status))));
+
+#if U_ICU_VERSION_MAJOR_NUM < 54
+	icu_set_collation_attributes(collator, locale_str);
+#endif
+
+	return collator;
+}
+
+void
+make_icu_collator(const char *iculocstr,
+				  const char *icurules,
+				  struct pg_locale_struct *resultp)
+{
+#ifdef USE_ICU
+	UCollator  *collator;
 
-	if (U_ICU_VERSION_MAJOR_NUM < 54)
-		icu_set_collation_attributes(collator, iculocstr);
+	collator = pg_ucol_open(iculocstr);
 
 	/*
 	 * If rules are specified, we extract the rules of the standard collation,
@@ -1448,6 +1459,7 @@ make_icu_collator(const char *iculocstr,
 		const UChar *default_rules;
 		UChar	   *agg_rules;
 		UChar	   *my_rules;
+		UErrorCode	status;
 		int32_t		length;
 
 		default_rules = ucol_getRules(collator, &length);
@@ -1719,16 +1731,11 @@ get_collation_actual_version(char collprovider, const char *collcollate)
 	if (collprovider == COLLPROVIDER_ICU)
 	{
 		UCollator  *collator;
-		UErrorCode	status;
 		UVersionInfo versioninfo;
 		char		buf[U_MAX_VERSION_STRING_LENGTH];
 
-		status = U_ZERO_ERROR;
-		collator = ucol_open(collcollate, &status);
-		if (U_FAILURE(status))
-			ereport(ERROR,
-					(errmsg("could not open collator for locale \"%s\": %s",
-							collcollate, u_errorName(status))));
+		collator = pg_ucol_open(collcollate);
+
 		ucol_getVersion(collator, versioninfo);
 		ucol_close(collator);
 
@@ -2639,6 +2646,7 @@ icu_from_uchar(char **result, const UChar *buff_uchar, int32_t len_uchar)
  * ucol_open(), so this is only necessary for emulating this behavior on older
  * versions.
  */
+#if U_ICU_VERSION_MAJOR_NUM < 54
 pg_attribute_unused()
 static void
 icu_set_collation_attributes(UCollator *collator, const char *loc)
@@ -2748,6 +2756,7 @@ icu_set_collation_attributes(UCollator *collator, const char *loc)
 
 	pfree(lower_str);
 }
+#endif
 
 #endif							/* USE_ICU */
 
@@ -2759,17 +2768,8 @@ check_icu_locale(const char *icu_locale)
 {
 #ifdef USE_ICU
 	UCollator  *collator;
-	UErrorCode	status;
-
-	status = U_ZERO_ERROR;
-	collator = ucol_open(icu_locale, &status);
-	if (U_FAILURE(status))
-		ereport(ERROR,
-				(errmsg("could not open collator for locale \"%s\": %s",
-						icu_locale, u_errorName(status))));
 
-	if (U_ICU_VERSION_MAJOR_NUM < 54)
-		icu_set_collation_attributes(collator, icu_locale);
+	collator = pg_ucol_open(icu_locale);
 	ucol_close(collator);
 #else
 	ereport(ERROR,
-- 
2.34.1

From 0ae14118d593d73ea441648fc3c8df067253f627 Mon Sep 17 00:00:00 2001
From: Jeff Davis <j...@j-davis.com>
Date: Tue, 14 Mar 2023 22:28:21 -0700
Subject: [PATCH v4 3/3] Handle the "und" locale in ICU versions 54 and older.

The "und" locale is an alternative spelling of the root locale, but it
was not recognized until ICU 55. To maintain common behavior across
all supported ICU versions, check for "und" and replace with "root"
before opening.

Previously, the lack of support for "und" was dangerous, because
versions 54 and older fall back to the environment when a locale is
not found. If the user specified "und" for the language (which is
expected and documented), it could not only resolve to the wrong
collator, but it could unexpectedly change (which could lead to
corrupt indexes).

This effectively reverts commit d72900bded, which worked around the
problem for the built-in "unicode" collation, and is no longer
necessary.
---
 src/backend/utils/adt/pg_locale.c             | 30 +++++++++++++++++++
 src/bin/initdb/initdb.c                       |  2 +-
 src/include/catalog/catversion.h              |  2 +-
 .../regress/expected/collate.icu.utf8.out     |  7 +++++
 src/test/regress/sql/collate.icu.utf8.sql     |  2 ++
 5 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 04d1fa72a8..4dee0f40cb 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -1420,12 +1420,35 @@ lc_ctype_is_c(Oid collation)
 
 struct pg_locale_struct default_locale;
 
+#ifdef USE_ICU
+
 static UCollator *
 pg_ucol_open(const char *locale_str)
 {
 	UCollator  *collator;
 	UErrorCode	status;
 
+	/*
+	 * In ICU versions 55 and earlier, "und" is not a recognized spelling of
+	 * the root locale. If the first component of the locale is "und", replace
+	 * with "root" before opening.
+	 */
+#if U_ICU_VERSION_MAJOR_NUM < 55
+	char *fixed_str = NULL;
+
+	if (strncasecmp(locale_str, "und", strlen("und")) == 0 &&
+		!isalnum(locale_str[strlen("und")]))
+	{
+		const char *remainder = locale_str + strlen("und");
+
+		fixed_str = palloc(strlen("root") + strlen(remainder) + 1);
+		strcpy(fixed_str, "root");
+		strcat(fixed_str, remainder);
+
+		locale_str = fixed_str;
+	}
+#endif
+
 	status = U_ZERO_ERROR;
 	collator = ucol_open(locale_str, &status);
 	if (U_FAILURE(status))
@@ -1437,9 +1460,16 @@ pg_ucol_open(const char *locale_str)
 	icu_set_collation_attributes(collator, locale_str);
 #endif
 
+#if U_ICU_VERSION_MAJOR_NUM < 55
+	if (fixed_str != NULL)
+		pfree(fixed_str);
+#endif
+
 	return collator;
 }
 
+#endif
+
 void
 make_icu_collator(const char *iculocstr,
 				  const char *icurules,
diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index 68d430ed63..d48b7b6060 100644
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -1498,7 +1498,7 @@ setup_collation(FILE *cmdfd)
 	 * that they win if libc defines a locale with the same name.
 	 */
 	PG_CMD_PRINTF("INSERT INTO pg_collation (oid, collname, collnamespace, collowner, collprovider, collisdeterministic, collencoding, colliculocale)"
-				  "VALUES (pg_nextoid('pg_catalog.pg_collation', 'oid', 'pg_catalog.pg_collation_oid_index'), 'unicode', 'pg_catalog'::regnamespace, %u, '%c', true, -1, '');\n\n",
+				  "VALUES (pg_nextoid('pg_catalog.pg_collation', 'oid', 'pg_catalog.pg_collation_oid_index'), 'unicode', 'pg_catalog'::regnamespace, %u, '%c', true, -1, 'und');\n\n",
 				  BOOTSTRAP_SUPERUSERID, COLLPROVIDER_ICU);
 
 	PG_CMD_PRINTF("INSERT INTO pg_collation (oid, collname, collnamespace, collowner, collprovider, collisdeterministic, collencoding, collcollate, collctype)"
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h
index 309aed3703..b2eed22d46 100644
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -57,6 +57,6 @@
  */
 
 /*							yyyymmddN */
-#define CATALOG_VERSION_NO	202303141
+#define CATALOG_VERSION_NO	202303151
 
 #endif
diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out
index 6225b575ce..f135200c99 100644
--- a/src/test/regress/expected/collate.icu.utf8.out
+++ b/src/test/regress/expected/collate.icu.utf8.out
@@ -1312,6 +1312,13 @@ SELECT 'aBcD' COLLATE lt_insensitive = 'AbCd' COLLATE lt_insensitive;
  t
 (1 row)
 
+CREATE COLLATION lt_upperfirst (provider = icu, locale = 'und-u-kf-upper');
+SELECT 'Z' COLLATE lt_upperfirst < 'z' COLLATE lt_upperfirst;
+ ?column? 
+----------
+ t
+(1 row)
+
 CREATE TABLE test1cs (x text COLLATE case_sensitive);
 CREATE TABLE test2cs (x text COLLATE case_sensitive);
 CREATE TABLE test3cs (x text COLLATE case_sensitive);
diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql
index 64cbfd0a5b..8105ebc8ae 100644
--- a/src/test/regress/sql/collate.icu.utf8.sql
+++ b/src/test/regress/sql/collate.icu.utf8.sql
@@ -521,6 +521,8 @@ SELECT 'abc' <= 'ABC' COLLATE case_insensitive, 'abc' >= 'ABC' COLLATE case_inse
 -- test language tags
 CREATE COLLATION lt_insensitive (provider = icu, locale = 'en-u-ks-level1', deterministic = false);
 SELECT 'aBcD' COLLATE lt_insensitive = 'AbCd' COLLATE lt_insensitive;
+CREATE COLLATION lt_upperfirst (provider = icu, locale = 'und-u-kf-upper');
+SELECT 'Z' COLLATE lt_upperfirst < 'z' COLLATE lt_upperfirst;
 
 CREATE TABLE test1cs (x text COLLATE case_sensitive);
 CREATE TABLE test2cs (x text COLLATE case_sensitive);
-- 
2.34.1

Reply via email to