On Thu, 2023-02-09 at 14:09 -0800, Jeff Davis wrote:
> It feels like BCP 47 is the right catalog representation. We are
> already using it for the import of initial collations, and it's a
> standard, and there seems to be good support in ICU.

Patch attached.

We should have been canonicalizing all along -- either with
uloc_toLanguageTag(), as this patch does, or at least with
uloc_canonicalize() -- before passing to ucol_open().

ucol_open() is documented[1] to work on either language tags or ICU
format locale IDs. Anything else is invalid and ends up going through
some fallback logic, probably after being mis-parsed. For instance, in
ICU 72, "fr_CA.UTF-8" is not a valid ICU format locale ID or a valid
language tag, and is resolved by ucol_open() to the actual locale
"root"; but if you canonicalize it first (to the ICU format locale ID
"fr_CA" or the language tag "fr-CA"), it correctly resolves to the
actual locale "fr_CA".

The correct thing to do is canonicalize first and then pass to
ucol_open().

But because we didn't canonicalize in the past, there could be raw
locale strings stored in the catalog that resolve to the wrong actual
collator, and there could be indexes depending on the wrong collator,
so we have to be careful during pg_upgrade.

Say someone created two ICU collations, one with locale "en_US.UTF-8"
and one with locale "fr_CA.UTF-8" in PG15. When they upgrade to PG16,
this patch will check the language tag "en-US" and see that it resolves
to the same locale as "en_US.UTF-8", and change to the language tag
during upgrade (so "en-US" will be in the new catalog). But when it
checks the language tag "fr-CA", it will notice that it resolves to a
different locale than "fr_CA.UTF-8", and keep the latter string even
though it's wrong, because some indexes might be dependent on that
wrong collator.


[1]
https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/ucol_8h.html#a3b0bf34733dc208040e4157b0fe5fcd6

-- 
Jeff Davis
PostgreSQL Contributor Team - AWS


From bb873845bf1c9bc019606a691d66c2d148a745e2 Mon Sep 17 00:00:00 2001
From: Jeff Davis <j...@j-davis.com>
Date: Wed, 15 Feb 2023 23:05:08 -0800
Subject: [PATCH v1] For ICU collations, canonicalize locale names to language
 tags.

Before storing the locale name in the catalog, convert to a BCP47
language tag. The language tag should hold all of the necessary
information and also be an unambiguous representation of the locale.

During pg_upgrade, the previous locale string may need to be preserved
if the language tag resolves to a different actual locale.
---
 src/backend/commands/collationcmds.c |  39 ++++----
 src/backend/commands/dbcommands.c    |  58 +++++++++++
 src/backend/utils/adt/pg_locale.c    | 139 +++++++++++++++++++++++++--
 src/include/commands/dbcommands.h    |   1 +
 src/include/utils/pg_locale.h        |   3 +
 5 files changed, 211 insertions(+), 29 deletions(-)

diff --git a/src/backend/commands/collationcmds.c b/src/backend/commands/collationcmds.c
index eb62d285ea..3a98bbbd35 100644
--- a/src/backend/commands/collationcmds.c
+++ b/src/backend/commands/collationcmds.c
@@ -240,10 +240,23 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e
 		}
 		else if (collprovider == COLLPROVIDER_ICU)
 		{
+#ifdef USE_ICU
+			char *iculocale;
+
 			if (!colliculocale)
 				ereport(ERROR,
 						(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
 						 errmsg("parameter \"locale\" must be specified")));
+
+			check_icu_locale(colliculocale);
+			iculocale = get_icu_locale(colliculocale);
+			if (iculocale)
+				colliculocale = iculocale;
+#else
+			ereport(ERROR,
+					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+					 errmsg("ICU is not supported in this build")));
+#endif
 		}
 
 		/*
@@ -556,26 +569,6 @@ cmpaliases(const void *a, const void *b)
 
 
 #ifdef USE_ICU
-/*
- * Get the ICU language tag for a locale name.
- * The result is a palloc'd string.
- */
-static char *
-get_icu_language_tag(const char *localename)
-{
-	char		buf[ULOC_FULLNAME_CAPACITY];
-	UErrorCode	status;
-
-	status = U_ZERO_ERROR;
-	uloc_toLanguageTag(localename, buf, sizeof(buf), true, &status);
-	if (U_FAILURE(status))
-		ereport(ERROR,
-				(errmsg("could not convert locale name \"%s\" to language tag: %s",
-						localename, u_errorName(status))));
-
-	return pstrdup(buf);
-}
-
 /*
  * Get a comment (specifically, the display name) for an ICU locale.
  * The result is a palloc'd string, or NULL if we can't get a comment
@@ -938,7 +931,11 @@ pg_import_system_collations(PG_FUNCTION_ARGS)
 			else
 				name = uloc_getAvailable(i);
 
-			langtag = get_icu_language_tag(name);
+			langtag = icu_language_tag(name);
+			if (langtag == NULL)
+				ereport(ERROR,
+						(errmsg("could not convert locale name \"%s\" to language tag",
+								name)));
 			iculocstr = U_ICU_VERSION_MAJOR_NUM >= 54 ? langtag : name;
 
 			/*
diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index ef05633bb0..ccb5304250 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -1029,6 +1029,9 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt)
 
 	if (dblocprovider == COLLPROVIDER_ICU)
 	{
+#ifdef USE_ICU
+		char *iculocale;
+
 		if (!(is_encoding_supported_by_icu(encoding)))
 			ereport(ERROR,
 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
@@ -1045,6 +1048,14 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt)
 					 errmsg("ICU locale must be specified")));
 
 		check_icu_locale(dbiculocale);
+		iculocale = get_icu_locale(dbiculocale);
+		if (iculocale)
+			dbiculocale = iculocale;
+#else
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("ICU is not supported in this build")));
+#endif
 	}
 	else
 	{
@@ -1463,6 +1474,53 @@ check_encoding_locale_matches(int encoding, const char *collate, const char *cty
 						   pg_encoding_to_char(collate_encoding))));
 }
 
+/*
+ * Given the input ICU locale string, return a new string in a form suitable
+ * for storing in the catalog.
+ *
+ * Ordinarily this just converts to a language tag, but we need to make an
+ * allowance for invalid locale strings that come from earlier versions of
+ * Postgres while upgrading.
+ *
+ * Converting to a language tag performs "level 2 canonicalization". In
+ * addition to producing a consistent result format, level 2 canonicalization
+ * is able to more accurately interpret different input locale string formats,
+ * such as POSIX and .NET IDs. But prior to Postgres version 16, input locale
+ * strings were not canonicalized; the raw string provided by the user was
+ * stored in the catalog and passed directly to ucol_open().
+ *
+ * The raw string may resolve to the wrong actual collator when passed to
+ * directly ucol_open(), but indexes in older versions may depend on that
+ * actual collator. Therefore, during binary upgrade, we preserve the invalid
+ * raw string if it resolves to a different actual collator than the language
+ * tag. If it resolves to the same actual collator, then we proceed using the
+ * language tag.
+ */
+char *
+get_icu_locale(const char *requested_locale)
+{
+#ifdef USE_ICU
+	char *lang_tag = icu_language_tag(requested_locale);
+
+	if (lang_tag != NULL && IsBinaryUpgrade &&
+		!check_equivalent_icu_locales(requested_locale, lang_tag))
+	{
+		ereport(WARNING,
+				(errmsg("language tag \"%s\" resolves to different actual collator "
+						"than raw locale string \"%s\"",
+						lang_tag, requested_locale)));
+		pfree(lang_tag);
+		return pstrdup(requested_locale);
+	}
+
+	return lang_tag;
+#else
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("ICU is not supported in this build")));
+#endif
+}
+
 /* Error cleanup callback for createdb */
 static void
 createdb_failure_callback(int code, Datum arg)
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 059e4fd79f..52d9c8b5a6 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -1945,15 +1945,12 @@ icu_set_collation_attributes(UCollator *collator, const char *loc)
 	}
 }
 
-#endif							/* USE_ICU */
-
 /*
  * Check if the given locale ID is valid, and ereport(ERROR) if it isn't.
  */
 void
 check_icu_locale(const char *icu_locale)
 {
-#ifdef USE_ICU
 	UCollator  *collator;
 	UErrorCode	status;
 
@@ -1967,13 +1964,139 @@ check_icu_locale(const char *icu_locale)
 	if (U_ICU_VERSION_MAJOR_NUM < 54)
 		icu_set_collation_attributes(collator, icu_locale);
 	ucol_close(collator);
-#else
-	ereport(ERROR,
-			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-			 errmsg("ICU is not supported in this build")));
-#endif
 }
 
+/*
+ * Test if the given locales resolve to the same actual collator with the same
+ * attributes and version.
+ */
+bool
+check_equivalent_icu_locales(const char *locale1, const char *locale2)
+{
+	const UColAttribute collAtt[]	   = {
+		UCOL_FRENCH_COLLATION,
+		UCOL_ALTERNATE_HANDLING,
+		UCOL_CASE_FIRST,
+		UCOL_CASE_LEVEL,
+		UCOL_NORMALIZATION_MODE,
+		UCOL_DECOMPOSITION_MODE,
+		UCOL_STRENGTH,
+		UCOL_HIRAGANA_QUATERNARY_MODE,
+		UCOL_NUMERIC_COLLATION};
+	int              n_collAtt		   = sizeof(collAtt)/sizeof(*collAtt);
+	const char		*actual1, *actual2;
+	UVersionInfo     versionInfo1;
+	UVersionInfo	 versionInfo2;
+	char             version1[U_MAX_VERSION_STRING_LENGTH];
+	char             version2[U_MAX_VERSION_STRING_LENGTH];
+	UCollator		*collator1		   = NULL;
+	UCollator		*collator2		   = NULL;
+	UErrorCode		 status;
+	bool			 result			   = false;
+
+	/*
+	 * Be careful not to return without closing the collators.
+	 */
+
+	status = U_ZERO_ERROR;
+	collator1 = ucol_open(locale1, &status);
+	if (U_FAILURE(status))
+		goto cleanup;
+
+	status = U_ZERO_ERROR;
+	collator2 = ucol_open(locale2, &status);
+	if (U_FAILURE(status))
+		goto cleanup;
+
+	/* actual locale */
+	status = U_ZERO_ERROR;
+	actual1 = ucol_getLocaleByType(collator1, ULOC_ACTUAL_LOCALE, &status);
+	if (U_FAILURE(status))
+		goto cleanup;
+
+	status = U_ZERO_ERROR;
+	actual2 = ucol_getLocaleByType(collator2, ULOC_ACTUAL_LOCALE, &status);
+	if (U_FAILURE(status))
+		goto cleanup;
+
+	if (strcmp(actual1, actual2) != 0)
+		goto cleanup;
+
+	/* version */
+	ucol_getVersion(collator1, versionInfo1);
+	u_versionToString(versionInfo1, version1);
+	ucol_getVersion(collator2, versionInfo2);
+	u_versionToString(versionInfo2, version2);
+	if (strcmp(version1, version2) != 0)
+		goto cleanup;
+
+	/* attributes */
+	for (int i = 0; i < n_collAtt; i++)
+	{
+		UColAttributeValue val1, val2;
+
+		status = U_ZERO_ERROR;
+		val1 = ucol_getAttribute(collator1, collAtt[i], &status);
+		if (U_FAILURE(status))
+			goto cleanup;
+
+		status = U_ZERO_ERROR;
+		val2 = ucol_getAttribute(collator2, collAtt[i], &status);
+		if (U_FAILURE(status))
+			goto cleanup;
+
+		if (val1 != val2)
+			goto cleanup;
+	}
+
+	/* passed all the best-effort checks for equivalence */
+	result = true;
+
+cleanup:
+	if (collator2)
+		ucol_close(collator2);
+	if (collator1)
+		ucol_close(collator1);
+
+	return result;
+}
+
+/*
+ * Return the BCP47 language tag representation of the requested locale; or
+ * NULL if a problem is encountered.
+ *
+ * This function should be called before passing the string to ucol_open(),
+ * because conversion to a language tag also performs "level 2
+ * canonicalization". In addition to producing a consistent result format,
+ * level 2 canonicalization is able to more accurately interpret different
+ * input locale string formats, such as POSIX and .NET IDs.
+ */
+char *
+icu_language_tag(const char *requested_locale)
+{
+	UErrorCode	 status;
+	char		*result;
+	int32_t		 len;
+	const bool	 strict = true;
+
+	status = U_ZERO_ERROR;
+	len = uloc_toLanguageTag(requested_locale, NULL, 0, strict, &status);
+
+	result = palloc(len + 1);
+
+	status = U_ZERO_ERROR;
+	uloc_toLanguageTag(requested_locale, result, len + 1, strict, &status);
+	if (U_FAILURE(status))
+	{
+		pfree(result);
+		return NULL;
+	}
+
+	return result;
+}
+
+#endif							/* USE_ICU */
+
 /*
  * These functions convert from/to libc's wchar_t, *not* pg_wchar_t.
  * Therefore we keep them here rather than with the mbutils code.
diff --git a/src/include/commands/dbcommands.h b/src/include/commands/dbcommands.h
index 5fbc3ca752..0f0e827ff2 100644
--- a/src/include/commands/dbcommands.h
+++ b/src/include/commands/dbcommands.h
@@ -33,5 +33,6 @@ extern char *get_database_name(Oid dbid);
 extern bool have_createdb_privilege(void);
 
 extern void check_encoding_locale_matches(int encoding, const char *collate, const char *ctype);
+extern char *get_icu_locale(const char *requested_locale);
 
 #endif							/* DBCOMMANDS_H */
diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h
index cede43440b..5f9626985f 100644
--- a/src/include/utils/pg_locale.h
+++ b/src/include/utils/pg_locale.h
@@ -104,8 +104,11 @@ extern char *get_collation_actual_version(char collprovider, const char *collcol
 #ifdef USE_ICU
 extern int32_t icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes);
 extern int32_t icu_from_uchar(char **result, const UChar *buff_uchar, int32_t len_uchar);
+extern bool check_equivalent_icu_locales(const char *locale1,
+										 const char *locale2);
 #endif
 extern void check_icu_locale(const char *icu_locale);
+extern char *icu_language_tag(const char *requested_locale);
 
 /* These functions convert from/to libc's wchar_t, *not* pg_wchar_t */
 extern size_t wchar2char(char *to, const wchar_t *from, size_t tolen,
-- 
2.34.1

Reply via email to