Re: ICU for global collation
On Fri, Oct 21, 2022 at 05:32:38PM +0300, Marina Polyakova wrote: > Hello! > > I discovered an interesting behaviour during installcheck runs when the > cluster was initialized with ICU locale provider: > > $ initdb --locale-provider icu --icu-locale en-US -D data && > 2) The option --no-locale in pg_regress is described as "use C locale" [2]. > But in this case the created databases actually use the ICU locale provider > with the ICU cluster locale from template0 (see > diff_check_backend_used_provider.patch): > > $ make NO_LOCALE=1 installcheck Yes, this looks wrong on the ground on what -no-locale is expected to do, aka use a C locale. Peter? -- Michael signature.asc Description: PGP signature
Re: ICU for global collation
Hello! I discovered an interesting behaviour during installcheck runs when the cluster was initialized with ICU locale provider: $ initdb --locale-provider icu --icu-locale en-US -D data && pg_ctl -D data -l logfile start 1) The ECPG tests fail because they use the SQL_ASCII encoding [1], the database template0 uses the ICU locale provider and SQL_ASCII is not supported by ICU: $ make -C src/interfaces/ecpg/ installcheck ... == creating database "ecpg1_regression" == ERROR: encoding "SQL_ASCII" is not supported with ICU provider ERROR: database "ecpg1_regression" does not exist command failed: "/home/marina/postgresql/master/my/inst/bin/psql" -X -c "CREATE DATABASE \"ecpg1_regression\" TEMPLATE=template0 ENCODING='SQL_ASCII'" -c "ALTER DATABASE \"ecpg1_regression\" SET lc_messages TO 'C';ALTER DATABASE \"ecpg1_regression\" SET lc_monetary TO 'C';ALTER DATABASE \"ecpg1_regression\" SET lc_numeric TO 'C';ALTER DATABASE \"ecpg1_regression\" SET lc_time TO 'C';ALTER DATABASE \"ecpg1_regression\" SET bytea_output TO 'hex';ALTER DATABASE \"ecpg1_regression\" SET timezone_abbreviations TO 'Default';" "postgres" 2) The option --no-locale in pg_regress is described as "use C locale" [2]. But in this case the created databases actually use the ICU locale provider with the ICU cluster locale from template0 (see diff_check_backend_used_provider.patch): $ make NO_LOCALE=1 installcheck In regression.diffs: diff -U3 /home/marina/postgresql/master/src/test/regress/expected/test_setup.out /home/marina/postgresql/master/src/test/regress/results/test_setup.out --- /home/marina/postgresql/master/src/test/regress/expected/test_setup.out 2022-09-27 05:31:27.674628815 +0300 +++ /home/marina/postgresql/master/src/test/regress/results/test_setup.out 2022-10-21 15:09:31.232992885 +0300 @@ -143,6 +143,798 @@ \set filename :abs_srcdir '/data/person.data' COPY person FROM :'filename'; VACUUM ANALYZE person; +NOTICE: varstrfastcmp_locale sss->collate_c 0 sss->locale 0xefacd0 +NOTICE: varstrfastcmp_locale sss->locale->provider i +NOTICE: varstrfastcmp_locale sss->locale->info.icu.locale en-US ... The patch diff_fix_pg_regress_create_database.patch fixes both issues for me. [1] https://github.com/postgres/postgres/blob/ce20f8b9f4354b46b40fd6ebf7ce5c37d08747e0/src/interfaces/ecpg/test/Makefile#L18 [2] https://github.com/postgres/postgres/blob/ce20f8b9f4354b46b40fd6ebf7ce5c37d08747e0/src/test/regress/pg_regress.c#L1992 -- Marina Polyakova Postgres Professional: http://www.postgrespro.com The Russian Postgres Companydiff --git a/src/backend/access/hash/hashfunc.c b/src/backend/access/hash/hashfunc.c index b57ed946c42bb54ede800e95045aa937a8dbad85..b3c0f6f753f8428274389844ccf9778a7ed47ea4 100644 --- a/src/backend/access/hash/hashfunc.c +++ b/src/backend/access/hash/hashfunc.c @@ -281,6 +281,14 @@ hashtext(PG_FUNCTION_ARGS) if (!lc_collate_is_c(collid)) mylocale = pg_newlocale_from_collation(collid); + elog(NOTICE, "hashtext lc_collate_is_c(collid) %d mylocale %p", lc_collate_is_c(collid), mylocale); + if (mylocale) + { + elog(NOTICE, "hashtext mylocale->provider %c", mylocale->provider); + if (mylocale->provider == COLLPROVIDER_ICU) + elog(NOTICE, "hashtext mylocale->info.icu.locale %s", mylocale->info.icu.locale ? mylocale->info.icu.locale : "(null)"); + } + if (!mylocale || mylocale->deterministic) { result = hash_any((unsigned char *) VARDATA_ANY(key), @@ -337,6 +345,14 @@ hashtextextended(PG_FUNCTION_ARGS) if (!lc_collate_is_c(collid)) mylocale = pg_newlocale_from_collation(collid); + elog(NOTICE, "hashtextextended lc_collate_is_c(collid) %d mylocale %p", lc_collate_is_c(collid), mylocale); + if (mylocale) + { + elog(NOTICE, "hashtextextended mylocale->provider %c", mylocale->provider); + if (mylocale->provider == COLLPROVIDER_ICU) + elog(NOTICE, "hashtextextended mylocale->info.icu.locale %s", mylocale->info.icu.locale ? mylocale->info.icu.locale : "(null)"); + } + if (!mylocale || mylocale->deterministic) { result = hash_any_extended((unsigned char *) VARDATA_ANY(key), diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c index 02d462a659778016f3c4479d425ba0a84feb6e26..9627c84a7ccfb4c4013556a51c989e9e6d611634 100644 --- a/src/backend/regex/regc_pg_locale.c +++ b/src/backend/regex/regc_pg_locale.c @@ -243,6 +243,8 @@ pg_set_regex_collation(Oid collation) errhint("Use the COLLATE clause to set the collation explicitly."))); } + elog(NOTICE, "pg_set_regex_collation lc_ctype_is_c(collid) %d", lc_ctype_is_c(collation)); + if (lc_ctype_is_c(collation)) { /* C/POSIX collations use this path regardless of database encoding */ @@ -259,6 +261,14 @@ pg_set_regex_collation(Oid collation) */ pg_regex_locale = pg_newlocale_from_collation(collation); + elog(NOTICE, "pg_set_regex_collation pg_regex_locale %p", pg_regex_locale); + if (pg_regex_locale) + { + elog(NOTICE,
Re: ICU for global collation
On 2022-10-01 15:07, Peter Eisentraut wrote: On 22.09.22 20:06, Marina Polyakova wrote: On 2022-09-21 17:53, Peter Eisentraut wrote: Committed with that test, thanks. I think that covers all the ICU issues you reported for PG15 for now? I thought about the order of the ICU checks - if it is ok to check that the selected encoding is supported by ICU after printing all the locale & encoding information, why not to move almost all the ICU checks here?.. It's possible that we can do better, but I'm not going to add things like that to PG 15 at this point unless it fixes a faulty behavior. Will PG 15 always have this order of ICU checks, is the current behaviour correct enough? On the other hand, there may be a better fix for PG 16+ and not all changes can be backported... On 2022-09-16 10:56, Peter Eisentraut wrote: On 15.09.22 17:41, Marina Polyakova wrote: I agree with you. Here's another version of the patch. The locale/encoding checks and reports in initdb have been reordered, because now the encoding is set first and only then the ICU locale is checked. I committed something based on the first version of your patch. This reordering of the messages here was a little too much surgery for me at this point. For instance, there are also messages in #ifdef WIN32 code that would need to be reordered as well. I kept the overall structure of the code the same and just inserted the additional proposed checks. If you want to pursue the reordering of the checks and messages overall, a patch for the master branch could be considered. I've worked on this again (see attached patch) but I'm not sure if the messages of encoding mismatches are clear enough without the full locale information. For $ initdb -D data --icu-locale en --locale-provider icu compare the outputs: The database cluster will be initialized with this locale configuration: provider:icu ICU locale: en LC_COLLATE: de_DE.iso885915@euro LC_CTYPE:de_DE.iso885915@euro LC_MESSAGES: en_US.utf8 LC_MONETARY: de_DE.iso885915@euro LC_NUMERIC: de_DE.iso885915@euro LC_TIME: de_DE.iso885915@euro The default database encoding has been set to "UTF8". initdb: error: encoding mismatch initdb: detail: The encoding you selected (UTF8) and the encoding that the selected locale uses (LATIN9) do not match. This would lead to misbehavior in various character string processing functions. initdb: hint: Rerun initdb and either do not specify an encoding explicitly, or choose a matching combination. and Encoding "UTF8" implied by locale will be set as the default database encoding. initdb: error: encoding mismatch initdb: detail: The encoding you selected (UTF8) and the encoding that the selected locale uses (LATIN9) do not match. This would lead to misbehavior in various character string processing functions. initdb: hint: Rerun initdb and either do not specify an encoding explicitly, or choose a matching combination. The same without ICU, e.g. for $ initdb -D data the output with locale information: The database cluster will be initialized with this locale configuration: provider:libc LC_COLLATE: en_US.utf8 LC_CTYPE:de_DE.iso885915@euro LC_MESSAGES: en_US.utf8 LC_MONETARY: de_DE.iso885915@euro LC_NUMERIC: de_DE.iso885915@euro LC_TIME: de_DE.iso885915@euro The default database encoding has accordingly been set to "LATIN9". initdb: error: encoding mismatch initdb: detail: The encoding you selected (LATIN9) and the encoding that the selected locale uses (UTF8) do not match. This would lead to misbehavior in various character string processing functions. initdb: hint: Rerun initdb and either do not specify an encoding explicitly, or choose a matching combination. and the "shorter" version: Encoding "LATIN9" implied by locale will be set as the default database encoding. initdb: error: encoding mismatch initdb: detail: The encoding you selected (LATIN9) and the encoding that the selected locale uses (UTF8) do not match. This would lead to misbehavior in various character string processing functions. initdb: hint: Rerun initdb and either do not specify an encoding explicitly, or choose a matching combination. BTW, what did you mean that "there are also messages in #ifdef WIN32 code that would need to be reordered as well"?.. -- Marina Polyakova Postgres Professional: http://www.postgrespro.com The Russian Postgres Companydiff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 96b46cbc020ef8b85b6d54d3d4ca8ad116277832..242d68f58287aeb6f95619c2ce8b78e38433cf18 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -1030,6 +1030,11 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) if (dblocprovider == COLLPROVIDER_ICU) { +#ifndef USE_ICU + ereport(ERROR, +(errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("ICU is not supported in this build"))); +#else if
Re: ICU for global collation
On 22.09.22 20:06, Marina Polyakova wrote: On 2022-09-21 17:53, Peter Eisentraut wrote: Committed with that test, thanks. I think that covers all the ICU issues you reported for PG15 for now? I thought about the order of the ICU checks - if it is ok to check that the selected encoding is supported by ICU after printing all the locale & encoding information, why not to move almost all the ICU checks here?.. It's possible that we can do better, but I'm not going to add things like that to PG 15 at this point unless it fixes a faulty behavior.
Re: ICU for global collation
On 2022-09-21 17:53, Peter Eisentraut wrote: Committed with that test, thanks. I think that covers all the ICU issues you reported for PG15 for now? I thought about the order of the ICU checks - if it is ok to check that the selected encoding is supported by ICU after printing all the locale & encoding information, why not to move almost all the ICU checks here?.. Examples of the work of the attached patch: 1. ICU locale vs supported encoding: 1.1. $ initdb --encoding sql-ascii --locale-provider icu hoge ... initdb: error: encoding mismatch initdb: detail: The encoding you selected (SQL_ASCII) is not supported with the ICU provider. initdb: hint: Rerun initdb and either do not specify an encoding explicitly, or choose a matching combination. 1.2. (like before) $ initdb --encoding sql-ascii --icu-locale en-US hoge initdb: error: --icu-locale cannot be specified unless locale provider "icu" is chosen $ createdb --encoding sql-ascii --icu-locale en-US hoge createdb: error: database creation failed: ERROR: ICU locale cannot be specified unless locale provider is ICU 2. For builds without ICU: 2.1. $ initdb --locale-provider icu hoge ... initdb: error: ICU is not supported in this build $ createdb --locale-provider icu hoge createdb: error: database creation failed: ERROR: ICU is not supported in this build 2.2. (like before) $ initdb --icu-locale en-US hoge initdb: error: --icu-locale cannot be specified unless locale provider "icu" is chosen $ createdb --icu-locale en-US hoge createdb: error: database creation failed: ERROR: ICU locale cannot be specified unless locale provider is ICU 2.3. $ createdb --locale-provider icu --icu-locale en-US --encoding sql-ascii hoge createdb: error: database creation failed: ERROR: ICU is not supported in this build 4. About errors in initdb: 4.1. If icu_locale is not specified, but it is required, then we get this: $ initdb --locale-provider icu hoge The files belonging to this database system will be owned by user "marina". This user must also own the server process. The database cluster will be initialized with this locale configuration: provider:icu LC_COLLATE: en_US.UTF-8 LC_CTYPE:en_US.UTF-8 LC_MESSAGES: en_US.UTF-8 LC_MONETARY: ru_RU.UTF-8 LC_NUMERIC: ru_RU.UTF-8 LC_TIME: ru_RU.UTF-8 The default database encoding has been set to "UTF8". initdb: error: ICU locale must be specified Almost the same if ICU is not supported in this build: $ initdb --locale-provider icu hoge The files belonging to this database system will be owned by user "marina". This user must also own the server process. The database cluster will be initialized with this locale configuration: provider:icu LC_COLLATE: en_US.UTF-8 LC_CTYPE:en_US.UTF-8 LC_MESSAGES: en_US.UTF-8 LC_MONETARY: ru_RU.UTF-8 LC_NUMERIC: ru_RU.UTF-8 LC_TIME: ru_RU.UTF-8 The default database encoding has been set to "UTF8". initdb: error: ICU is not supported in this build 4.2. If icu_locale is specified for the wrong provider, the error will be at the beginning of the program start as before: $ initdb --icu-locale en-US hoge initdb: error: --icu-locale cannot be specified unless locale provider "icu" is chosen -- Marina Polyakova Postgres Professional: http://www.postgrespro.com The Russian Postgres Companydiff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index e0753c1badcc299e2bac45f3bdd2f23f59d70cbc..2589a2523e07c9543c99c7d7b446438d62382b89 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -1030,6 +1030,11 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) if (dblocprovider == COLLPROVIDER_ICU) { +#ifndef USE_ICU + ereport(ERROR, +(errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("ICU is not supported in this build"))); +#else if (!(is_encoding_supported_by_icu(encoding))) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), @@ -1046,6 +1051,7 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) errmsg("ICU locale must be specified"))); check_icu_locale(dbiculocale); +#endif } else { diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index 2b42d9ccd8b751bcb5cda3a1f4c7803a68bc0a4a..743f11e1d1fd62d500fc2846976472d13e08046b 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -1945,15 +1945,12 @@ icu_set_collation_attributes(UCollator *collator, const char *loc) } } -#endif /* USE_ICU */ - /* * Check if the given locale ID is valid, and ereport(ERROR) if it isn't. */ void check_icu_locale(const char *icu_locale) { -#ifdef USE_ICU UCollator *collator; UErrorCode status; @@ -1967,13 +1964,10 @@ check_icu_locale(const char *icu_locale) if (U_ICU_VERSION_MAJOR_NUM < 54) icu_set_collation_attributes(collator, icu_locale); ucol_close(collator); -#else - ereport(ERROR, -
Re: ICU for global collation
On 21.09.22 08:50, Marina Polyakova wrote: On 2022-09-20 12:59, Peter Eisentraut wrote: On 17.09.22 10:33, Marina Polyakova wrote: 3. The locale provider is ICU, but it has not yet been set from the template database: $ initdb --locale-provider icu --icu-locale en-US -D data && pg_ctl -D data -l logfile start && createdb --icu-locale ru-RU --template template0 mydb ... createdb: error: database creation failed: ERROR: ICU locale cannot be specified unless locale provider is ICU Please see attached patch for a fix. Does that work for you? Yes, it works. The following test checks this fix: Committed with that test, thanks. I think that covers all the ICU issues you reported for PG15 for now?
Re: ICU for global collation
On 2022-09-20 12:59, Peter Eisentraut wrote: On 17.09.22 10:33, Marina Polyakova wrote: 3. The locale provider is ICU, but it has not yet been set from the template database: $ initdb --locale-provider icu --icu-locale en-US -D data && pg_ctl -D data -l logfile start && createdb --icu-locale ru-RU --template template0 mydb ... createdb: error: database creation failed: ERROR: ICU locale cannot be specified unless locale provider is ICU Please see attached patch for a fix. Does that work for you? Yes, it works. The following test checks this fix: diff --git a/src/bin/scripts/t/020_createdb.pl b/src/bin/scripts/t/020_createdb.pl index b87d8fc63b5246b02bcd4499aae815269b60df7c..c2464a99618cd7ca5616cc21121e1e4379b52baf 100644 --- a/src/bin/scripts/t/020_createdb.pl +++ b/src/bin/scripts/t/020_createdb.pl @@ -71,6 +71,14 @@ if ($ENV{with_icu} eq 'yes') $node2->command_ok( [ 'createdb', '-T', 'template0', '--locale-provider=libc', 'foobar55' ], 'create database with libc provider from template database with icu provider'); + + $node2->command_ok( + [ + 'createdb', '-T', 'template0', '--icu-locale', + 'en-US', 'foobar56' + ], + 'create database with icu locale from template database with icu provider' + ); } else { -- Marina Polyakova Postgres Professional: http://www.postgrespro.com The Russian Postgres Company
Re: ICU for global collation
On 17.09.22 10:33, Marina Polyakova wrote: Thanks to Kyotaro Horiguchi review we found out that there're interesting cases due to the order of some ICU checks: 1. ICU locale vs supported encoding: 1.1. On 2022-09-15 09:52, Kyotaro Horiguchi wrote: If I executed initdb as follows, I would be told to specify --icu-locale option. $ initdb --encoding sql-ascii --locale-provider icu hoge ... initdb: error: ICU locale must be specified However, when I reran the command, it complains about incompatible encoding this time. I think it's more user-friendly to check for the encoding compatibility before the check for missing --icu-locale option. This a valid point, but it would require quite a bit of work to move all those checks around and re-verify the result, so I don't want to do it in PG15. 1.2. (ok?) $ initdb --encoding sql-ascii --icu-locale en-US hoge initdb: error: --icu-locale cannot be specified unless locale provider "icu" is chosen $ initdb --encoding sql-ascii --icu-locale en-US --locale-provider icu hoge ... initdb: error: encoding mismatch initdb: detail: The encoding you selected (SQL_ASCII) is not supported with the ICU provider. initdb: hint: Rerun initdb and either do not specify an encoding explicitly, or choose a matching combination. $ createdb --encoding sql-ascii --icu-locale en-US hoge createdb: error: database creation failed: ERROR: ICU locale cannot be specified unless locale provider is ICU $ createdb --encoding sql-ascii --icu-locale en-US --locale-provider icu hoge createdb: error: database creation failed: ERROR: encoding "SQL_ASCII" is not supported with ICU provider I don't see a problem here. 2. For builds without ICU: 2.1. $ initdb --locale-provider icu hoge ... initdb: error: ICU locale must be specified $ initdb --locale-provider icu --icu-locale en-US hoge ... initdb: error: ICU is not supported in this build $ createdb --locale-provider icu hoge createdb: error: database creation failed: ERROR: ICU locale must be specified $ createdb --locale-provider icu --icu-locale en-US hoge createdb: error: database creation failed: ERROR: ICU is not supported in this build IMO, it would be more user-friendly to inform an unsupported build in the first runs too.. Again, this would require reorganizing a bunch of code to get some cosmetic benefit, which isn't a good idea now for PG15. 2.2. (ok?) 2.3. same here 3. The locale provider is ICU, but it has not yet been set from the template database: $ initdb --locale-provider icu --icu-locale en-US -D data && pg_ctl -D data -l logfile start && createdb --icu-locale ru-RU --template template0 mydb ... createdb: error: database creation failed: ERROR: ICU locale cannot be specified unless locale provider is ICU Please see attached patch for a fix. Does that work for you?From 5bb07e390baf8a471f7d30582469268c8a6d71ca Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Tue, 20 Sep 2022 05:50:41 -0400 Subject: [PATCH] Improve ICU option handling in CREATE DATABASE We check that the ICU locale is only specified if the ICU locale provider is selected. But we did that too early. We need to wait until we load the settings of the template database, since that could also set what the locale provider is. Reported-by: Marina Polyakova Discussion: https://www.postgresql.org/message-id/9ba4cd1ea6ed6b7b15c0ff15e6f54...@postgrespro.ru --- src/backend/commands/dbcommands.c | 11 +++ 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index cba929b7f998..5dfec5c6b056 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -907,10 +907,6 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) errmsg("unrecognized locale provider: %s", locproviderstr))); } - if (diculocale && dblocprovider != COLLPROVIDER_ICU) - ereport(ERROR, - (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), -errmsg("ICU locale cannot be specified unless locale provider is ICU"))); if (distemplate && distemplate->arg) dbistemplate = defGetBoolean(distemplate); if (dallowconnections && dallowconnections->arg) @@ -1050,6 +1046,13 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) check_icu_locale(dbiculocale); } + else + { + if (dbiculocale) + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), +errmsg("ICU locale cannot be specified unless locale provider is ICU"))); + } /* * Check that the new encoding and locale settings match the source -- 2.37.3
Re: ICU for global collation
On 2022-09-16 10:56, Peter Eisentraut wrote: On 15.09.22 17:41, Marina Polyakova wrote: I agree with you. Here's another version of the patch. The locale/encoding checks and reports in initdb have been reordered, because now the encoding is set first and only then the ICU locale is checked. I committed something based on the first version of your patch. This reordering of the messages here was a little too much surgery for me at this point. For instance, there are also messages in #ifdef WIN32 code that would need to be reordered as well. I kept the overall structure of the code the same and just inserted the additional proposed checks. If you want to pursue the reordering of the checks and messages overall, a patch for the master branch could be considered. Thank you! I already wrote about the order of the ICU checks in initdb/create database, they were the only reason to propose such changes... -- Marina Polyakova Postgres Professional: http://www.postgrespro.com The Russian Postgres Company
Re: ICU for global collation
Thanks to Kyotaro Horiguchi review we found out that there're interesting cases due to the order of some ICU checks: 1. ICU locale vs supported encoding: 1.1. On 2022-09-15 09:52, Kyotaro Horiguchi wrote: If I executed initdb as follows, I would be told to specify --icu-locale option. $ initdb --encoding sql-ascii --locale-provider icu hoge ... initdb: error: ICU locale must be specified However, when I reran the command, it complains about incompatible encoding this time. I think it's more user-friendly to check for the encoding compatibility before the check for missing --icu-locale option. 1.2. (ok?) $ initdb --encoding sql-ascii --icu-locale en-US hoge initdb: error: --icu-locale cannot be specified unless locale provider "icu" is chosen $ initdb --encoding sql-ascii --icu-locale en-US --locale-provider icu hoge ... initdb: error: encoding mismatch initdb: detail: The encoding you selected (SQL_ASCII) is not supported with the ICU provider. initdb: hint: Rerun initdb and either do not specify an encoding explicitly, or choose a matching combination. $ createdb --encoding sql-ascii --icu-locale en-US hoge createdb: error: database creation failed: ERROR: ICU locale cannot be specified unless locale provider is ICU $ createdb --encoding sql-ascii --icu-locale en-US --locale-provider icu hoge createdb: error: database creation failed: ERROR: encoding "SQL_ASCII" is not supported with ICU provider 2. For builds without ICU: 2.1. $ initdb --locale-provider icu hoge ... initdb: error: ICU locale must be specified $ initdb --locale-provider icu --icu-locale en-US hoge ... initdb: error: ICU is not supported in this build $ createdb --locale-provider icu hoge createdb: error: database creation failed: ERROR: ICU locale must be specified $ createdb --locale-provider icu --icu-locale en-US hoge createdb: error: database creation failed: ERROR: ICU is not supported in this build IMO, it would be more user-friendly to inform an unsupported build in the first runs too.. 2.2. (ok?) $ initdb --icu-locale en-US hoge initdb: error: --icu-locale cannot be specified unless locale provider "icu" is chosen $ initdb --icu-locale en-US --locale-provider icu hoge ... initdb: error: ICU is not supported in this build $ createdb --icu-locale en-US hoge createdb: error: database creation failed: ERROR: ICU locale cannot be specified unless locale provider is ICU $ createdb --icu-locale en-US --locale-provider icu hoge createdb: error: database creation failed: ERROR: ICU is not supported in this build 2.3. $ createdb --locale-provider icu --icu-locale en-US --encoding sql-ascii hoge createdb: error: database creation failed: ERROR: encoding "SQL_ASCII" is not supported with ICU provider $ createdb --locale-provider icu --icu-locale en-US --encoding utf8 hoge createdb: error: database creation failed: ERROR: ICU is not supported in this build IMO, it would be more user-friendly to inform an unsupported build in the first run too.. 3. The locale provider is ICU, but it has not yet been set from the template database: $ initdb --locale-provider icu --icu-locale en-US -D data && pg_ctl -D data -l logfile start && createdb --icu-locale ru-RU --template template0 mydb ... createdb: error: database creation failed: ERROR: ICU locale cannot be specified unless locale provider is ICU -- Marina Polyakova Postgres Professional: http://www.postgrespro.com The Russian Postgres Company
Re: ICU for global collation
On 2022-09-16 11:11, Kyotaro Horiguchi wrote: At Fri, 16 Sep 2022 09:49:28 +0300, Marina Polyakova wrote in In continuation of options check: AFAICS the following checks in initdb if (locale_provider == COLLPROVIDER_ICU) { if (!icu_locale) pg_fatal("ICU locale must be specified"); /* * In supported builds, the ICU locale ID will be checked by the * backend during post-bootstrap initialization. */ #ifndef USE_ICU pg_fatal("ICU is not supported in this build"); #endif } are executed approximately when they are executed in create database after getting all the necessary data from the template database: initdb doesn't work that way, but anyway, I realized that I am proposing to move that code in setlocales() to the caller function as the result. I don't think setlocales() is the place for the code because icu locale has no business with what the function does. That being said there's no obvious reason we *need* to move the code out to its caller. Excuse me, but could you explain your last sentence in more detail? I read that this code is not for setlocales and then - that it should not moved from here, so I'm confused... +errmsg("encoding \"%s\" is not supported with ICU provider", + pg_log_error("encoding \"%s\" is not supported with ICU provider", +pg_encoding_to_char(encodingid)); I might be wrong, but the messages look wrong to me. The alternatives below might work. "encoding \"%s\" is not supported by ICU" "encoding \"%s\" cannot be used for/with ICU locales" The message indicates that the selected encoding cannot be used with the ICU provider because it does not support it. But if the text of the message becomes better and clearer, I will only be glad. + pg_log_error_hint("Rerun %s and choose a matching combination.", + progname); This doesn't seem to provide users with useful information. It was commited in more verbose form: pg_log_error_hint("Rerun %s and either do not specify an encoding explicitly, " "or choose a matching combination.", -- Marina Polyakova Postgres Professional: http://www.postgrespro.com The Russian Postgres Company
Re: ICU for global collation
On 2022-09-16 10:57, Peter Eisentraut wrote: On 16.09.22 09:31, Marina Polyakova wrote: IMO it is hardly understantable from the program output either - it looks like I manually chose the encoding UTF8. Maybe first inform about selected encoding?.. Yes, I included something like that in the patch just committed. diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index 6aeec8d426c52414b827686781c245291f27ed1f..348bbbeba0f5bc7ff601912bf883510d580b814c 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -2310,7 +2310,11 @@ setup_locale_encoding(void) } if (!encoding && locale_provider == COLLPROVIDER_ICU) + { encodingid = PG_UTF8; + printf(_("The default database encoding has been set to \"%s\" for a better experience with the ICU provider.\n"), + pg_encoding_to_char(encodingid)); + } else if (!encoding) { int ctype_enc; Thank you! -- Marina Polyakova Postgres Professional: http://www.postgrespro.com The Russian Postgres Company
Re: ICU for global collation
On 16.09.22 08:49, Marina Polyakova wrote: But perhaps the check that --icu-locale cannot be specified unless locale provider icu is chosen should also be moved here? So all these checks will be in one place and it will use the provider from the template database (which could be icu): $ initdb --locale-provider icu --icu-locale en-US -D data && pg_ctl -D data -l logfile start && createdb --icu-locale ru-RU --template template0 mydb ... createdb: error: database creation failed: ERROR: ICU locale cannot be specified unless locale provider is ICU Can you be more specific about what you are proposing here? I'm not following.
Re: ICU for global collation
At Fri, 16 Sep 2022 09:56:31 +0200, Peter Eisentraut wrote in > On 15.09.22 17:41, Marina Polyakova wrote: > > I agree with you. Here's another version of the patch. The > > locale/encoding checks and reports in initdb have been reordered, > > because now the encoding is set first and only then the ICU locale is > > checked. > > I committed something based on the first version of your patch. This > reordering of the messages here was a little too much surgery for me > at this point. For instance, there are also messages in #ifdef WIN32 > code that would need to be reordered as well. I kept the overall > structure of the code the same and just inserted the additional > proposed checks. Yeah, as I sent just before, I reached the same conclusion. > If you want to pursue the reordering of the checks and messages > overall, a patch for the master branch could be considered. regards. -- Kyotaro Horiguchi NTT Open Source Software Center
Re: ICU for global collation
At Fri, 16 Sep 2022 09:49:28 +0300, Marina Polyakova wrote in > On 2022-09-15 09:52, Kyotaro Horiguchi wrote: > > However, when I reran the command, it complains about incompatible > > encoding this time. I think it's more user-friendly to check for the > > encoding compatibility before the check for missing --icu-locale > > option. > > regards. > > In continuation of options check: AFAICS the following checks in > initdb > > if (locale_provider == COLLPROVIDER_ICU) > { > if (!icu_locale) > pg_fatal("ICU locale must be specified"); > > /* >* In supported builds, the ICU locale ID will be checked by the >* backend during post-bootstrap initialization. >*/ > #ifndef USE_ICU > pg_fatal("ICU is not supported in this build"); > #endif > } > > are executed approximately when they are executed in create database > after getting all the necessary data from the template database: initdb doesn't work that way, but anyway, I realized that I am proposing to move that code in setlocales() to the caller function as the result. I don't think setlocales() is the place for the code because icu locale has no business with what the function does. That being said there's no obvious reason we *need* to move the code out to its caller. > if (dblocprovider == COLLPROVIDER_ICU) > { > /* >* This would happen if template0 uses the libc provider but the new >* database uses icu. >*/ > if (!dbiculocale) > ereport(ERROR, > (errcode(ERRCODE_INVALID_PARAMETER_VALUE), >errmsg("ICU locale must be specified"))); > } > > if (dblocprovider == COLLPROVIDER_ICU) > check_icu_locale(dbiculocale); > > But perhaps the check that --icu-locale cannot be specified unless > locale provider icu is chosen should also be moved here? So all these > checks will be in one place and it will use the provider from the > template database (which could be icu): > > $ initdb --locale-provider icu --icu-locale en-US -D data && > pg_ctl -D data -l logfile start && > createdb --icu-locale ru-RU --template template0 mydb > ... > createdb: error: database creation failed: ERROR: ICU locale cannot be > specified unless locale provider is ICU And, I realized that this causes bigger churn than I thought. So, I'm sorry but I withdraw the comment. Thus the first proposed patch will be more or less the direction we would go. And the patch looks good to me as a whole. +errmsg("encoding \"%s\" is not supported with ICU provider", + pg_log_error("encoding \"%s\" is not supported with ICU provider", +pg_encoding_to_char(encodingid)); I might be wrong, but the messages look wrong to me. The alternatives below might work. "encoding \"%s\" is not supported by ICU" "encoding \"%s\" cannot be used for/with ICU locales" + pg_log_error_hint("Rerun %s and choose a matching combination.", + progname); This doesn't seem to provide users with useful information. regards. -- Kyotaro Horiguchi NTT Open Source Software Center
Re: ICU for global collation
On 16.09.22 09:31, Marina Polyakova wrote: IMO it is hardly understantable from the program output either - it looks like I manually chose the encoding UTF8. Maybe first inform about selected encoding?.. Yes, I included something like that in the patch just committed. diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index 6aeec8d426c52414b827686781c245291f27ed1f..348bbbeba0f5bc7ff601912bf883510d580b814c 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -2310,7 +2310,11 @@ setup_locale_encoding(void) } if (!encoding && locale_provider == COLLPROVIDER_ICU) + { encodingid = PG_UTF8; + printf(_("The default database encoding has been set to \"%s\" for a better experience with the ICU provider.\n"), + pg_encoding_to_char(encodingid)); + } else if (!encoding) { int ctype_enc;
Re: ICU for global collation
On 15.09.22 17:41, Marina Polyakova wrote: I agree with you. Here's another version of the patch. The locale/encoding checks and reports in initdb have been reordered, because now the encoding is set first and only then the ICU locale is checked. I committed something based on the first version of your patch. This reordering of the messages here was a little too much surgery for me at this point. For instance, there are also messages in #ifdef WIN32 code that would need to be reordered as well. I kept the overall structure of the code the same and just inserted the additional proposed checks. If you want to pursue the reordering of the checks and messages overall, a patch for the master branch could be considered.
Re: ICU for global collation
On 2022-09-16 07:55, Kyotaro Horiguchi wrote: At Thu, 15 Sep 2022 18:41:31 +0300, Marina Polyakova wrote in P.S. While working on the patch, I discovered that UTF8 encoding is always used for the ICU provider in initdb unless it is explicitly specified by the user: if (!encoding && locale_provider == COLLPROVIDER_ICU) encodingid = PG_UTF8; IMO this creates additional errors for locales with other encodings: $ initdb --locale de_DE.iso885915@euro --locale-provider icu --icu-locale de-DE ... initdb: error: encoding mismatch initdb: detail: The encoding you selected (UTF8) and the encoding that the selected locale uses (LATIN9) do not match. This would lead to misbehavior in various character string processing functions. initdb: hint: Rerun initdb and either do not specify an encoding explicitly, or choose a matching combination. And ICU supports many encodings, see the contents of pg_enc2icu_tbl in encnames.c... It seems to me the best default that fits almost all cases using icu locales. So, we need to specify encoding explicitly in that case. $ initdb --encoding iso-8859-15 --locale de_DE.iso885915@euro --locale-provider icu --icu-locale de-DE However, I think it is hardly understantable from the documentation. (I checked this using euc-jp [1] so it might be wrong..) [1] initdb --encoding euc-jp --locale ja_JP.eucjp --locale-provider icu --icu-locale ja-x-icu regards. Thank you! IMO it is hardly understantable from the program output either - it looks like I manually chose the encoding UTF8. Maybe first inform about selected encoding?.. diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index 6aeec8d426c52414b827686781c245291f27ed1f..348bbbeba0f5bc7ff601912bf883510d580b814c 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -2310,7 +2310,11 @@ setup_locale_encoding(void) } if (!encoding && locale_provider == COLLPROVIDER_ICU) + { encodingid = PG_UTF8; + printf(_("The default database encoding has been set to \"%s\" for a better experience with the ICU provider.\n"), + pg_encoding_to_char(encodingid)); + } else if (!encoding) { int ctype_enc; ISTM that such choices (e.g. UTF8 for Windows in some cases) are described in the documentation [1] as By default, initdb uses the locale provider libc, takes the locale settings from the environment, and determines the encoding from the locale settings. This is almost always sufficient, unless there are special requirements. [1] https://www.postgresql.org/docs/devel/app-initdb.html -- Marina Polyakova Postgres Professional: http://www.postgrespro.com The Russian Postgres Company
Re: ICU for global collation
On 2022-09-15 09:52, Kyotaro Horiguchi wrote: If I executed initdb as follows, I would be told to specify --icu-locale option. $ initdb --encoding sql-ascii --locale-provider icu hoge ... initdb: error: ICU locale must be specified However, when I reran the command, it complains about incompatible encoding this time. I think it's more user-friendly to check for the encoding compatibility before the check for missing --icu-locale option. regards. In continuation of options check: AFAICS the following checks in initdb if (locale_provider == COLLPROVIDER_ICU) { if (!icu_locale) pg_fatal("ICU locale must be specified"); /* * In supported builds, the ICU locale ID will be checked by the * backend during post-bootstrap initialization. */ #ifndef USE_ICU pg_fatal("ICU is not supported in this build"); #endif } are executed approximately when they are executed in create database after getting all the necessary data from the template database: if (dblocprovider == COLLPROVIDER_ICU) { /* * This would happen if template0 uses the libc provider but the new * database uses icu. */ if (!dbiculocale) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("ICU locale must be specified"))); } if (dblocprovider == COLLPROVIDER_ICU) check_icu_locale(dbiculocale); But perhaps the check that --icu-locale cannot be specified unless locale provider icu is chosen should also be moved here? So all these checks will be in one place and it will use the provider from the template database (which could be icu): $ initdb --locale-provider icu --icu-locale en-US -D data && pg_ctl -D data -l logfile start && createdb --icu-locale ru-RU --template template0 mydb ... createdb: error: database creation failed: ERROR: ICU locale cannot be specified unless locale provider is ICU -- Marina Polyakova Postgres Professional: http://www.postgrespro.com The Russian Postgres Company
Re: ICU for global collation
At Thu, 15 Sep 2022 18:41:31 +0300, Marina Polyakova wrote in > P.S. While working on the patch, I discovered that UTF8 encoding is > always used for the ICU provider in initdb unless it is explicitly > specified by the user: > > if (!encoding && locale_provider == COLLPROVIDER_ICU) > encodingid = PG_UTF8; > > IMO this creates additional errors for locales with other encodings: > > $ initdb --locale de_DE.iso885915@euro --locale-provider icu > --icu-locale de-DE > ... > initdb: error: encoding mismatch > initdb: detail: The encoding you selected (UTF8) and the encoding that > the selected locale uses (LATIN9) do not match. This would lead to > misbehavior in various character string processing functions. > initdb: hint: Rerun initdb and either do not specify an encoding > explicitly, or choose a matching combination. > > And ICU supports many encodings, see the contents of pg_enc2icu_tbl in > encnames.c... It seems to me the best default that fits almost all cases using icu locales. So, we need to specify encoding explicitly in that case. $ initdb --encoding iso-8859-15 --locale de_DE.iso885915@euro --locale-provider icu --icu-locale de-DE However, I think it is hardly understantable from the documentation. (I checked this using euc-jp [1] so it might be wrong..) [1] initdb --encoding euc-jp --locale ja_JP.eucjp --locale-provider icu --icu-locale ja-x-icu regards. -- Kyotaro Horiguchi NTT Open Source Software Center
Re: ICU for global collation
On Wed, Sep 14, 2022 at 05:19:34PM +0300, Marina Polyakova wrote: > I was surprised that it is allowed to create clusters/databases where the > default ICU collations do not actually work due to unsupported encodings: > > $ initdb --encoding SQL_ASCII --locale-provider icu --icu-locale en-US -D > data && > pg_ctl -D data -l logfile start && > psql -c "SELECT 'a' < 'b'" template1 > ... > waiting for server to start done > server started > ERROR: encoding "SQL_ASCII" not supported by ICU > > $ createdb --encoding SQL_ASCII --locale-provider icu --icu-locale en-US > --template template0 mydb && > psql -c "SELECT 'a' < 'b'" mydb > ERROR: encoding "SQL_ASCII" not supported by ICU > > The patch diff_check_icu_encoding.patch prohibits the creation of such > objects... Agreed that it is a bit confusing to get this type of error after the database has been created when querying it due to a mix of unsupported options. Peter? -- Michael signature.asc Description: PGP signature
Re: ICU for global collation
On 2022-09-15 09:52, Kyotaro Horiguchi wrote: If I executed initdb as follows, I would be told to specify --icu-locale option. $ initdb --encoding sql-ascii --locale-provider icu hoge ... initdb: error: ICU locale must be specified However, when I reran the command, it complains about incompatible encoding this time. I think it's more user-friendly to check for the encoding compatibility before the check for missing --icu-locale option. regards. I agree with you. Here's another version of the patch. The locale/encoding checks and reports in initdb have been reordered, because now the encoding is set first and only then the ICU locale is checked. P.S. While working on the patch, I discovered that UTF8 encoding is always used for the ICU provider in initdb unless it is explicitly specified by the user: if (!encoding && locale_provider == COLLPROVIDER_ICU) encodingid = PG_UTF8; IMO this creates additional errors for locales with other encodings: $ initdb --locale de_DE.iso885915@euro --locale-provider icu --icu-locale de-DE ... initdb: error: encoding mismatch initdb: detail: The encoding you selected (UTF8) and the encoding that the selected locale uses (LATIN9) do not match. This would lead to misbehavior in various character string processing functions. initdb: hint: Rerun initdb and either do not specify an encoding explicitly, or choose a matching combination. And ICU supports many encodings, see the contents of pg_enc2icu_tbl in encnames.c... -- Marina Polyakova Postgres Professional: http://www.postgrespro.com The Russian Postgres Companydiff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 6ff48bb18f3639ae45d9528b32df51a4aebc60c0..f248ad42b77c8c0cf2089963d4357b120914ce20 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -1034,6 +1034,12 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) if (dblocprovider == COLLPROVIDER_ICU) { + if (!(is_encoding_supported_by_icu(encoding))) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("encoding \"%s\" is not supported with ICU provider", + pg_encoding_to_char(encoding; + /* * This would happen if template0 uses the libc provider but the new * database uses icu. @@ -1042,10 +1048,9 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("ICU locale must be specified"))); - } - if (dblocprovider == COLLPROVIDER_ICU) check_icu_locale(dbiculocale); + } /* * Check that the new encoding and locale settings match the source diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index 6aeec8d426c52414b827686781c245291f27ed1f..999e7c2bf8dc707063eddf19a5c27eed03d3ca09 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -2092,20 +2092,30 @@ setlocales(void) check_locale_name(LC_CTYPE, lc_messages, ); lc_messages = canonname; #endif +} - if (locale_provider == COLLPROVIDER_ICU) +static void +check_icu_locale_encoding(void) +{ + if (!(is_encoding_supported_by_icu(encodingid))) { - if (!icu_locale) - pg_fatal("ICU locale must be specified"); + pg_log_error("encoding \"%s\" is not supported with ICU provider", + pg_encoding_to_char(encodingid)); + pg_log_error_hint("Rerun %s and choose a matching combination.", + progname); + exit(1); + } - /* - * In supported builds, the ICU locale ID will be checked by the - * backend during post-bootstrap initialization. - */ + if (!icu_locale) + pg_fatal("ICU locale must be specified"); + + /* + * In supported builds, the ICU locale ID will be checked by the backend + * during post-bootstrap initialization. + */ #ifndef USE_ICU - pg_fatal("ICU is not supported in this build"); + pg_fatal("ICU is not supported in this build"); #endif - } } /* @@ -2281,34 +2291,6 @@ setup_locale_encoding(void) { setlocales(); - if (locale_provider == COLLPROVIDER_LIBC && - strcmp(lc_ctype, lc_collate) == 0 && - strcmp(lc_ctype, lc_time) == 0 && - strcmp(lc_ctype, lc_numeric) == 0 && - strcmp(lc_ctype, lc_monetary) == 0 && - strcmp(lc_ctype, lc_messages) == 0 && - (!icu_locale || strcmp(lc_ctype, icu_locale) == 0)) - printf(_("The database cluster will be initialized with locale \"%s\".\n"), lc_ctype); - else - { - printf(_("The database cluster will be initialized with this locale configuration:\n")); - printf(_(" provider:%s\n"), collprovider_name(locale_provider)); - if (icu_locale) - printf(_(" ICU locale: %s\n"), icu_locale); - printf(_(" LC_COLLATE: %s\n" - " LC_CTYPE:%s\n" - " LC_MESSAGES: %s\n" - " LC_MONETARY: %s\n" - " LC_NUMERIC: %s\n" - " LC_TIME: %s\n"), - lc_collate, - lc_ctype, - lc_messages, - lc_monetary, - lc_numeric, - lc_time); - } - if (!encoding && locale_provider == COLLPROVIDER_ICU) encodingid =
Re: ICU for global collation
At Wed, 14 Sep 2022 17:19:34 +0300, Marina Polyakova wrote in > Hello! > > I was surprised that it is allowed to create clusters/databases where > the default ICU collations do not actually work due to unsupported > encodings: > > $ initdb --encoding SQL_ASCII --locale-provider icu --icu-locale en-US > -D data && > pg_ctl -D data -l logfile start && > psql -c "SELECT 'a' < 'b'" template1 > ... > waiting for server to start done > server started > ERROR: encoding "SQL_ASCII" not supported by ICU Indeed. If I did the following, the direction of the patch looks fine to me. If I executed initdb as follows, I would be told to specify --icu-locale option. > $ initdb --encoding sql-ascii --locale-provider icu hoge > ... > initdb: error: ICU locale must be specified However, when I reran the command, it complains about incompatible encoding this time. I think it's more user-friendly to check for the encoding compatibility before the check for missing --icu-locale option. regards. -- Kyotaro Horiguchi NTT Open Source Software Center
Re: ICU for global collation
Hello! I was surprised that it is allowed to create clusters/databases where the default ICU collations do not actually work due to unsupported encodings: $ initdb --encoding SQL_ASCII --locale-provider icu --icu-locale en-US -D data && pg_ctl -D data -l logfile start && psql -c "SELECT 'a' < 'b'" template1 ... waiting for server to start done server started ERROR: encoding "SQL_ASCII" not supported by ICU $ createdb --encoding SQL_ASCII --locale-provider icu --icu-locale en-US --template template0 mydb && psql -c "SELECT 'a' < 'b'" mydb ERROR: encoding "SQL_ASCII" not supported by ICU The patch diff_check_icu_encoding.patch prohibits the creation of such objects... -- Marina Polyakova Postgres Professional: http://www.postgrespro.com The Russian Postgres Companydiff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 6ff48bb18f3639ae45d9528b32df51a4aebc60c0..07758d15e8613d5a049537ddf2c5992e57ad6424 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -1042,11 +1042,16 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("ICU locale must be specified"))); - } - if (dblocprovider == COLLPROVIDER_ICU) check_icu_locale(dbiculocale); + if (!(is_encoding_supported_by_icu(encoding))) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("encoding \"%s\" is not supported with ICU provider", + pg_encoding_to_char(encoding; + } + /* * Check that the new encoding and locale settings match the source * database. We insist on this because we simply copy the source data --- diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index e00837ecacf4885cf2a176168c283f3e67c6eb53..8a762ced8340c9d8256f7832a4c19a43f1d5538a 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -2362,6 +2362,16 @@ setup_locale_encoding(void) if (!check_locale_encoding(lc_ctype, encodingid) || !check_locale_encoding(lc_collate, encodingid)) exit(1);/* check_locale_encoding printed the error */ + + if (locale_provider == COLLPROVIDER_ICU && + !(is_encoding_supported_by_icu(encodingid))) + { + pg_log_error("encoding \"%s\" is not supported with ICU provider", + pg_encoding_to_char(encodingid)); + pg_log_error_hint("Rerun %s and choose a matching combination.", + progname); + exit(1); + } } diff --git a/src/bin/initdb/t/001_initdb.pl b/src/bin/initdb/t/001_initdb.pl index a37f6dd9b334b6ee22d9fdd4d51422795cb54a39..e4bb3d0cdd9c23729c5fb97886374f8df558f239 100644 --- a/src/bin/initdb/t/001_initdb.pl +++ b/src/bin/initdb/t/001_initdb.pl @@ -118,6 +118,15 @@ if ($ENV{with_icu} eq 'yes') ], qr/FATAL: could not open collator for locale/, 'fails for invalid ICU locale'); + + command_fails_like( + [ + 'initdb','--no-sync', + '--locale-provider=icu', '--icu-locale=en', + '--encoding=SQL_ASCII', "$tempdir/dataX" + ], + qr/error: encoding "SQL_ASCII" is not supported with ICU provider/, + 'encoding "SQL_ASCII" is not supported with ICU provider'); } else { diff --git a/src/bin/scripts/t/020_createdb.pl b/src/bin/scripts/t/020_createdb.pl index e91c1d013d08d8bd1e3a92f2aba958c5c7713ca6..eaab3caa32669ead068719d98bb953c5c6ff5a17 100644 --- a/src/bin/scripts/t/020_createdb.pl +++ b/src/bin/scripts/t/020_createdb.pl @@ -50,6 +50,16 @@ if ($ENV{with_icu} eq 'yes') ], 'fails for invalid ICU locale'); + $node->command_fails_like( + [ + 'createdb','-T', + 'template0', '--locale-provider=icu', + '--icu-locale=en', '--encoding=SQL_ASCII', + 'foobarX' + ], + qr/ERROR: encoding "SQL_ASCII" is not supported with ICU provider/, + 'encoding "SQL_ASCII" is not supported with ICU provider'); + # additional node, which uses the icu provider my $node2 = PostgreSQL::Test::Cluster->new('icu'); $node2->init(extra => ['--locale-provider=icu', '--icu-locale=en']);
Re: ICU for global collation
On 2022-09-13 15:41, Peter Eisentraut wrote: On 13.09.22 07:34, Marina Polyakova wrote: I agree with you that it is more comfortable and more similar to what has already been done in initdb. IMO it would be easier to do it like this: diff --git a/src/bin/scripts/createdb.c b/src/bin/scripts/createdb.c index e523e58b2189275dc603a06324a2f28b0f49d8b7..a1482df3d981a680dd3322052e7c03ddacc8dc26 100644 --- a/src/bin/scripts/createdb.c +++ b/src/bin/scripts/createdb.c @@ -161,12 +161,10 @@ main(int argc, char *argv[]) if (locale) { - if (lc_ctype) - pg_fatal("only one of --locale and --lc-ctype can be specified"); - if (lc_collate) - pg_fatal("only one of --locale and --lc-collate can be specified"); - lc_ctype = locale; - lc_collate = locale; + if (!lc_ctype) + lc_ctype = locale; + if (!lc_collate) + lc_collate = locale; } if (encoding) done that way Thank you! BTW it's somewhat crummy that it uses a string comparison, so if you write "UTF8" without a dash, it says this; it took me a few minutes to see the difference... postgres=# create database a LC_COLLATE "en_US.UTF8" LC_CTYPE "en_US.UTF8" LOCALE "en_US.UTF8"; ERROR: new collation (en_US.UTF8) is incompatible with the collation of the template database (en_US.UTF-8) Perhaps we could check the locale itself with the function normalize_libc_locale_name (collationcmds.c). But ISTM that the current check is a safety net in case the function pg_get_encoding_from_locale (chklocale.c) returns -1 or PG_SQL_ASCII... This is not new behavior in PG15, is it? No, it has always existed [1] AFAICS.. [1] https://github.com/postgres/postgres/commit/61d967498802ab86d8897cb3c61740d7e9d712f6 -- Marina Polyakova Postgres Professional: http://www.postgrespro.com The Russian Postgres Company
Re: ICU for global collation
On 13.09.22 07:34, Marina Polyakova wrote: I agree with you that it is more comfortable and more similar to what has already been done in initdb. IMO it would be easier to do it like this: diff --git a/src/bin/scripts/createdb.c b/src/bin/scripts/createdb.c index e523e58b2189275dc603a06324a2f28b0f49d8b7..a1482df3d981a680dd3322052e7c03ddacc8dc26 100644 --- a/src/bin/scripts/createdb.c +++ b/src/bin/scripts/createdb.c @@ -161,12 +161,10 @@ main(int argc, char *argv[]) if (locale) { - if (lc_ctype) - pg_fatal("only one of --locale and --lc-ctype can be specified"); - if (lc_collate) - pg_fatal("only one of --locale and --lc-collate can be specified"); - lc_ctype = locale; - lc_collate = locale; + if (!lc_ctype) + lc_ctype = locale; + if (!lc_collate) + lc_collate = locale; } if (encoding) done that way Should we change the behaviour of createdb and CREATE DATABASE in previous major versions?.. I don't see a need for that. BTW it's somewhat crummy that it uses a string comparison, so if you write "UTF8" without a dash, it says this; it took me a few minutes to see the difference... postgres=# create database a LC_COLLATE "en_US.UTF8" LC_CTYPE "en_US.UTF8" LOCALE "en_US.UTF8"; ERROR: new collation (en_US.UTF8) is incompatible with the collation of the template database (en_US.UTF-8) Perhaps we could check the locale itself with the function normalize_libc_locale_name (collationcmds.c). But ISTM that the current check is a safety net in case the function pg_get_encoding_from_locale (chklocale.c) returns -1 or PG_SQL_ASCII... This is not new behavior in PG15, is it?
Re: ICU for global collation
On 2022-09-09 19:46, Justin Pryzby wrote: In pg14: |postgres=# create database a LC_COLLATE C LC_CTYPE C LOCALE C; |ERROR: conflicting or redundant options |DETAIL: LOCALE cannot be specified together with LC_COLLATE or LC_CTYPE. In pg15: |postgres=# create database a LC_COLLATE "en_US.UTF-8" LC_CTYPE "en_US.UTF-8" LOCALE "en_US.UTF-8" ; |CREATE DATABASE f2553d430 actually relaxed the restriction by removing this check: - if (dlocale && (dcollate || dctype)) - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), -errmsg("conflicting or redundant options"), -errdetail("LOCALE cannot be specified together with LC_COLLATE or LC_CTYPE."))); But isn't the right fix to do the corresponding thing in createdb (relaxing the frontend restriction rather than reverting its relaxation in the backend). diff --git a/src/bin/scripts/createdb.c b/src/bin/scripts/createdb.c index e523e58b218..5b80e56dfd9 100644 --- a/src/bin/scripts/createdb.c +++ b/src/bin/scripts/createdb.c @@ -159,15 +159,10 @@ main(int argc, char *argv[]) exit(1); } - if (locale) - { - if (lc_ctype) - pg_fatal("only one of --locale and --lc-ctype can be specified"); - if (lc_collate) - pg_fatal("only one of --locale and --lc-collate can be specified"); + if (locale && !lc_ctype) lc_ctype = locale; + if (locale && !lc_collate) lc_collate = locale; - } if (encoding) { I agree with you that it is more comfortable and more similar to what has already been done in initdb. IMO it would be easier to do it like this: diff --git a/src/bin/scripts/createdb.c b/src/bin/scripts/createdb.c index e523e58b2189275dc603a06324a2f28b0f49d8b7..a1482df3d981a680dd3322052e7c03ddacc8dc26 100644 --- a/src/bin/scripts/createdb.c +++ b/src/bin/scripts/createdb.c @@ -161,12 +161,10 @@ main(int argc, char *argv[]) if (locale) { - if (lc_ctype) - pg_fatal("only one of --locale and --lc-ctype can be specified"); - if (lc_collate) - pg_fatal("only one of --locale and --lc-collate can be specified"); - lc_ctype = locale; - lc_collate = locale; + if (!lc_ctype) + lc_ctype = locale; + if (!lc_collate) + lc_collate = locale; } if (encoding) Should we change the behaviour of createdb and CREATE DATABASE in previous major versions?.. BTW it's somewhat crummy that it uses a string comparison, so if you write "UTF8" without a dash, it says this; it took me a few minutes to see the difference... postgres=# create database a LC_COLLATE "en_US.UTF8" LC_CTYPE "en_US.UTF8" LOCALE "en_US.UTF8"; ERROR: new collation (en_US.UTF8) is incompatible with the collation of the template database (en_US.UTF-8) Perhaps we could check the locale itself with the function normalize_libc_locale_name (collationcmds.c). But ISTM that the current check is a safety net in case the function pg_get_encoding_from_locale (chklocale.c) returns -1 or PG_SQL_ASCII... -- Marina Polyakova Postgres Professional: http://www.postgrespro.com The Russian Postgres Company
Re: ICU for global collation
In pg14: |postgres=# create database a LC_COLLATE C LC_CTYPE C LOCALE C; |ERROR: conflicting or redundant options |DETAIL: LOCALE cannot be specified together with LC_COLLATE or LC_CTYPE. In pg15: |postgres=# create database a LC_COLLATE "en_US.UTF-8" LC_CTYPE "en_US.UTF-8" LOCALE "en_US.UTF-8" ; |CREATE DATABASE f2553d430 actually relaxed the restriction by removing this check: - if (dlocale && (dcollate || dctype)) - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), -errmsg("conflicting or redundant options"), -errdetail("LOCALE cannot be specified together with LC_COLLATE or LC_CTYPE."))); But isn't the right fix to do the corresponding thing in createdb (relaxing the frontend restriction rather than reverting its relaxation in the backend). diff --git a/src/bin/scripts/createdb.c b/src/bin/scripts/createdb.c index e523e58b218..5b80e56dfd9 100644 --- a/src/bin/scripts/createdb.c +++ b/src/bin/scripts/createdb.c @@ -159,15 +159,10 @@ main(int argc, char *argv[]) exit(1); } - if (locale) - { - if (lc_ctype) - pg_fatal("only one of --locale and --lc-ctype can be specified"); - if (lc_collate) - pg_fatal("only one of --locale and --lc-collate can be specified"); + if (locale && !lc_ctype) lc_ctype = locale; + if (locale && !lc_collate) lc_collate = locale; - } if (encoding) { BTW it's somewhat crummy that it uses a string comparison, so if you write "UTF8" without a dash, it says this; it took me a few minutes to see the difference... postgres=# create database a LC_COLLATE "en_US.UTF8" LC_CTYPE "en_US.UTF8" LOCALE "en_US.UTF8"; ERROR: new collation (en_US.UTF8) is incompatible with the collation of the template database (en_US.UTF-8)
Re: ICU for global collation
Hello! IMO after adding ICU for global collations [1] the behaviour of createdb and CREATE DATABASE is a bit inconsistent when both locale and lc_collate (or locale and lc_ctype) options are used: $ createdb mydb --locale C --lc-collate C --template template0 createdb: error: only one of --locale and --lc-collate can be specified $ psql -c "create database mydb locale = 'C' lc_collate = 'C' template = 'template0'" postgres CREATE DATABASE From the CREATE DATABASE documentation [2]: locale This is a shortcut for setting LC_COLLATE and LC_CTYPE at once. If you specify this, you cannot specify either of those parameters. The patch diff_return_back_create_database_error.patch returns back the removed code for CREATE DATABASE so it behaves like createdb as before... [1] https://github.com/postgres/postgres/commit/f2553d43060edb210b36c63187d52a632448e1d2 [2] https://www.postgresql.org/docs/devel/sql-createdatabase.html -- Marina Polyakova Postgres Professional: http://www.postgrespro.com The Russian Postgres Companydiff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 6ff48bb18f3639ae45d9528b32df51a4aebc60c0..0a22cace11d9df6b5fc085bfd7b86319f4b13165 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -851,6 +851,12 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) parser_errposition(pstate, defel->location))); } + if (dlocale && (dcollate || dctype)) + ereport(ERROR, +(errcode(ERRCODE_SYNTAX_ERROR), + errmsg("conflicting or redundant options"), + errdetail("LOCALE cannot be specified together with LC_COLLATE or LC_CTYPE."))); + if (downer && downer->arg) dbowner = defGetString(downer); if (dtemplate && dtemplate->arg)
Re: ICU for global collation
On Wed, Aug 24, 2022 at 08:28:24PM +0200, Peter Eisentraut wrote: > Committed, thanks. > > (This should conclude all the issues discussed in this thread recently.) Please note that this open item was still listed as open. I have closed it now. -- Michael signature.asc Description: PGP signature
Re: ICU for global collation
On 24.08.22 10:59, Julien Rouhaud wrote: I have not tested the patch in details but this looks rather sane to me on a quick read. Peter? Patch looks good to me too. Committed, thanks. (This should conclude all the issues discussed in this thread recently.)
Re: ICU for global collation
On Wed, Aug 24, 2022 at 01:38:44PM +0900, Michael Paquier wrote: > On Tue, Aug 23, 2022 at 08:59:02PM +0300, Marina Polyakova wrote: > > My colleague Andrew Bille found another bug in master > > (b4e936859dc441102eb0b6fb7a104f3948c90490) and REL_15_STABLE > > (2c63b0930aee1bb5c265fad4a65c9d0b62b1f9da): pg_collation.colliculocale is > > not dumped. See check_icu_locale.sh: > > > > In the old cluster: > > SELECT collname, colliculocale FROM pg_collation WHERE collname = > > 'testcoll_backwards' > > collname | colliculocale > > +--- > > testcoll_backwards | @colBackwards=yes > > (1 row) > > > > In the new cluster: > > SELECT collname, colliculocale FROM pg_collation WHERE collname = > > 'testcoll_backwards' > > collname | colliculocale > > +--- > > testcoll_backwards | > > (1 row) > > > > diff_dump_colliculocale.patch works for me. > > Ugh. Good catch, again! +1 > I have not tested the patch in details but > this looks rather sane to me on a quick read. Peter? Patch looks good to me too.
Re: ICU for global collation
On Tue, Aug 23, 2022 at 08:59:02PM +0300, Marina Polyakova wrote: > My colleague Andrew Bille found another bug in master > (b4e936859dc441102eb0b6fb7a104f3948c90490) and REL_15_STABLE > (2c63b0930aee1bb5c265fad4a65c9d0b62b1f9da): pg_collation.colliculocale is > not dumped. See check_icu_locale.sh: > > In the old cluster: > SELECT collname, colliculocale FROM pg_collation WHERE collname = > 'testcoll_backwards' > collname | colliculocale > +--- > testcoll_backwards | @colBackwards=yes > (1 row) > > In the new cluster: > SELECT collname, colliculocale FROM pg_collation WHERE collname = > 'testcoll_backwards' > collname | colliculocale > +--- > testcoll_backwards | > (1 row) > > diff_dump_colliculocale.patch works for me. Ugh. Good catch, again! I have not tested the patch in details but this looks rather sane to me on a quick read. Peter? -- Michael signature.asc Description: PGP signature
Re: ICU for global collation
My colleague Andrew Bille found another bug in master (b4e936859dc441102eb0b6fb7a104f3948c90490) and REL_15_STABLE (2c63b0930aee1bb5c265fad4a65c9d0b62b1f9da): pg_collation.colliculocale is not dumped. See check_icu_locale.sh: In the old cluster: SELECT collname, colliculocale FROM pg_collation WHERE collname = 'testcoll_backwards' collname | colliculocale +--- testcoll_backwards | @colBackwards=yes (1 row) In the new cluster: SELECT collname, colliculocale FROM pg_collation WHERE collname = 'testcoll_backwards' collname | colliculocale +--- testcoll_backwards | (1 row) diff_dump_colliculocale.patch works for me. -- Marina Polyakova Postgres Professional: http://www.postgrespro.com The Russian Postgres Companyinitdb -D data_old && pg_ctl -D data_old -l logfile_old start && psql -ac "CREATE COLLATION testcoll_backwards (provider = icu, locale = '@colBackwards=yes')" postgres && echo "In the old cluster:" && psql -ac "SELECT collname, colliculocale FROM pg_collation WHERE collname = 'testcoll_backwards'" postgres && pg_dump postgres > dump_postgres.sql && pg_ctl -D data_old stop && initdb -D data_new && pg_ctl -D data_new -l logfile_new start && psql -v ON_ERROR_STOP=1 -f dump_postgres.sql postgres && echo "In the new cluster:" && psql -ac "SELECT collname, colliculocale FROM pg_collation WHERE collname = 'testcoll_backwards'" postgres && pg_ctl -D data_new stop diff --git a/src/bin/pg_dump/Makefile b/src/bin/pg_dump/Makefile index 2f524b09bf53a55037013d78148f8cbca4fa7eee..9dc5a784dd2d1ce58a6284f19c54730364779c4d 100644 --- a/src/bin/pg_dump/Makefile +++ b/src/bin/pg_dump/Makefile @@ -17,6 +17,7 @@ top_builddir = ../../.. include $(top_builddir)/src/Makefile.global export GZIP_PROGRAM=$(GZIP) +export with_icu override CPPFLAGS := -I$(libpq_srcdir) $(CPPFLAGS) LDFLAGS_INTERNAL += -L$(top_builddir)/src/fe_utils -lpgfeutils $(libpq_pgport) diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index 2c6891573296b395c5bd6b627d061de657355e9c..e8e8f69e30c8a787e5dff157f5d1619917638d7d 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -13084,9 +13084,11 @@ dumpCollation(Archive *fout, const CollInfo *collinfo) int i_collisdeterministic; int i_collcollate; int i_collctype; + int i_colliculocale; const char *collprovider; const char *collcollate; const char *collctype; + const char *colliculocale; /* Do nothing in data-only dump */ if (dopt->dataOnly) @@ -13117,6 +13119,13 @@ dumpCollation(Archive *fout, const CollInfo *collinfo) appendPQExpBufferStr(query, "true AS collisdeterministic, "); + if (fout->remoteVersion >= 15) + appendPQExpBufferStr(query, + "colliculocale, "); + else + appendPQExpBufferStr(query, + "NULL AS colliculocale, "); + appendPQExpBuffer(query, "collcollate, " "collctype " @@ -13130,10 +13139,24 @@ dumpCollation(Archive *fout, const CollInfo *collinfo) i_collisdeterministic = PQfnumber(res, "collisdeterministic"); i_collcollate = PQfnumber(res, "collcollate"); i_collctype = PQfnumber(res, "collctype"); + i_colliculocale = PQfnumber(res, "colliculocale"); collprovider = PQgetvalue(res, 0, i_collprovider); - collcollate = PQgetvalue(res, 0, i_collcollate); - collctype = PQgetvalue(res, 0, i_collctype); + + if (!PQgetisnull(res, 0, i_collcollate)) + collcollate = PQgetvalue(res, 0, i_collcollate); + else + collcollate = NULL; + + if (!PQgetisnull(res, 0, i_collctype)) + collctype = PQgetvalue(res, 0, i_collctype); + else + collctype = NULL; + + if (!PQgetisnull(res, 0, i_colliculocale)) + colliculocale = PQgetvalue(res, 0, i_colliculocale); + else + colliculocale = NULL; appendPQExpBuffer(delq, "DROP COLLATION %s;\n", fmtQualifiedDumpable(collinfo)); @@ -13156,17 +13179,28 @@ dumpCollation(Archive *fout, const CollInfo *collinfo) if (strcmp(PQgetvalue(res, 0, i_collisdeterministic), "f") == 0) appendPQExpBufferStr(q, ", deterministic = false"); - if (strcmp(collcollate, collctype) == 0) + if (colliculocale != NULL) { appendPQExpBufferStr(q, ", locale = "); - appendStringLiteralAH(q, collcollate, fout); + appendStringLiteralAH(q, colliculocale, fout); } else { - appendPQExpBufferStr(q, ", lc_collate = "); - appendStringLiteralAH(q, collcollate, fout); - appendPQExpBufferStr(q, ", lc_ctype = "); - appendStringLiteralAH(q, collctype, fout); + Assert(collcollate != NULL); + Assert(collctype != NULL); + + if (strcmp(collcollate, collctype) == 0) + { + appendPQExpBufferStr(q, ", locale = "); + appendStringLiteralAH(q, collcollate, fout); + } + else + { + appendPQExpBufferStr(q, ", lc_collate = "); + appendStringLiteralAH(q, collcollate, fout); + appendPQExpBufferStr(q, ", lc_ctype = "); + appendStringLiteralAH(q, collctype, fout); + } } /* diff --git a/src/bin/pg_dump/t/002_pg_dump.pl
Re: ICU for global collation
On Mon, Aug 22, 2022 at 04:10:59PM +0200, Peter Eisentraut wrote: > I think this piece of code was left over from some earlier attempts to > specify the libc locale and the icu locale with one option, which never > really worked well. The CREATE DATABASE man page does not mention that > LOCALE provides the default for ICU_LOCALE. Hence, I think we should delete > this. As of 36f729e, is there anything left to address on this thread or should this open item be closed? -- Michael signature.asc Description: PGP signature
Re: ICU for global collation
On 2022-08-22 17:10, Peter Eisentraut wrote: On 15.08.22 14:06, Marina Polyakova wrote: 1.1) It looks like there's a bug in the function get_db_infos (src/bin/pg_upgrade/info.c), where the version of the old cluster is always checked: if (GET_MAJOR_VERSION(old_cluster.major_version) < 1500) snprintf(query + strlen(query), sizeof(query) - strlen(query), "'c' AS datlocprovider, NULL AS daticulocale, "); else snprintf(query + strlen(query), sizeof(query) - strlen(query), "datlocprovider, daticulocale, "); With the simple patch diff --git a/src/bin/pg_upgrade/info.c b/src/bin/pg_upgrade/info.c index df374ce4b362b4c6c87fc1fd0e476e5d6d353d9e..53ea348e211d3ac38334292bc16cb814bc13bb87 100644 --- a/src/bin/pg_upgrade/info.c +++ b/src/bin/pg_upgrade/info.c @@ -319,7 +319,7 @@ get_db_infos(ClusterInfo *cluster) snprintf(query, sizeof(query), "SELECT d.oid, d.datname, d.encoding, d.datcollate, d.datctype, "); - if (GET_MAJOR_VERSION(old_cluster.major_version) < 1500) + if (GET_MAJOR_VERSION(cluster->major_version) < 1500) snprintf(query + strlen(query), sizeof(query) - strlen(query), "'c' AS datlocprovider, NULL AS daticulocale, "); else fixed 1.2) It looks like the mentioned asserion in dbcommands.c conflicts with the following lines earlier: if (dbiculocale == NULL) dbiculocale = src_iculocale; fixed I'm wondering if this is not a fully-supported feature (because createdb creates an SQL command with LC_COLLATE and LC_CTYPE options instead of LOCALE option) or is it a bug in CREATE DATABASE?.. From src/backend/commands/dbcommands.c: if (dblocprovider == COLLPROVIDER_ICU && !dbiculocale) { if (dlocale && dlocale->arg) dbiculocale = defGetString(dlocale); } I think this piece of code was left over from some earlier attempts to specify the libc locale and the icu locale with one option, which never really worked well. The CREATE DATABASE man page does not mention that LOCALE provides the default for ICU_LOCALE. Hence, I think we should delete this. Thank you! -- Marina Polyakova Postgres Professional: http://www.postgrespro.com The Russian Postgres Company
Re: ICU for global collation
On 15.08.22 14:06, Marina Polyakova wrote: 1.1) It looks like there's a bug in the function get_db_infos (src/bin/pg_upgrade/info.c), where the version of the old cluster is always checked: if (GET_MAJOR_VERSION(old_cluster.major_version) < 1500) snprintf(query + strlen(query), sizeof(query) - strlen(query), "'c' AS datlocprovider, NULL AS daticulocale, "); else snprintf(query + strlen(query), sizeof(query) - strlen(query), "datlocprovider, daticulocale, "); With the simple patch diff --git a/src/bin/pg_upgrade/info.c b/src/bin/pg_upgrade/info.c index df374ce4b362b4c6c87fc1fd0e476e5d6d353d9e..53ea348e211d3ac38334292bc16cb814bc13bb87 100644 --- a/src/bin/pg_upgrade/info.c +++ b/src/bin/pg_upgrade/info.c @@ -319,7 +319,7 @@ get_db_infos(ClusterInfo *cluster) snprintf(query, sizeof(query), "SELECT d.oid, d.datname, d.encoding, d.datcollate, d.datctype, "); - if (GET_MAJOR_VERSION(old_cluster.major_version) < 1500) + if (GET_MAJOR_VERSION(cluster->major_version) < 1500) snprintf(query + strlen(query), sizeof(query) - strlen(query), "'c' AS datlocprovider, NULL AS daticulocale, "); else fixed 1.2) It looks like the mentioned asserion in dbcommands.c conflicts with the following lines earlier: if (dbiculocale == NULL) dbiculocale = src_iculocale; fixed I'm wondering if this is not a fully-supported feature (because createdb creates an SQL command with LC_COLLATE and LC_CTYPE options instead of LOCALE option) or is it a bug in CREATE DATABASE?.. From src/backend/commands/dbcommands.c: if (dblocprovider == COLLPROVIDER_ICU && !dbiculocale) { if (dlocale && dlocale->arg) dbiculocale = defGetString(dlocale); } I think this piece of code was left over from some earlier attempts to specify the libc locale and the icu locale with one option, which never really worked well. The CREATE DATABASE man page does not mention that LOCALE provides the default for ICU_LOCALE. Hence, I think we should delete this.
Re: ICU for global collation
On 2022-08-17 19:53, Julien Rouhaud wrote: Good catch. There's unfortunately not a lot of regression tests for ICU-initialized clusters. I'm wondering if the build-farm client could be taught about the locale provider rather than assuming libc. Clearly that wouldn't have caught this issue, but it should still increase the coverage a bit (I'm thinking of the recent problem with the abbreviated keys). Looking at installchecks with different locales (e.g. see [1] with sv_SE.UTF-8) - why not?.. diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index b31a30550b025d48ba3cc250dc4c15f41f9a80be..17a2942341e528c01182fb6d4878580f2706bec9 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -1048,6 +1048,8 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("ICU locale must be specified"))); } + else + dbiculocale = NULL; if (dblocprovider == COLLPROVIDER_ICU) check_icu_locale(dbiculocale); I think it would be better to do that in the various variable initialization. Maybe switch the dblocprovider and dbiculocale initialization, and only initialize dbiculocale if dblocprovider is COLLPROVIDER_ICU. diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index b31a30550b025d48ba3cc250dc4c15f41f9a80be..883f381f3453142790f728a3725586cebe2e2f20 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -1012,10 +1012,10 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) dbcollate = src_collate; if (dbctype == NULL) dbctype = src_ctype; - if (dbiculocale == NULL) - dbiculocale = src_iculocale; if (dblocprovider == '\0') dblocprovider = src_locprovider; + if (dbiculocale == NULL && dblocprovider == COLLPROVIDER_ICU) + dbiculocale = src_iculocale; /* Some encodings are client only */ if (!PG_VALID_BE_ENCODING(encoding)) Then it seemed to me that it was easier to first get all the parameters from the template database as usual and then process them as needed. But with your suggestion the failed assertion will check the code above more accurately... This discrepancy between createdb and CREATE DATABASE looks like an oversight, as createdb indeed interprets --locale as: if (locale) { if (lc_ctype) pg_fatal("only one of --locale and --lc-ctype can be specified"); if (lc_collate) pg_fatal("only one of --locale and --lc-collate can be specified"); lc_ctype = locale; lc_collate = locale; } AFAIK the fallback in the CREATE DATABASE case is expected as POSIX locale names should be accepted by icu, so this should work for createdb too. Oh, great, thanks! > > I spent some time looking at the ICU api trying to figure out if using a > > posix locale name (e.g. en_US) was actually compatible with an ICU locale name. > > It seems that ICU accept any of 'en-us', 'en-US', 'en_us' or 'en_US' as the > > same locale, but I might be wrong. I also didn't find a way to figure out how > > to ask ICU if the locale identifier passed is complete garbage or not. One > > sure thing is that the system collation we import are of the form 'en-us', so > > it seems weird to have this form in pg_collation and by default another form in > > pg_database. > > Yeah it seems to be inconsistent about that. The locale ID documentation > appears to indicate that "en_US" is the canonical form, but when you ask it > to list all the locales it knows about it returns "en-US". Yeah I saw that too when checking is POSIX locale names were valid, and that's not great. I'm sorry but IIUC pg_import_system_collations uses uloc_getAvailable to get the locale ID and then specifically calls uloc_toLanguageTag?.. I don't think that initdb --collation-provider icu should be allowed without --icu-locale, same for --collation-provider libc *with* --icu-locale. > initdb has some specific processing to transform the default libc locale to > something more appropriate, but as far as I can see creatdb / CREATE DATABASE > aren't doing that. It seems inconsistent, and IMHO another reason why > defaulting to the libc locale looks like a bad idea. This has all been removed. The separate ICU locale option should now be required everywhere (initdb, createdb, CREATE DATABASE). If it's a feature and not a bug in CREATE DATABASE, why should not it work in initdb too? Here we define locale/lc_collate/lc_ctype for the first 3 databases in the cluster in much the same way... P.S. FYI there seems to be a bug for very old ICU versions: in master
Re: ICU for global collation
Hi, On Mon, Aug 15, 2022 at 03:06:32PM +0300, Marina Polyakova wrote: > > 1.1) It looks like there's a bug in the function get_db_infos > (src/bin/pg_upgrade/info.c), where the version of the old cluster is always > checked: > > if (GET_MAJOR_VERSION(old_cluster.major_version) < 1500) > snprintf(query + strlen(query), sizeof(query) - strlen(query), >"'c' AS datlocprovider, NULL AS daticulocale, "); > else > snprintf(query + strlen(query), sizeof(query) - strlen(query), >"datlocprovider, daticulocale, "); > > With the simple patch > > diff --git a/src/bin/pg_upgrade/info.c b/src/bin/pg_upgrade/info.c > index > df374ce4b362b4c6c87fc1fd0e476e5d6d353d9e..53ea348e211d3ac38334292bc16cb814bc13bb87 > 100644 > --- a/src/bin/pg_upgrade/info.c > +++ b/src/bin/pg_upgrade/info.c > @@ -319,7 +319,7 @@ get_db_infos(ClusterInfo *cluster) > > snprintf(query, sizeof(query), >"SELECT d.oid, d.datname, d.encoding, d.datcollate, > d.datctype, "); > - if (GET_MAJOR_VERSION(old_cluster.major_version) < 1500) > + if (GET_MAJOR_VERSION(cluster->major_version) < 1500) > snprintf(query + strlen(query), sizeof(query) - strlen(query), >"'c' AS datlocprovider, NULL AS daticulocale, > "); > else > > I got the expected error during the upgrade: > > locale providers for database "template1" do not match: old "libc", new > "icu" > Failure, exiting Good catch. There's unfortunately not a lot of regression tests for ICU-initialized clusters. I'm wondering if the build-farm client could be taught about the locale provider rather than assuming libc. Clearly that wouldn't have caught this issue, but it should still increase the coverage a bit (I'm thinking of the recent problem with the abbreviated keys). > 1.2) It looks like the mentioned asserion in dbcommands.c conflicts with the > following lines earlier: > > if (dbiculocale == NULL) > dbiculocale = src_iculocale; > > The following patch works for me: > > diff --git a/src/backend/commands/dbcommands.c > b/src/backend/commands/dbcommands.c > index > b31a30550b025d48ba3cc250dc4c15f41f9a80be..17a2942341e528c01182fb6d4878580f2706bec9 > 100644 > --- a/src/backend/commands/dbcommands.c > +++ b/src/backend/commands/dbcommands.c > @@ -1048,6 +1048,8 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) > > (errcode(ERRCODE_INVALID_PARAMETER_VALUE), >errmsg("ICU locale must be > specified"))); > } > + else > + dbiculocale = NULL; > > if (dblocprovider == COLLPROVIDER_ICU) > check_icu_locale(dbiculocale); I think it would be better to do that in the various variable initialization. Maybe switch the dblocprovider and dbiculocale initialization, and only initialize dbiculocale if dblocprovider is COLLPROVIDER_ICU. > 2) CREATE DATABASE does not always require the icu locale unlike initdb and > createdb: > > $ createdb mydb --locale en_US.UTF-8 --template template0 --locale-provider > icu > createdb: error: database creation failed: ERROR: ICU locale must be > specified > > $ psql -c "CREATE DATABASE mydb LOCALE \"en_US.UTF-8\" TEMPLATE template0 > LOCALE_PROVIDER icu" postgres > CREATE DATABASE > > I'm wondering if this is not a fully-supported feature (because createdb > creates an SQL command with LC_COLLATE and LC_CTYPE options instead of > LOCALE option) or is it a bug in CREATE DATABASE?.. From > src/backend/commands/dbcommands.c: > > if (dblocprovider == COLLPROVIDER_ICU && !dbiculocale) > { > if (dlocale && dlocale->arg) > dbiculocale = defGetString(dlocale); > } This discrepancy between createdb and CREATE DATABASE looks like an oversight, as createdb indeed interprets --locale as: if (locale) { if (lc_ctype) pg_fatal("only one of --locale and --lc-ctype can be specified"); if (lc_collate) pg_fatal("only one of --locale and --lc-collate can be specified"); lc_ctype = locale; lc_collate = locale; } AFAIK the fallback in the CREATE DATABASE case is expected as POSIX locale names should be accepted by icu, so this should work for createdb too.
Re: ICU for global collation
Hello everyone in this thread! While reading and testing the patch that adds ICU for global collations [1] I noticed on master (1c5818b9c68e5c2ac8f19d372f24cce409de1a26) and REL_15_STABLE (63b64d8270691894a9a8f2d4e929e7780020edb8) that: 1) pg_upgrade from REL_14_STABLE (63b64d8270691894a9a8f2d4e929e7780020edb8) does not always work: For REL_14_STABLE: $ initdb -D data_old For REL_15_STABLE or master: $ initdb -D data_new --locale-provider icu --icu-locale ru-RU $ pg_upgrade -d .../data_old -D data_new -b ... -B ... ... Restoring database schemas in the new cluster template1 *failure* Consult the last few lines of "data_new/pg_upgrade_output.d/20220815T142454.223/log/pg_upgrade_dump_1.log" for the probable cause of the failure. Failure, exiting In data_new/pg_upgrade_output.d/20220815T142454.223/log/pg_upgrade_dump_1.log: pg_restore: error: could not execute query: server closed the connection unexpectedly This probably means the server terminated abnormally before or while processing the request. Command was: CREATE DATABASE "template1" WITH TEMPLATE = template0 OID = 1 ENCODING = 'UTF8' LOCALE_PROVIDER = libc LOCALE = 'en_US.UTF-8'; In data_new/pg_upgrade_output.d/20220815T142454.223/log/pg_upgrade_server.log: TRAP: FailedAssertion("(dblocprovider == COLLPROVIDER_ICU && dbiculocale) || (dblocprovider != COLLPROVIDER_ICU && !dbiculocale)", File: "dbcommands.c", Line: 1292, PID: 69247) postgres: marina postgres [local] CREATE DATABASE(ExceptionalCondition+0xb9)[0xb4d8ec] postgres: marina postgres [local] CREATE DATABASE(createdb+0x1abc)[0x68ca99] postgres: marina postgres [local] CREATE DATABASE(standard_ProcessUtility+0x651)[0x9b1d82] postgres: marina postgres [local] CREATE DATABASE(ProcessUtility+0x122)[0x9b172a] postgres: marina postgres [local] CREATE DATABASE[0x9b01cf] postgres: marina postgres [local] CREATE DATABASE[0x9b0433] postgres: marina postgres [local] CREATE DATABASE(PortalRun+0x2fe)[0x9af95d] postgres: marina postgres [local] CREATE DATABASE[0x9a953b] postgres: marina postgres [local] CREATE DATABASE(PostgresMain+0x733)[0x9ada6b] postgres: marina postgres [local] CREATE DATABASE[0x8ec632] postgres: marina postgres [local] CREATE DATABASE[0x8ebfbb] postgres: marina postgres [local] CREATE DATABASE[0x8e8653] postgres: marina postgres [local] CREATE DATABASE(PostmasterMain+0x1226)[0x8e7f26] postgres: marina postgres [local] CREATE DATABASE[0x7bbccb] /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf3)[0x7eff082f90b3] postgres: marina postgres [local] CREATE DATABASE(_start+0x2e)[0x49c29e] 2022-08-15 14:24:56.124 MSK [69231] LOG: server process (PID 69247) was terminated by signal 6: Aborted 2022-08-15 14:24:56.124 MSK [69231] DETAIL: Failed process was running: CREATE DATABASE "template1" WITH TEMPLATE = template0 OID = 1 ENCODING = 'UTF8' LOCALE_PROVIDER = libc LOCALE = 'en_US.UTF-8'; 1.1) It looks like there's a bug in the function get_db_infos (src/bin/pg_upgrade/info.c), where the version of the old cluster is always checked: if (GET_MAJOR_VERSION(old_cluster.major_version) < 1500) snprintf(query + strlen(query), sizeof(query) - strlen(query), "'c' AS datlocprovider, NULL AS daticulocale, "); else snprintf(query + strlen(query), sizeof(query) - strlen(query), "datlocprovider, daticulocale, "); With the simple patch diff --git a/src/bin/pg_upgrade/info.c b/src/bin/pg_upgrade/info.c index df374ce4b362b4c6c87fc1fd0e476e5d6d353d9e..53ea348e211d3ac38334292bc16cb814bc13bb87 100644 --- a/src/bin/pg_upgrade/info.c +++ b/src/bin/pg_upgrade/info.c @@ -319,7 +319,7 @@ get_db_infos(ClusterInfo *cluster) snprintf(query, sizeof(query), "SELECT d.oid, d.datname, d.encoding, d.datcollate, d.datctype, "); - if (GET_MAJOR_VERSION(old_cluster.major_version) < 1500) + if (GET_MAJOR_VERSION(cluster->major_version) < 1500) snprintf(query + strlen(query), sizeof(query) - strlen(query), "'c' AS datlocprovider, NULL AS daticulocale, "); else I got the expected error during the upgrade: locale providers for database "template1" do not match: old "libc", new "icu" Failure, exiting 1.2) It looks like the mentioned asserion in dbcommands.c conflicts with the following lines earlier: if (dbiculocale == NULL) dbiculocale = src_iculocale; The following patch works for me: diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index b31a30550b025d48ba3cc250dc4c15f41f9a80be..17a2942341e528c01182fb6d4878580f2706bec9 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -1048,6 +1048,8 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("ICU locale must be
Re: ICU for global collation
On Mon, Jun 27, 2022 at 09:10:29AM +0200, Peter Eisentraut wrote: > On 27.06.22 08:42, Michael Paquier wrote: > > On Mon, Jun 27, 2022 at 08:23:59AM +0200, Peter Eisentraut wrote: > > > On 26.06.22 05:51, Julien Rouhaud wrote: > > > > On Sat, Jun 25, 2022 at 10:19:30AM -0500, Justin Pryzby wrote: > > > > > > + if (GET_MAJOR_VERSION(old_cluster.major_version) <= 1500) > > > > > > I think the fix here is to change <= to < ? > > > > Yes. > > Ok, committed. > > (I see now that in the context of pg_upgrade, writing <= 1400 is equivalent, > but I find that confusing, so I did < 1500.) I suggested using <= 1400 for consistency with the other code, and per bc1fbc960. But YMMV. -- Justin
Re: ICU for global collation
On 27.06.22 08:42, Michael Paquier wrote: On Mon, Jun 27, 2022 at 08:23:59AM +0200, Peter Eisentraut wrote: On 26.06.22 05:51, Julien Rouhaud wrote: On Sat, Jun 25, 2022 at 10:19:30AM -0500, Justin Pryzby wrote: + if (GET_MAJOR_VERSION(old_cluster.major_version) <= 1500) I think the fix here is to change <= to < ? Yes. Ok, committed. (I see now that in the context of pg_upgrade, writing <= 1400 is equivalent, but I find that confusing, so I did < 1500.)
Re: ICU for global collation
On Mon, Jun 27, 2022 at 08:23:59AM +0200, Peter Eisentraut wrote: > On 26.06.22 05:51, Julien Rouhaud wrote: >> On Sat, Jun 25, 2022 at 10:19:30AM -0500, Justin Pryzby wrote: + if (GET_MAJOR_VERSION(old_cluster.major_version) <= 1500) > > I think the fix here is to change <= to < ? Yes. -- Michael signature.asc Description: PGP signature
Re: ICU for global collation
On 26.06.22 05:51, Julien Rouhaud wrote: Hi, On Sat, Jun 25, 2022 at 10:19:30AM -0500, Justin Pryzby wrote: commit f2553d43060edb210b36c63187d52a632448e1d2 says >=1500 in a few places, but in pg_upgrade says <=1500, which looks wrong for upgrades from v15. I think it should say <= 1400. On Wed, Feb 02, 2022 at 02:01:23PM +0100, Peter Eisentraut wrote: diff --git a/src/bin/pg_upgrade/info.c b/src/bin/pg_upgrade/info.c index 69ef23119f..2a9ca0e389 100644 --- a/src/bin/pg_upgrade/info.c +++ b/src/bin/pg_upgrade/info.c @@ -312,11 +312,20 @@ get_db_infos(ClusterInfo *cluster) i_spclocation; charquery[QUERY_ALLOC]; snprintf(query, sizeof(query), -"SELECT d.oid, d.datname, d.encoding, d.datcollate, d.datctype, " +"SELECT d.oid, d.datname, d.encoding, d.datcollate, d.datctype, "); + if (GET_MAJOR_VERSION(old_cluster.major_version) <= 1500) I think the fix here is to change <= to < ? + snprintf(query + strlen(query), sizeof(query) - strlen(query), +"'c' AS datcollprovider, NULL AS daticucoll, "); + else + snprintf(query + strlen(query), sizeof(query) - strlen(query), +"datcollprovider, daticucoll, "); + snprintf(query + strlen(query), sizeof(query) - strlen(query), "pg_catalog.pg_tablespace_location(t.oid) AS spclocation " "FROM pg_catalog.pg_database d " " LEFT OUTER JOIN pg_catalog.pg_tablespace t " Indeed!
Re: ICU for global collation
On Sun, Jun 26, 2022 at 11:51:24AM +0800, Julien Rouhaud wrote: > On Sat, Jun 25, 2022 at 10:19:30AM -0500, Justin Pryzby wrote: >>> + if (GET_MAJOR_VERSION(old_cluster.major_version) <= 1500) >>> + snprintf(query + strlen(query), sizeof(query) - strlen(query), >>> +"'c' AS datcollprovider, NULL AS daticucoll, >>> "); >>> + else >>> + snprintf(query + strlen(query), sizeof(query) - strlen(query), >>> +"datcollprovider, daticucoll, "); >>> + snprintf(query + strlen(query), sizeof(query) - strlen(query), >>> "pg_catalog.pg_tablespace_location(t.oid) AS >>> spclocation " >>> "FROM pg_catalog.pg_database d " >>> " LEFT OUTER JOIN pg_catalog.pg_tablespace t " > > Indeed! Oops. Beta2 tagging is very close by, so I think that it would be better to not take a risk on that now, and this is an issue only when upgrading from v15 where datcollprovider is ICU for a database. As things stand, someone using beta1 with this new feature, running pg_upgrade to beta2 would lose any non-libc locale provider set. -- Michael signature.asc Description: PGP signature
Re: ICU for global collation
Hi, On Sat, Jun 25, 2022 at 10:19:30AM -0500, Justin Pryzby wrote: > commit f2553d43060edb210b36c63187d52a632448e1d2 says >=1500 in a few places, > but in pg_upgrade says <=1500, which looks wrong for upgrades from v15. > I think it should say <= 1400. > > On Wed, Feb 02, 2022 at 02:01:23PM +0100, Peter Eisentraut wrote: > > diff --git a/src/bin/pg_upgrade/info.c b/src/bin/pg_upgrade/info.c > > index 69ef23119f..2a9ca0e389 100644 > > --- a/src/bin/pg_upgrade/info.c > > +++ b/src/bin/pg_upgrade/info.c > > @@ -312,11 +312,20 @@ get_db_infos(ClusterInfo *cluster) > > i_spclocation; > > charquery[QUERY_ALLOC]; > > > > snprintf(query, sizeof(query), > > -"SELECT d.oid, d.datname, d.encoding, d.datcollate, > > d.datctype, " > > +"SELECT d.oid, d.datname, d.encoding, d.datcollate, > > d.datctype, "); > > + if (GET_MAJOR_VERSION(old_cluster.major_version) <= 1500) > > + snprintf(query + strlen(query), sizeof(query) - strlen(query), > > +"'c' AS datcollprovider, NULL AS daticucoll, > > "); > > + else > > + snprintf(query + strlen(query), sizeof(query) - strlen(query), > > +"datcollprovider, daticucoll, "); > > + snprintf(query + strlen(query), sizeof(query) - strlen(query), > > "pg_catalog.pg_tablespace_location(t.oid) AS > > spclocation " > > "FROM pg_catalog.pg_database d " > > " LEFT OUTER JOIN pg_catalog.pg_tablespace t " Indeed!
Re: ICU for global collation
commit f2553d43060edb210b36c63187d52a632448e1d2 says >=1500 in a few places, but in pg_upgrade says <=1500, which looks wrong for upgrades from v15. I think it should say <= 1400. On Wed, Feb 02, 2022 at 02:01:23PM +0100, Peter Eisentraut wrote: > --- a/src/bin/pg_dump/pg_dump.c > +++ b/src/bin/pg_dump/pg_dump.c > @@ -2792,6 +2794,10 @@ dumpDatabase(Archive *fout) > appendPQExpBuffer(dbQry, "datminmxid, "); > else > appendPQExpBuffer(dbQry, "0 AS datminmxid, "); > + if (fout->remoteVersion >= 15) > + appendPQExpBuffer(dbQry, "datcollprovider, "); > + else > + appendPQExpBuffer(dbQry, "'c' AS datcollprovider, "); > appendPQExpBuffer(dbQry, > "(SELECT spcname FROM pg_tablespace t > WHERE t.oid = dattablespace) AS tablespace, " > "shobj_description(oid, > 'pg_database') AS description " > diff --git a/src/bin/pg_upgrade/info.c b/src/bin/pg_upgrade/info.c > index 69ef23119f..2a9ca0e389 100644 > --- a/src/bin/pg_upgrade/info.c > +++ b/src/bin/pg_upgrade/info.c > @@ -312,11 +312,20 @@ get_db_infos(ClusterInfo *cluster) > i_spclocation; > charquery[QUERY_ALLOC]; > > snprintf(query, sizeof(query), > - "SELECT d.oid, d.datname, d.encoding, d.datcollate, > d.datctype, " > + "SELECT d.oid, d.datname, d.encoding, d.datcollate, > d.datctype, "); > + if (GET_MAJOR_VERSION(old_cluster.major_version) <= 1500) > + snprintf(query + strlen(query), sizeof(query) - strlen(query), > + "'c' AS datcollprovider, NULL AS daticucoll, > "); > + else > + snprintf(query + strlen(query), sizeof(query) - strlen(query), > + "datcollprovider, daticucoll, "); > + snprintf(query + strlen(query), sizeof(query) - strlen(query), >"pg_catalog.pg_tablespace_location(t.oid) AS > spclocation " >"FROM pg_catalog.pg_database d " >" LEFT OUTER JOIN pg_catalog.pg_tablespace t " > --- a/src/bin/psql/describe.c > +++ b/src/bin/psql/describe.c > @@ -896,6 +896,18 @@ listAllDbs(const char *pattern, bool verbose) > gettext_noop("Encoding"), > gettext_noop("Collate"), > gettext_noop("Ctype")); > + if (pset.sversion >= 15) > + appendPQExpBuffer(, > + " d.daticucoll as > \"%s\",\n" > + " CASE > d.datcollprovider WHEN 'c' THEN 'libc' WHEN 'i' THEN 'icu' END AS \"%s\",\n", > + gettext_noop("ICU Collation"), > + gettext_noop("Coll. > Provider")); > + else > + appendPQExpBuffer(, > + " d.datcollate as > \"%s\",\n" > + " 'libc' AS \"%s\",\n", > + gettext_noop("ICU Collation"), > + gettext_noop("Coll. > Provider")); > appendPQExpBufferStr(, " "); > printACLColumn(, "d.datacl"); > if (verbose) > @@ -4617,6 +4629,15 @@ listCollations(const char *pattern, bool verbose, bool > showSystem) > gettext_noop("Collate"), > gettext_noop("Ctype")); > > + if (pset.sversion >= 15) > + appendPQExpBuffer(, > + ",\n c.collicucoll AS > \"%s\"", > + gettext_noop("ICU > Collation")); > + else > + appendPQExpBuffer(, > + ",\n c.collcollate AS > \"%s\"", > + gettext_noop("ICU > Collation")); > + > if (pset.sversion >= 10) > appendPQExpBuffer(, > ",\n CASE > c.collprovider WHEN 'd' THEN 'default' WHEN 'c' THEN 'libc' WHEN 'i' THEN > 'icu' END AS \"%s\"",
Re: ICU for global collation
Hi, On 2022-03-17 14:14:52 +0100, Peter Eisentraut wrote: > committed, thanks Just noticed that this adds a new warning when building with -O3: In file included from /home/andres/src/postgresql/src/include/catalog/pg_collation.h:22, from /home/andres/src/postgresql/src/backend/commands/dbcommands.c:39: In function ‘collprovider_name’, inlined from ‘createdb’ at /home/andres/src/postgresql/src/backend/commands/dbcommands.c:514:4: ../../../src/include/catalog/pg_collation_d.h:47:9: warning: ‘src_locprovider’ may be used uninitialized [-Wmaybe-uninitialized] 47 | switch (c) | ^~ /home/andres/src/postgresql/src/backend/commands/dbcommands.c: In function ‘createdb’: /home/andres/src/postgresql/src/backend/commands/dbcommands.c:112:25: note: ‘src_locprovider’ was declared here 112 | charsrc_locprovider; | ^~~ I'd fixed that for nearby variables in 3f6b3be39ca9... Gonna just NULL initialize it as well. Greetings, Andres Freund
Re: ICU for global collation
On Thu, Mar 17, 2022 at 02:14:52PM +0100, Peter Eisentraut wrote: > On 17.03.22 13:01, Shinoda, Noriyoshi (PN Japan FSIP) wrote: > > Thank you to all the developers. > > I found that the description of the pg_database.daticulocale column was not > > written in the documentation. > > The attached small patch adds a description of the daticulocale column to > > catalogs.sgml. > > committed, thanks Thanks a lot both! Glad to finally have that feature, as soon as we'll fix the few reported problems.
Re: ICU for global collation
Hi, On 2022-03-17 14:14:52 +0100, Peter Eisentraut wrote: > On 17.03.22 13:01, Shinoda, Noriyoshi (PN Japan FSIP) wrote: > > Thank you to all the developers. > > I found that the description of the pg_database.daticulocale column was not > > written in the documentation. > > The attached small patch adds a description of the daticulocale column to > > catalogs.sgml. > > committed, thanks Wee! That's a long time weakness addressed... Just saw a weird failure after rebasing my meson branch ontop of this. Tests passed on debian, suse, centos 8 stream, fedora rawhide (failed due to an independent reason), but not on centos 7. all runs: https://cirrus-ci.com/build/5190538184884224 centos 7: https://cirrus-ci.com/task/4786632883699712?logs=tests_world#L204 centos 7 failure: https://api.cirrus-ci.com/v1/artifact/task/4786632883699712/log/build/testrun/icu/t/010_database/log/regress_log_010_database not ok 1 - sort by database default locale # Failed test 'sort by database default locale' # at /tmp/cirrus-ci-build/src/test/icu/t/010_database.pl line 28. # got: 'a # A # b # B' # expected: 'A # a # B # b' ok 2 - sort by explicit collation standard not ok 3 - sort by explicit collation upper first # Failed test 'sort by explicit collation upper first' # at /tmp/cirrus-ci-build/src/test/icu/t/010_database.pl line 42. # got: 'a # A # b # B' # expected: 'A # a # B # b' ok 4 - ICU locale must be specified for ICU provider: exit code not 0 ok 5 - ICU locale must be specified for ICU provider: error message 1..5 This is a run building with meson. But I've now triggered builds with autoconf on centos 7 as well and that also failed. See https://cirrus-ci.com/task/6194007767252992?logs=test_world#L378 So it looks like older ICU versions don't work? Greetings, Andres Freund PS: I had not yet passed with_icu in the initdb tests for meson, that's why there's two failures with autoconf but only one with meson.
Re: ICU for global collation
On Thu, Mar 17, 2022 at 6:15 AM Peter Eisentraut wrote: > committed, thanks Glad that this finally happened. Thanks to everybody involved! -- Peter Geoghegan
Re: ICU for global collation
On 17.03.22 13:01, Shinoda, Noriyoshi (PN Japan FSIP) wrote: Thank you to all the developers. I found that the description of the pg_database.daticulocale column was not written in the documentation. The attached small patch adds a description of the daticulocale column to catalogs.sgml. committed, thanks
RE: ICU for global collation
Hi, Thank you to all the developers. I found that the description of the pg_database.daticulocale column was not written in the documentation. The attached small patch adds a description of the daticulocale column to catalogs.sgml. Regards, Noriyoshi Shinoda -Original Message- From: Peter Eisentraut Sent: Thursday, March 17, 2022 7:29 PM To: Julien Rouhaud Cc: pgsql-hackers ; Daniel Verite Subject: Re: ICU for global collation On 15.03.22 08:41, Julien Rouhaud wrote: >>>> The locale object in ICU is an identifier that specifies a >>>> particular locale and has fields for language, country, and an >>>> optional code to specify further variants or subdivisions. These >>>> fields also can be represented as a string with the fields separated by an >>>> underscore. >> >> I think the Localization chapter needs to be reorganized a bit, but >> I'll leave that for a separate patch. > > WFM. I ended up writing a bit of content for that chapter. >>> While on that topic, the doc should probably mention that default >>> ICU collations can only be deterministic. >> >> Well, there is no option to do otherwise, so I'm not sure where/how >> to mention that. We usually don't document options that don't exist. >> ;-) > > Sure, but I'm afraid that users may still be tempted to use ICU > locales like > und-u-ks-level2 from the case_insensitive example in the doc and hope > that it will work accordingly. Or maybe it's just me that still sees > ICU as dark magic and want to be extra cautious. I have added this to the CREATE DATABASE ref page. > Unrelated, but I just realized that we have PGC_INTERNAL gucs for > lc_ctype and lc_collate. Should we add one for icu_locale too? I'm not sure. I think the existing ones are more for backward compatibility with the time before it was settable per database. >>> in AlterCollation(), pg_collation_actual_version(), >>> AlterDatabaseRefreshColl() and pg_database_collation_actual_version(): >>> >>> - datum = SysCacheGetAttr(COLLOID, tup, Anum_pg_collation_collcollate, >>> ); >>> - Assert(!isnull); >>> - newversion = get_collation_actual_version(collForm->collprovider, >>> TextDatumGetCString(datum)); >>> + datum = SysCacheGetAttr(COLLOID, tup, collForm->collprovider == >>> COLLPROVIDER_ICU ? Anum_pg_collation_colliculocale : >>> Anum_pg_collation_collcollate, ); >>> + if (!isnull) >>> + newversion = get_collation_actual_version(collForm->collprovider, >>> TextDatumGetCString(datum)); >>> + else >>> + newversion = NULL; >>> >>> The columns are now nullable, but can you actually end up with a >>> null locale name in the expected field without manual DML on the >>> catalog, corruption or similar? I think it should be a plain error >>> explaining the inconsistency rather then silently assuming there's >>> no version. Note that at least >>> pg_newlocale_from_collation() asserts that the specific libc/icu >>> field it's interested in isn't null. >> >> This is required because the default collations's fields are null now. > > Yes I saw that, but that's a specific exception. Detecting whether > it's the DEFAULT_COLLATION_OID or not and raise an error when a null > value isn't expected seems like it could be worthwhile. I have fixed that as you suggest. > So apart from the few details mentioned above I'm happy with this patch! committed! pg_database_iculocale_v1.diff Description: pg_database_iculocale_v1.diff
Re: ICU for global collation
On 15.03.22 08:41, Julien Rouhaud wrote: The locale object in ICU is an identifier that specifies a particular locale and has fields for language, country, and an optional code to specify further variants or subdivisions. These fields also can be represented as a string with the fields separated by an underscore. I think the Localization chapter needs to be reorganized a bit, but I'll leave that for a separate patch. WFM. I ended up writing a bit of content for that chapter. While on that topic, the doc should probably mention that default ICU collations can only be deterministic. Well, there is no option to do otherwise, so I'm not sure where/how to mention that. We usually don't document options that don't exist. ;-) Sure, but I'm afraid that users may still be tempted to use ICU locales like und-u-ks-level2 from the case_insensitive example in the doc and hope that it will work accordingly. Or maybe it's just me that still sees ICU as dark magic and want to be extra cautious. I have added this to the CREATE DATABASE ref page. Unrelated, but I just realized that we have PGC_INTERNAL gucs for lc_ctype and lc_collate. Should we add one for icu_locale too? I'm not sure. I think the existing ones are more for backward compatibility with the time before it was settable per database. in AlterCollation(), pg_collation_actual_version(), AlterDatabaseRefreshColl() and pg_database_collation_actual_version(): - datum = SysCacheGetAttr(COLLOID, tup, Anum_pg_collation_collcollate, ); - Assert(!isnull); - newversion = get_collation_actual_version(collForm->collprovider, TextDatumGetCString(datum)); + datum = SysCacheGetAttr(COLLOID, tup, collForm->collprovider == COLLPROVIDER_ICU ? Anum_pg_collation_colliculocale : Anum_pg_collation_collcollate, ); + if (!isnull) + newversion = get_collation_actual_version(collForm->collprovider, TextDatumGetCString(datum)); + else + newversion = NULL; The columns are now nullable, but can you actually end up with a null locale name in the expected field without manual DML on the catalog, corruption or similar? I think it should be a plain error explaining the inconsistency rather then silently assuming there's no version. Note that at least pg_newlocale_from_collation() asserts that the specific libc/icu field it's interested in isn't null. This is required because the default collations's fields are null now. Yes I saw that, but that's a specific exception. Detecting whether it's the DEFAULT_COLLATION_OID or not and raise an error when a null value isn't expected seems like it could be worthwhile. I have fixed that as you suggest. So apart from the few details mentioned above I'm happy with this patch! committed!
Re: ICU for global collation
On 15.03.22 18:28, Robert Haas wrote: On Tue, Mar 15, 2022 at 12:58 PM Peter Eisentraut wrote: On 14.03.22 19:57, Robert Haas wrote: 1. What will happen if I set the ICU collation to something that doesn't match the libc collation? How bad are the consequences? These are unrelated, so there are no consequences. Can you please elaborate on this? The code that is aware of ICU generally works like this: if (locale_provider == ICU) result = call ICU code else result = call libc code return result However, there is code out there, both within PostgreSQL itself and in extensions, that does not do that yet. Ideally, we would eventually change all that over, but it's not happening now. So we ought to preserve the ability to set the libc to keep that legacy code working for now. This legacy code by definition doesn't know about ICU, so it doesn't care whether the ICU setting "matches" the libc setting or anything like that. It will just do its thing depending on its own setting. The only consequence of settings that don't match is that the different pieces of code behave semantically inconsistently (e.g., some routine thinks the data is Greek and other code thinks the data is French). But that's up to the user to set correctly. And the actual scenarios where you can actually do anything semantically relevant this way are very limited. A second point is that the LC_CTYPE setting tells other parts of libc what the current encoding is. This affects gettext for example. So you need to set this to something sensible even if you don't use libc locale routines otherwise. 2. If I want to avoid a mismatch between the two, then I will need a way to figure out which libc collation corresponds to a given ICU collation. How do I do that? You can specify the same name for both. Hmm. If every name were valid in both systems, I don't think you'd be proposing two fields. Earlier versions of this patch and predecessor patches indeed had common fields. But in fact the two systems accept different values if you want to delve into the advanced features. But for basic usage something like "en_US" will work for both.
Re: ICU for global collation
Finnerty, Jim wrote: > In ICU, the "locale" is just the first part of what we can pass to the > "locale" parameter in CREATE COLLATION - the part before the optional '@' > delimiter. The ICU locale does not include the secondary or tertiary > properties, Why not? Please see https://unicode-org.github.io/icu/userguide/locale It says that "a locale consists of one or more pieces of ordered information", the pieces being a language code, a script code, a country code, a variant code, and keywords. Best regards, -- Daniel Vérité https://postgresql.verite.pro/ Twitter: @DanielVerite
Re: ICU for global collation
Julien Rouhaud wrote: > > > While on that topic, the doc should probably mention that default ICU > > > collations can only be deterministic. > > > > Well, there is no option to do otherwise, so I'm not sure where/how to > > mention that. We usually don't document options that don't exist. ;-) > > Sure, but I'm afraid that users may still be tempted to use ICU locales like > und-u-ks-level2 from the case_insensitive example in the doc and hope that > it will work accordingly. +1. The CREATE DATABASE doc says this currently: icu_locale Specifies the ICU locale ID if the ICU locale provider is used. ISTM that we need to say explicitly that this locale will be used by default to compare all collatable strings, except that it's overruled by a bytewise comparison to break ties in case of equality. The idea is to describe what the backend will do with the setting rather than saying that we don't have a nondeterministic option. Best regards, -- Daniel Vérité https://postgresql.verite.pro/ Twitter: @DanielVerite
Re: ICU for global collation
Can we get some more consistent terminology around the term "locale"? In ICU, the "locale" is just the first part of what we can pass to the "locale" parameter in CREATE COLLATION - the part before the optional '@' delimiter. The ICU locale does not include the secondary or tertiary properties, so it is usually just the country and the language, e.g. en_US (or en-US), but it can also be something like es_TRADITIONAL for traditional Spanish. I think it would be an improvement in clarity if we consistently use the term 'locale' to mean the same thing that ICU means by that term, and not to have the thing that we call the "locale" also include collation modifiers, or to imply that a locale is the same thing as a collation. /Jim
Re: ICU for global collation
On Tue, Mar 15, 2022 at 12:58 PM Peter Eisentraut wrote: > On 14.03.22 19:57, Robert Haas wrote: > > 1. What will happen if I set the ICU collation to something that > > doesn't match the libc collation? How bad are the consequences? > > These are unrelated, so there are no consequences. Can you please elaborate on this? > > 2. If I want to avoid a mismatch between the two, then I will need a > > way to figure out which libc collation corresponds to a given ICU > > collation. How do I do that? > > You can specify the same name for both. Hmm. If every name were valid in both systems, I don't think you'd be proposing two fields. -- Robert Haas EDB: http://www.enterprisedb.com
Re: ICU for global collation
On 14.03.22 19:57, Robert Haas wrote: 1. What will happen if I set the ICU collation to something that doesn't match the libc collation? How bad are the consequences? These are unrelated, so there are no consequences. 2. If I want to avoid a mismatch between the two, then I will need a way to figure out which libc collation corresponds to a given ICU collation. How do I do that? You can specify the same name for both.
Re: ICU for global collation
On Mon, Mar 14, 2022 at 01:50:50PM +0100, Peter Eisentraut wrote: > On 05.03.22 09:38, Julien Rouhaud wrote: > > I say it works because I did manually check, as far as I can see there isn't > > any test that ensures it. > > > > I'm using this naive scenario: > > > > DROP DATABASE IF EXISTS dbicu; > > CREATE DATABASE dbicu LOCALE_PROVIDER icu LOCALE 'en_US' ICU_LOCALE > > 'en-u-kf-upper' template 'template0'; > > \c dbicu > > CREATE COLLATION upperfirst (provider = icu, locale = 'en-u-kf-upper'); > > CREATE TABLE icu(def text, en text COLLATE "en_US", upfirst text COLLATE > > upperfirst); > > INSERT INTO icu VALUES ('a', 'a', 'a'), ('b','b','b'), ('A','A','A'), > > ('B','B','B'); > > SELECT def AS def FROM icu ORDER BY def; > > SELECT def AS en FROM icu ORDER BY en; > > SELECT def AS upfirst FROM icu ORDER BY upfirst; > > SELECT def AS upfirst_explicit FROM icu ORDER BY en COLLATE upperfirst; > > SELECT def AS en_x_explicit FROM icu ORDER BY def COLLATE "en-x-icu"; > > > > Maybe there should be some test along those lines included in the patch? > > I added something like this to a new test suite under src/test/icu/. > (src/test/locale/ was already used for something else.) Great, thanks! > > > The locale object in ICU is an identifier that specifies a particular > > > locale > > > and has fields for language, country, and an optional code to specify > > > further > > > variants or subdivisions. These fields also can be represented as a string > > > with the fields separated by an underscore. > > I think the Localization chapter needs to be reorganized a bit, but I'll > leave that for a separate patch. WFM. > > I spent some time looking at the ICU api trying to figure out if using a > > posix locale name (e.g. en_US) was actually compatible with an ICU locale > > name. > > It seems that ICU accept any of 'en-us', 'en-US', 'en_us' or 'en_US' as the > > same locale, but I might be wrong. I also didn't find a way to figure out > > how > > to ask ICU if the locale identifier passed is complete garbage or not. One > > sure thing is that the system collation we import are of the form 'en-us', > > so > > it seems weird to have this form in pg_collation and by default another > > form in > > pg_database. > > Yeah it seems to be inconsistent about that. The locale ID documentation > appears to indicate that "en_US" is the canonical form, but when you ask it > to list all the locales it knows about it returns "en-US". Yeah I saw that too when checking is POSIX locale names were valid, and that's not great. > > > In CREATE DATABASE manual: > > > > +Specifies the provider to use for the default collation in this > > +database. Possible values are: > > + > > icu,ICU > > +libc. libc is the default. > > The > > +available choices depend on the operating system and build options. > > > > That's actually not true as pg_strcasecmp is used in createdb(): > > I don't understand what the concern is here. Ah sorry I missed the , I thought the possible values listed were icu, ICU and libc. Shouldn't the ICU be before the icu rather than the libc, or at least before the comma? > > Yeah, I think it is not the job of the initdb man page to lecture people > about the merits of their locale choice. The same performance warning can > also be found in the localization chapter; we don't need to repeat it > everywhere a locale choice is mentioned. Ah, I tried to look for another place where the same warning could be found but missed it. I'm fine with it then! > > While on that topic, the doc should probably mention that default ICU > > collations can only be deterministic. > > Well, there is no option to do otherwise, so I'm not sure where/how to > mention that. We usually don't document options that don't exist. ;-) Sure, but I'm afraid that users may still be tempted to use ICU locales like und-u-ks-level2 from the case_insensitive example in the doc and hope that it will work accordingly. Or maybe it's just me that still sees ICU as dark magic and want to be extra cautious. Unrelated, but I just realized that we have PGC_INTERNAL gucs for lc_ctype and lc_collate. Should we add one for icu_locale too? > > Also unrelated, but how about raising a warning about possibly hiding > > corruption if you use CREATE COLLATION ... VERSION or CREATE DATABASE ... > > COLLATION_VERSION if !IsBinaryUpgrade()? > > Hmm, there is a point to that. But then we should just make that an error. > Otherwise, we raise the warning but then there is no way to "fix" what was > warned about. Seems confusing. I was afraid that an error was a bit too much, but if that's acceptable I agree that it's better. > > in AlterCollation(), pg_collation_actual_version(), > > AlterDatabaseRefreshColl() > > and pg_database_collation_actual_version(): > > > > - datum = SysCacheGetAttr(COLLOID, tup, Anum_pg_collation_collcollate, > > ); > > - Assert(!isnull); > > - newversion =
Re: ICU for global collation
On Mon, Mar 14, 2022 at 8:51 AM Peter Eisentraut wrote: > [ new patches ] I'm not very knowledgeable about this topic, but to me, it seems confusing to think of having both a libc collation and an ICU collation associated with a database. I have two main questions: 1. What will happen if I set the ICU collation to something that doesn't match the libc collation? How bad are the consequences? 2. If I want to avoid a mismatch between the two, then I will need a way to figure out which libc collation corresponds to a given ICU collation. How do I do that? I have a sneaking suspicion that I'm not going to like the answer to question #2 very much, but maybe that's life. I feel like this whole area is far too full of magical things. True practitioners know that if you utter some mysterious incantation to the Lords of ICU, you can get exactly the behavior you want. And ... everyone else has no idea what to do. -- Robert Haas EDB: http://www.enterprisedb.com
Re: ICU for global collation
On 05.03.22 09:38, Julien Rouhaud wrote: I say it works because I did manually check, as far as I can see there isn't any test that ensures it. I'm using this naive scenario: DROP DATABASE IF EXISTS dbicu; CREATE DATABASE dbicu LOCALE_PROVIDER icu LOCALE 'en_US' ICU_LOCALE 'en-u-kf-upper' template 'template0'; \c dbicu CREATE COLLATION upperfirst (provider = icu, locale = 'en-u-kf-upper'); CREATE TABLE icu(def text, en text COLLATE "en_US", upfirst text COLLATE upperfirst); INSERT INTO icu VALUES ('a', 'a', 'a'), ('b','b','b'), ('A','A','A'), ('B','B','B'); SELECT def AS def FROM icu ORDER BY def; SELECT def AS en FROM icu ORDER BY en; SELECT def AS upfirst FROM icu ORDER BY upfirst; SELECT def AS upfirst_explicit FROM icu ORDER BY en COLLATE upperfirst; SELECT def AS en_x_explicit FROM icu ORDER BY def COLLATE "en-x-icu"; Maybe there should be some test along those lines included in the patch? I added something like this to a new test suite under src/test/icu/. (src/test/locale/ was already used for something else.) Also, it's not even used consistently in the patch. I can see e.g. "ICU locale" or "ICU locale setting" being used: I have fixed those. Maybe we could point to the ICU documentation for a clear notion of what a locale ID is, and/or use their own short definition [1]: The locale object in ICU is an identifier that specifies a particular locale and has fields for language, country, and an optional code to specify further variants or subdivisions. These fields also can be represented as a string with the fields separated by an underscore. I think the Localization chapter needs to be reorganized a bit, but I'll leave that for a separate patch. I spent some time looking at the ICU api trying to figure out if using a posix locale name (e.g. en_US) was actually compatible with an ICU locale name. It seems that ICU accept any of 'en-us', 'en-US', 'en_us' or 'en_US' as the same locale, but I might be wrong. I also didn't find a way to figure out how to ask ICU if the locale identifier passed is complete garbage or not. One sure thing is that the system collation we import are of the form 'en-us', so it seems weird to have this form in pg_collation and by default another form in pg_database. Yeah it seems to be inconsistent about that. The locale ID documentation appears to indicate that "en_US" is the canonical form, but when you ask it to list all the locales it knows about it returns "en-US". In CREATE DATABASE manual: +Specifies the provider to use for the default collation in this +database. Possible values are: +icu,ICU +libc. libc is the default. The +available choices depend on the operating system and build options. That's actually not true as pg_strcasecmp is used in createdb(): + if (pg_strcasecmp(locproviderstr, "icu") == 0) + dblocprovider = COLLPROVIDER_ICU; + else if (pg_strcasecmp(locproviderstr, "libc") == 0) + dblocprovider = COLLPROVIDER_LIBC; + else + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), +errmsg("unrecognized locale provider: %s", + locproviderstr))); I don't understand what the concern is here. Unless I'm missing something you entirely removed the warninng about the performance penalty when using non C/POSIX locale? Yeah, I think it is not the job of the initdb man page to lecture people about the merits of their locale choice. The same performance warning can also be found in the localization chapter; we don't need to repeat it everywhere a locale choice is mentioned. initdb has some specific processing to transform the default libc locale to something more appropriate, but as far as I can see creatdb / CREATE DATABASE aren't doing that. It seems inconsistent, and IMHO another reason why defaulting to the libc locale looks like a bad idea. This has all been removed. The separate ICU locale option should now be required everywhere (initdb, createdb, CREATE DATABASE). While on that topic, the doc should probably mention that default ICU collations can only be deterministic. Well, there is no option to do otherwise, so I'm not sure where/how to mention that. We usually don't document options that don't exist. ;-) Also unrelated, but how about raising a warning about possibly hiding corruption if you use CREATE COLLATION ... VERSION or CREATE DATABASE ... COLLATION_VERSION if !IsBinaryUpgrade()? Hmm, there is a point to that. But then we should just make that an error. Otherwise, we raise the warning but then there is no way to "fix" what was warned about. Seems confusing. in AlterCollation(), pg_collation_actual_version(), AlterDatabaseRefreshColl() and pg_database_collation_actual_version(): - datum = SysCacheGetAttr(COLLOID, tup, Anum_pg_collation_collcollate, ); - Assert(!isnull); - newversion =
Re: ICU for global collation
On Thu, Mar 10, 2022 at 10:52:41AM +0100, Peter Eisentraut wrote: > On 05.03.22 09:38, Julien Rouhaud wrote: > > @@ -168,18 +175,6 @@ DefineCollation(ParseState *pstate, List *names, List > > *parameters, bool if_not_e > > errmsg("collation \"default\" cannot be copied"))); > > } > > > > - if (localeEl) > > - { > > - collcollate = defGetString(localeEl); > > - collctype = defGetString(localeEl); > > - } > > [...] > > > > I tried to read the function and quickly got confused about whether possible > > problematic conditions could be reached or not and had protection or not. I > > think that DefineCollation is becoming more and more unreadable, with a mix > > of > > fromEl, !fromEl and either. Maybe it would be a good time to refactor the > > code > > a bit and have have something like > > > > if (fromEl) > > { > > // initialize all specific things > > } > > else > > { > > // initialize all specific things > > } > > > > // handle defaults and sanity checks > > > > What do you think? > > How about the attached? Thanks! That's exactly what I had in mind. I think it's easier to follow and make sure of what it's doing exactly, so +1 for it.
Re: ICU for global collation
On 05.03.22 09:38, Julien Rouhaud wrote: @@ -168,18 +175,6 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e errmsg("collation \"default\" cannot be copied"))); } - if (localeEl) - { - collcollate = defGetString(localeEl); - collctype = defGetString(localeEl); - } [...] I tried to read the function and quickly got confused about whether possible problematic conditions could be reached or not and had protection or not. I think that DefineCollation is becoming more and more unreadable, with a mix of fromEl, !fromEl and either. Maybe it would be a good time to refactor the code a bit and have have something like if (fromEl) { // initialize all specific things } else { // initialize all specific things } // handle defaults and sanity checks What do you think? How about the attached? From c7b408ba646c5ee5f8c6ae84bec04ca4059ed08f Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Thu, 10 Mar 2022 10:49:49 +0100 Subject: [PATCH] DefineCollation() code cleanup Reorganize the code in DefineCollation() so that the parts using the FROM clause and the parts not doing so are more cleanly separated. No functionality change intended. Reported-by: Julien Rouhaud --- src/backend/commands/collationcmds.c | 109 ++- 1 file changed, 57 insertions(+), 52 deletions(-) diff --git a/src/backend/commands/collationcmds.c b/src/backend/commands/collationcmds.c index 12fc2316f9..93df1d366c 100644 --- a/src/backend/commands/collationcmds.c +++ b/src/backend/commands/collationcmds.c @@ -63,12 +63,11 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e DefElem*providerEl = NULL; DefElem*deterministicEl = NULL; DefElem*versionEl = NULL; - char *collcollate = NULL; - char *collctype = NULL; - char *collproviderstr = NULL; - boolcollisdeterministic = true; - int collencoding = 0; - charcollprovider = 0; + char *collcollate; + char *collctype; + boolcollisdeterministic; + int collencoding; + charcollprovider; char *collversion = NULL; Oid newoid; ObjectAddress address; @@ -167,65 +166,71 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), errmsg("collation \"default\" cannot be copied"))); } - - if (localeEl) + else { - collcollate = defGetString(localeEl); - collctype = defGetString(localeEl); - } + char *collproviderstr = NULL; - if (lccollateEl) - collcollate = defGetString(lccollateEl); + collcollate = NULL; + collctype = NULL; - if (lcctypeEl) - collctype = defGetString(lcctypeEl); + if (localeEl) + { + collcollate = defGetString(localeEl); + collctype = defGetString(localeEl); + } - if (providerEl) - collproviderstr = defGetString(providerEl); + if (lccollateEl) + collcollate = defGetString(lccollateEl); - if (deterministicEl) - collisdeterministic = defGetBoolean(deterministicEl); + if (lcctypeEl) + collctype = defGetString(lcctypeEl); - if (versionEl) - collversion = defGetString(versionEl); + if (providerEl) + collproviderstr = defGetString(providerEl); - if (collproviderstr) - { - if (pg_strcasecmp(collproviderstr, "icu") == 0) - collprovider = COLLPROVIDER_ICU; - else if (pg_strcasecmp(collproviderstr, "libc") == 0) - collprovider = COLLPROVIDER_LIBC; + if (deterministicEl) + collisdeterministic = defGetBoolean(deterministicEl); + else + collisdeterministic = true; + + if (versionEl) + collversion = defGetString(versionEl); + + if (collproviderstr) + { + if (pg_strcasecmp(collproviderstr, "icu") == 0) + collprovider = COLLPROVIDER_ICU; + else if (pg_strcasecmp(collproviderstr, "libc") == 0) + collprovider = COLLPROVIDER_LIBC; + else + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), +
Re: ICU for global collation
Hi, On Wed, Feb 16, 2022 at 03:25:40PM +0100, Peter Eisentraut wrote: > > All that preliminary work has been completed, so here is a new patch. > > There isn't actually much left here now except all the new DDL and > command-line options to set this up and documentation for those. I have > given all that another review and I hope it's more intuitive now, but I > guess there will be other opinions. Sorry it took me a bit of time to come back to this patch. TL;DR it works as expected. I just have a few comments, mostly on the doc. I say it works because I did manually check, as far as I can see there isn't any test that ensures it. I'm using this naive scenario: DROP DATABASE IF EXISTS dbicu; CREATE DATABASE dbicu LOCALE_PROVIDER icu LOCALE 'en_US' ICU_LOCALE 'en-u-kf-upper' template 'template0'; \c dbicu CREATE COLLATION upperfirst (provider = icu, locale = 'en-u-kf-upper'); CREATE TABLE icu(def text, en text COLLATE "en_US", upfirst text COLLATE upperfirst); INSERT INTO icu VALUES ('a', 'a', 'a'), ('b','b','b'), ('A','A','A'), ('B','B','B'); SELECT def AS def FROM icu ORDER BY def; SELECT def AS en FROM icu ORDER BY en; SELECT def AS upfirst FROM icu ORDER BY upfirst; SELECT def AS upfirst_explicit FROM icu ORDER BY en COLLATE upperfirst; SELECT def AS en_x_explicit FROM icu ORDER BY def COLLATE "en-x-icu"; Maybe there should be some test along those lines included in the patch? > I have changed the terminology a bit to match ICU better. It's now called > "ICU locale ID" and "locale provider" (instead of "collation"). It might > actually cover things that are not strictly collations (such as the isalpha > stuff in text search, maybe, in the future). I'm not sure that's actually such an improvement as-is. Simply saying "ICU locale ID" is, at least for me, somewhat ambiguous as I don't see any place in our docs where it's clearly defined. I'm afraid that users might confuse it with the OID of a pg_collation line, or even a collation name (like en-x-icu), especially since there's no simple way to know if what you used means what you thought it meant. Also, it's not even used consistently in the patch. I can see e.g. "ICU locale" or "ICU locale setting" being used: + + icu_locale + + +Specifies the ICU locale if the ICU locale provider is used. If +this is not specified, the value from the LOCALE +option is used. + + + + --icu-locale=locale + + +Specifies the ICU locale setting to be used in this database, if the +ICU locale provider is selected. + Maybe we could point to the ICU documentation for a clear notion of what a locale ID is, and/or use their own short definition [1]: > The locale object in ICU is an identifier that specifies a particular locale > and has fields for language, country, and an optional code to specify further > variants or subdivisions. These fields also can be represented as a string > with the fields separated by an underscore. It seems critical to be crystal clear about what should be specified there. I spent some time looking at the ICU api trying to figure out if using a posix locale name (e.g. en_US) was actually compatible with an ICU locale name. It seems that ICU accept any of 'en-us', 'en-US', 'en_us' or 'en_US' as the same locale, but I might be wrong. I also didn't find a way to figure out how to ask ICU if the locale identifier passed is complete garbage or not. One sure thing is that the system collation we import are of the form 'en-us', so it seems weird to have this form in pg_collation and by default another form in pg_database. All in all I'm a bit worried of having the icu_locale optional. Note that this is inconsistent with createdb(), as there's at least one code path where the icu_locale is not optional: + if (dblocprovider == COLLPROVIDER_ICU) + { + /* +* This would happen if template0 uses the libc provider but the new +* database uses icu. +*/ + if (!dbiculocale) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), +errmsg("ICU locale must be specified"))); + } > > One thing that is left that bothers me is that the invocations of > get_collation_actual_version() have all gotten quite complicated. I'm > thinking about ways to refactor that, but I haven't got a great idea yet. Indeed, and I don't have a great idea either. Maybe get_collation_actual_version_extended that would accept both strings? In CREATE DATABASE manual: +Specifies the provider to use for the default collation in this +database. Possible values are: +icu,ICU +libc. libc is the default. The +available choices depend on the operating system and build options. That's actually not true as pg_strcasecmp is used in createdb(): + if (pg_strcasecmp(locproviderstr, "icu") == 0) + dblocprovider =
Re: ICU for global collation
On 02.02.22 14:01, Peter Eisentraut wrote: Here is the main patch rebased on the various changes that have been committed in the meantime. There is still some work to be done on the user interfaces of initdb, createdb, etc. I have split out the database-level collation version tracking into a separate patch [0]. I think we should get that one in first and then refresh this one. All that preliminary work has been completed, so here is a new patch. There isn't actually much left here now except all the new DDL and command-line options to set this up and documentation for those. I have given all that another review and I hope it's more intuitive now, but I guess there will be other opinions. I have changed the terminology a bit to match ICU better. It's now called "ICU locale ID" and "locale provider" (instead of "collation"). It might actually cover things that are not strictly collations (such as the isalpha stuff in text search, maybe, in the future). One thing that is left that bothers me is that the invocations of get_collation_actual_version() have all gotten quite complicated. I'm thinking about ways to refactor that, but I haven't got a great idea yet.From 6d265e2cf78546bcc25d03031ea03f397f1c1c1b Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Wed, 16 Feb 2022 15:11:04 +0100 Subject: [PATCH v5] Add option to use ICU as global locale provider This adds the option to use ICU as the default locale provider for either the whole cluster or a database. New options for initdb, createdb, and CREATE DATABASE are used to select this. Discussion: https://www.postgresql.org/message-id/flat/5e756dd6-0e91-d778-96fd-b1bcb06c161a%402ndquadrant.com XXX catversion bump --- doc/src/sgml/catalogs.sgml| 9 + doc/src/sgml/ref/create_database.sgml | 27 +++ doc/src/sgml/ref/createdb.sgml| 19 ++ doc/src/sgml/ref/initdb.sgml | 76 ++-- src/backend/catalog/pg_collation.c| 18 +- src/backend/commands/collationcmds.c | 96 +++ src/backend/commands/dbcommands.c | 162 +++--- src/backend/utils/adt/pg_locale.c | 144 ++-- src/backend/utils/init/postinit.c | 21 ++- src/bin/initdb/Makefile | 4 +- src/bin/initdb/initdb.c | 110 ++-- src/bin/initdb/t/001_initdb.pl| 25 +++ src/bin/pg_dump/pg_dump.c | 31 +++- src/bin/pg_upgrade/check.c| 13 ++ src/bin/pg_upgrade/info.c | 18 +- src/bin/pg_upgrade/pg_upgrade.h | 2 + src/bin/psql/describe.c | 23 ++- src/bin/psql/tab-complete.c | 3 +- src/bin/scripts/Makefile | 2 + src/bin/scripts/createdb.c| 20 +++ src/bin/scripts/t/020_createdb.pl | 28 +++ src/include/catalog/pg_collation.dat | 3 +- src/include/catalog/pg_collation.h| 20 ++- src/include/catalog/pg_database.dat | 4 +- src/include/catalog/pg_database.h | 6 + src/include/utils/pg_locale.h | 5 + .../regress/expected/collate.icu.utf8.out | 10 +- src/test/regress/sql/collate.icu.utf8.sql | 8 +- 28 files changed, 741 insertions(+), 166 deletions(-) diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml index 5a1627a394..8fde32dfac 100644 --- a/doc/src/sgml/catalogs.sgml +++ b/doc/src/sgml/catalogs.sgml @@ -2384,6 +2384,15 @@ pg_collation Columns + + + colliculocale text + + + ICU locale ID for this collation object + + + collversion text diff --git a/doc/src/sgml/ref/create_database.sgml b/doc/src/sgml/ref/create_database.sgml index f70d0c75b4..544c1cb443 100644 --- a/doc/src/sgml/ref/create_database.sgml +++ b/doc/src/sgml/ref/create_database.sgml @@ -28,6 +28,8 @@ [ LOCALE [=] locale ] [ LC_COLLATE [=] lc_collate ] [ LC_CTYPE [=] lc_ctype ] + [ ICU_LOCALE [=] icu_locale ] + [ LOCALE_PROVIDER [=] locale_provider ] [ COLLATION_VERSION = collation_version ] [ TABLESPACE [=] tablespace_name ] [ ALLOW_CONNECTIONS [=] allowconn ] @@ -160,6 +162,31 @@ Parameters + + icu_locale + + +Specifies the ICU locale if the ICU locale provider is used. If +this is not specified, the value from the LOCALE +option is used. + + + + + + locale_provider + + + +Specifies the provider to use for the default collation in this +database. Possible values are: +icu,ICU +libc. libc is the default. The +available choices depend on the operating system and build options. + + + +
Re: ICU for global collation
On 27.01.22 09:10, Peter Eisentraut wrote: On 21.01.22 17:13, Julien Rouhaud wrote: On Fri, Jan 21, 2022 at 03:24:02PM +0100, Peter Eisentraut wrote: On 21.01.22 14:51, Julien Rouhaud wrote: Is that change intended? There isn't any usage of the collversionstr before the possible error when actual_versionstr is missing. I wanted to move it closer to the SysCacheGetAttr() where the "datum" value is obtained. It seemed weird to get the datum, then do other things, then decode the datum. Oh ok. It won't make much difference performance-wise, so no objection. I have committed this and will provide follow-up patches in the next few days. Here is the main patch rebased on the various changes that have been committed in the meantime. There is still some work to be done on the user interfaces of initdb, createdb, etc. I have split out the database-level collation version tracking into a separate patch [0]. I think we should get that one in first and then refresh this one. [0]: https://www.postgresql.org/message-id/flat/f0ff3190-29a3-5b39-a179-fa32eee57db6%40enterprisedb.comFrom 4028980f6be3662c0302575ed92de77e941e5a9e Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Wed, 2 Feb 2022 13:54:04 +0100 Subject: [PATCH v4] Add option to use ICU as global collation provider This adds the option to use ICU as the default collation provider for either the whole cluster or a database. New options for initdb, createdb, and CREATE DATABASE are used to select this. Discussion: https://www.postgresql.org/message-id/flat/5e756dd6-0e91-d778-96fd-b1bcb06c161a%402ndquadrant.com --- doc/src/sgml/catalogs.sgml| 9 + doc/src/sgml/ref/create_database.sgml | 16 ++ doc/src/sgml/ref/createdb.sgml| 9 + doc/src/sgml/ref/initdb.sgml | 23 ++ src/backend/catalog/pg_collation.c| 18 +- src/backend/commands/collationcmds.c | 93 --- src/backend/commands/dbcommands.c | 72 +- src/backend/utils/adt/pg_locale.c | 242 +++--- src/backend/utils/init/postinit.c | 26 ++ src/bin/initdb/Makefile | 2 + src/bin/initdb/initdb.c | 64 - src/bin/initdb/t/001_initdb.pl| 18 +- src/bin/pg_dump/pg_dump.c | 19 ++ src/bin/pg_upgrade/check.c| 10 + src/bin/pg_upgrade/info.c | 18 +- src/bin/pg_upgrade/pg_upgrade.h | 2 + src/bin/psql/describe.c | 23 +- src/bin/psql/tab-complete.c | 2 +- src/bin/scripts/Makefile | 2 + src/bin/scripts/createdb.c| 9 + src/bin/scripts/t/020_createdb.pl | 20 +- src/include/catalog/pg_collation.dat | 3 +- src/include/catalog/pg_collation.h| 6 +- src/include/catalog/pg_database.dat | 4 +- src/include/catalog/pg_database.h | 6 + src/include/utils/pg_locale.h | 6 + .../regress/expected/collate.icu.utf8.out | 10 +- src/test/regress/sql/collate.icu.utf8.sql | 8 +- 28 files changed, 572 insertions(+), 168 deletions(-) diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml index 7d5b0b1656..5a5779b9a3 100644 --- a/doc/src/sgml/catalogs.sgml +++ b/doc/src/sgml/catalogs.sgml @@ -2384,6 +2384,15 @@ pg_collation Columns + + + collicucoll text + + + ICU collation string + + + collversion text diff --git a/doc/src/sgml/ref/create_database.sgml b/doc/src/sgml/ref/create_database.sgml index f22e28dc81..403010cddf 100644 --- a/doc/src/sgml/ref/create_database.sgml +++ b/doc/src/sgml/ref/create_database.sgml @@ -28,6 +28,7 @@ [ LOCALE [=] locale ] [ LC_COLLATE [=] lc_collate ] [ LC_CTYPE [=] lc_ctype ] + [ COLLATION_PROVIDER [=] collation_provider ] [ TABLESPACE [=] tablespace_name ] [ ALLOW_CONNECTIONS [=] allowconn ] [ CONNECTION LIMIT [=] connlimit ] @@ -158,6 +159,21 @@ Parameters + + + collation_provider + + + +Specifies the provider to use for the default collation in this +database. Possible values are: +icu,ICU +libc. libc is the default. The +available choices depend on the operating system and build options. + + + + tablespace_name diff --git a/doc/src/sgml/ref/createdb.sgml b/doc/src/sgml/ref/createdb.sgml index 86473455c9..4b07363fcc 100644 --- a/doc/src/sgml/ref/createdb.sgml +++ b/doc/src/sgml/ref/createdb.sgml @@ -83,6 +83,15 @@ Options + + --collation-provider={libc|icu} + + +Specifies the collation provider for the database's default
Re: ICU for global collation
On 21.01.22 17:13, Julien Rouhaud wrote: On Fri, Jan 21, 2022 at 03:24:02PM +0100, Peter Eisentraut wrote: On 21.01.22 14:51, Julien Rouhaud wrote: Is that change intended? There isn't any usage of the collversionstr before the possible error when actual_versionstr is missing. I wanted to move it closer to the SysCacheGetAttr() where the "datum" value is obtained. It seemed weird to get the datum, then do other things, then decode the datum. Oh ok. It won't make much difference performance-wise, so no objection. I have committed this and will provide follow-up patches in the next few days.
Re: ICU for global collation
On Fri, Jan 21, 2022 at 03:24:02PM +0100, Peter Eisentraut wrote: > On 21.01.22 14:51, Julien Rouhaud wrote: > > Is that change intended? There isn't any usage of the collversionstr before > > the possible error when actual_versionstr is missing. > > I wanted to move it closer to the SysCacheGetAttr() where the "datum" value > is obtained. It seemed weird to get the datum, then do other things, then > decode the datum. Oh ok. It won't make much difference performance-wise, so no objection.
Re: ICU for global collation
On 21.01.22 14:51, Julien Rouhaud wrote: From 1c46bf3138ad42074971aa3130142236de7e63f7 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Fri, 21 Jan 2022 10:01:25 +0100 Subject: [PATCH] Change collate and ctype fields to type text + collversionstr = TextDatumGetCString(datum); + actual_versionstr = get_collation_actual_version(collform->collprovider, collcollate); if (!actual_versionstr) { @@ -1606,7 +1616,6 @@ pg_newlocale_from_collation(Oid collid) (errmsg("collation \"%s\" has no actual version, but a version was specified", NameStr(collform->collname; } - collversionstr = TextDatumGetCString(collversion); Is that change intended? There isn't any usage of the collversionstr before the possible error when actual_versionstr is missing. I wanted to move it closer to the SysCacheGetAttr() where the "datum" value is obtained. It seemed weird to get the datum, then do other things, then decode the datum.
Re: ICU for global collation
Hi, On Fri, Jan 21, 2022 at 10:44:24AM +0100, Peter Eisentraut wrote: > > Here is a second preparation patch: Change collate and ctype fields to type > text. > > I think this is useful by itself because it allows longer locale names. ICU > locale names with several options appended can be longer than 63 bytes. > This case is probably broken right now. Also, it saves space in the typical > case, since most locale names are not anywhere near 63 bytes. I totally agree. > From 1c46bf3138ad42074971aa3130142236de7e63f7 Mon Sep 17 00:00:00 2001 > From: Peter Eisentraut > Date: Fri, 21 Jan 2022 10:01:25 +0100 > Subject: [PATCH] Change collate and ctype fields to type text + collversionstr = TextDatumGetCString(datum); + actual_versionstr = get_collation_actual_version(collform->collprovider, collcollate); if (!actual_versionstr) { @@ -1606,7 +1616,6 @@ pg_newlocale_from_collation(Oid collid) (errmsg("collation \"%s\" has no actual version, but a version was specified", NameStr(collform->collname; } - collversionstr = TextDatumGetCString(collversion); Is that change intended? There isn't any usage of the collversionstr before the possible error when actual_versionstr is missing. Apart from that the patch looks good to me, all tests pass and no issue with building the doc either.
Re: ICU for global collation
On 18.01.22 13:54, Peter Eisentraut wrote: On 18.01.22 05:02, Julien Rouhaud wrote: If this is applied, then in my estimation all these hunks will completely disappear from the global ICU patch. So this would be a significant win. Agreed, the patch will be quite smaller and also easier to review. Are you planning to apply it on its own or add it to the default ICU patchset? My plan is to apply this patch in the next few days. This patch has been committed. Here is a second preparation patch: Change collate and ctype fields to type text. I think this is useful by itself because it allows longer locale names. ICU locale names with several options appended can be longer than 63 bytes. This case is probably broken right now. Also, it saves space in the typical case, since most locale names are not anywhere near 63 bytes.From 1c46bf3138ad42074971aa3130142236de7e63f7 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Fri, 21 Jan 2022 10:01:25 +0100 Subject: [PATCH] Change collate and ctype fields to type text This changes the data type of the catalog fields datcollate, datctype, collcollate, and collctype from name to text. There wasn't ever a really good reason for them to be of type name; presumably this was just carried over from when they were fixed-size fields in pg_control, first into the corresponding pg_database fields, and then to pg_collation. The values are not identifiers or object names, and we don't ever look them up that way. Changing to type text saves space in the typical case, since locale names are typically less than 10 bytes long. But it is also possible that an ICU locale name with several customization options appended could be longer than 63 bytes, so this also enables that case, which was previously probably broken. Discussion: https://www.postgresql.org/message-id/flat/5e756dd6-0e91-d778-96fd-b1bcb06c1...@2ndquadrant.com --- doc/src/sgml/catalogs.sgml | 40 +-- src/backend/catalog/pg_collation.c | 10 ++- src/backend/commands/collationcmds.c | 41 +++- src/backend/commands/dbcommands.c| 21 ++ src/backend/utils/adt/pg_locale.c| 29 +--- src/backend/utils/init/postinit.c| 11 ++-- src/include/catalog/pg_collation.h | 4 +-- src/include/catalog/pg_database.h| 12 src/include/utils/pg_locale.h| 2 ++ 9 files changed, 104 insertions(+), 66 deletions(-) diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml index 1e65c426b2..7d5b0b1656 100644 --- a/doc/src/sgml/catalogs.sgml +++ b/doc/src/sgml/catalogs.sgml @@ -2368,7 +2368,7 @@ pg_collation Columns - collcollate name + collcollate text LC_COLLATE for this collation object @@ -2377,7 +2377,7 @@ pg_collation Columns - collctype name + collctype text LC_CTYPE for this collation object @@ -2951,24 +2951,6 @@ pg_database Columns - - - datcollate name - - - LC_COLLATE for this database - - - - - - datctype name - - - LC_CTYPE for this database - - - datistemplate bool @@ -3043,6 +3025,24 @@ pg_database Columns + + + datcollate text + + + LC_COLLATE for this database + + + + + + datctype text + + + LC_CTYPE for this database + + + datacl aclitem[] diff --git a/src/backend/catalog/pg_collation.c b/src/backend/catalog/pg_collation.c index 5be6600652..bfc02d3038 100644 --- a/src/backend/catalog/pg_collation.c +++ b/src/backend/catalog/pg_collation.c @@ -58,9 +58,7 @@ CollationCreate(const char *collname, Oid collnamespace, HeapTuple tup; Datum values[Natts_pg_collation]; boolnulls[Natts_pg_collation]; - NameDataname_name, - name_collate, - name_ctype; + NameDataname_name; Oid oid; ObjectAddress myself, referenced; @@ -163,10 +161,8 @@ CollationCreate(const char *collname, Oid collnamespace, values[Anum_pg_collation_collprovider - 1] = CharGetDatum(collprovider); values[Anum_pg_collation_collisdeterministic - 1] = BoolGetDatum(collisdeterministic); values[Anum_pg_collation_collencoding - 1] = Int32GetDatum(collencoding); - namestrcpy(_collate, collcollate); - values[Anum_pg_collation_collcollate - 1] = NameGetDatum(_collate); - namestrcpy(_ctype, collctype); - values[Anum_pg_collation_collctype - 1] = NameGetDatum(_ctype); + values[Anum_pg_collation_collcollate - 1] = CStringGetTextDatum(collcollate); + values[Anum_pg_collation_collctype -
Re: ICU for global collation
On 18.01.22 05:02, Julien Rouhaud wrote: If this is applied, then in my estimation all these hunks will completely disappear from the global ICU patch. So this would be a significant win. Agreed, the patch will be quite smaller and also easier to review. Are you planning to apply it on its own or add it to the default ICU patchset? My plan is to apply this patch in the next few days.
Re: ICU for global collation
Hi, On Thu, Jan 13, 2022 at 09:39:42AM +0100, Peter Eisentraut wrote: > On 11.01.22 12:08, Julien Rouhaud wrote: > > > So, unless there are concerns, I'm going to see about making a patch to > > > call > > > pg_newlocale_from_collation() even with the default collation. That would > > > make the actual feature patch quite a bit smaller, since we won't have to > > > patch every call site of pg_newlocale_from_collation(). > > +1 for me! > > Here is that patch. The patch is quite straightforward so I don't have much to say, it all looks good and passes all regression tests. > If this is applied, then in my estimation all these hunks will completely > disappear from the global ICU patch. So this would be a significant win. Agreed, the patch will be quite smaller and also easier to review. Are you planning to apply it on its own or add it to the default ICU patchset? Given the possible backpatch conflicts it can bring I'm not sure it's worthy enough to apply on its own.
Re: ICU for global collation
Hi, On Mon, Jan 17, 2022 at 07:07:38PM +, Finnerty, Jim wrote: > On 10.01.22 12:49, Daniel Verite wrote: > > > I think some users would want their db-wide ICU collation to be > > case/accent-insensitive. > ... > > IIRC, that was the context for some questions where people were > > enquiring about db-wide ICU collations. > > +1. There is the DEFAULT_COLLATION_OID, which is the cluster-level default > collation, a.k.a. the "global collation", as distinct from the "db-wide" > database-level default collation, which controls the default type of the > collatable types within that database. There's no cluster-level default collation, and DEFAULT_COLLATION_OID is always database-level (and template1 having a specific default collation). The template0 database is there to be able to support different databases with different default collations, among other things. So this patchset would allow per-database default ICU collation, although not non-deterministic ones.
Re: ICU for global collation
On 10.01.22 12:49, Daniel Verite wrote: > I think some users would want their db-wide ICU collation to be > case/accent-insensitive. ... > IIRC, that was the context for some questions where people were > enquiring about db-wide ICU collations. +1. There is the DEFAULT_COLLATION_OID, which is the cluster-level default collation, a.k.a. the "global collation", as distinct from the "db-wide" database-level default collation, which controls the default type of the collatable types within that database. > With the current patch, it's not possible, AFAICS, because the user > can't tell that the collation is non-deterministic. Presumably this > would require another option to CREATE DATABASE and another > column to store that bit of information. On 1/11/22, 6:24 AM, "Peter Eisentraut" wrote: > Adding this would be easy, but since pattern matching currently does not > support nondeterministic collations, if you make a global collation > nondeterministic, a lot of system views, psql, pg_dump queries etc. > would break, so it's not practical. I view this is an orthogonal > project. Once we can support this without breaking system views etc., > then it's easy to enable with a new column in pg_database. So this patch only enables the default cluster collation (DEFAULT_COLLATION_OID) to be a deterministic ICU collation, but doesn't extend the metadata in a way that would enable database collations to be ICU collations? Waiting for the pattern matching problem to be solved in general before creating the metadata to support ICU collations everywhere will make it more difficult for members of the community to help solve the pattern matching problem. What additional metadata changes would be required to enable an ICU collation to be specified at either the cluster-level or the database-level, even if new checks need to be added to disallow a nondeterministic collation to be specified at the cluster level for now?
Re: ICU for global collation
Re: >> After this patch goes in, the big next thing would be to support >> nondeterministic collations for LIKE, ILIKE and pattern matching operators in >> general. Is anyone interested in working on that? > As far as I know you're the last person that seemed to be working on that > topic > back in March :) I have a solution for LIKE and ILIKE for case-insensitive, accent-sensitive ICU collations and the UTF8 server encoding, but didn't attempt to address the general case until ICU collations were supported at the database and cluster levels. I may have another look at that after this patch goes in.
Re: ICU for global collation
On 11.01.22 12:08, Julien Rouhaud wrote: So, unless there are concerns, I'm going to see about making a patch to call pg_newlocale_from_collation() even with the default collation. That would make the actual feature patch quite a bit smaller, since we won't have to patch every call site of pg_newlocale_from_collation(). +1 for me! Here is that patch. If this is applied, then in my estimation all these hunks will completely disappear from the global ICU patch. So this would be a significant win.From 2ee4b77221020e81ec83cf37d36e3955bf619d80 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Thu, 13 Jan 2022 09:14:41 +0100 Subject: [PATCH] Call pg_newlocale_from_collation() also with default collation Previously, callers of pg_newlocale_from_collation() did not call it if the collation was DEFAULT_COLLATION_OID and instead proceeded with a pg_locale_t of 0. Instead, now we call it anyway and have it return 0 if the default collation was passed. It already did this, so we just have to adjust the callers. This simplifies all the call sites and also makes future enhancements easier. After discussion and testing, the previous comment in pg_locale.c about avoiding this for performance reasons may have been mistaken since it was testing a very different patch version way back when. Discussion: https://www.postgresql.org/message-id/ed3baa81-7fac-7788-cc12-41e3f7917...@enterprisedb.com --- src/backend/access/hash/hashfunc.c | 4 +- src/backend/regex/regc_pg_locale.c | 40 ++-- src/backend/utils/adt/formatting.c | 96 +--- src/backend/utils/adt/like.c | 37 ++- src/backend/utils/adt/like_support.c | 27 src/backend/utils/adt/pg_locale.c| 3 - src/backend/utils/adt/varchar.c | 26 +--- src/backend/utils/adt/varlena.c | 34 ++ 8 files changed, 135 insertions(+), 132 deletions(-) diff --git a/src/backend/access/hash/hashfunc.c b/src/backend/access/hash/hashfunc.c index 0521c69dd5..b57ed946c4 100644 --- a/src/backend/access/hash/hashfunc.c +++ b/src/backend/access/hash/hashfunc.c @@ -278,7 +278,7 @@ hashtext(PG_FUNCTION_ARGS) errmsg("could not determine which collation to use for string hashing"), errhint("Use the COLLATE clause to set the collation explicitly."))); - if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID) + if (!lc_collate_is_c(collid)) mylocale = pg_newlocale_from_collation(collid); if (!mylocale || mylocale->deterministic) @@ -334,7 +334,7 @@ hashtextextended(PG_FUNCTION_ARGS) errmsg("could not determine which collation to use for string hashing"), errhint("Use the COLLATE clause to set the collation explicitly."))); - if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID) + if (!lc_collate_is_c(collid)) mylocale = pg_newlocale_from_collation(collid); if (!mylocale || mylocale->deterministic) diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c index e0d93eab32..6e84f42cb2 100644 --- a/src/backend/regex/regc_pg_locale.c +++ b/src/backend/regex/regc_pg_locale.c @@ -231,6 +231,18 @@ static const unsigned char pg_char_properties[128] = { void pg_set_regex_collation(Oid collation) { + if (!OidIsValid(collation)) + { + /* +* This typically means that the parser could not resolve a +* conflict of implicit collations, so report it that way. +*/ + ereport(ERROR, + (errcode(ERRCODE_INDETERMINATE_COLLATION), +errmsg("could not determine which collation to use for regular expression"), +errhint("Use the COLLATE clause to set the collation explicitly."))); + } + if (lc_ctype_is_c(collation)) { /* C/POSIX collations use this path regardless of database encoding */ @@ -240,28 +252,12 @@ pg_set_regex_collation(Oid collation) } else { - if (collation == DEFAULT_COLLATION_OID) - pg_regex_locale = 0; - else if (OidIsValid(collation)) - { - /* -* NB: pg_newlocale_from_collation will fail if not HAVE_LOCALE_T; -* the case of pg_regex_locale != 0 but not HAVE_LOCALE_T does not -* have to be considered below. -*/ - pg_regex_locale = pg_newlocale_from_collation(collation); - } - else - { - /* -* This typically means that the parser could not resolve a -* conflict of implicit collations, so
Re: ICU for global collation
On Tue, Jan 11, 2022 at 12:36:46PM +0100, Daniel Verite wrote: > > If CREATE DATABASE referred to a collation in the template db, > either that collation already exists, or the user would have to add it > to the template db with CREATE COLLATION. > initdb already populates the template databases with a full set of > ICU collations through pg_import_system_collations(). > I don't quite see what change you're seeing that would be needed in > initdb. Yes, there are already the system collation imported. But you still need to make sure that that collation does exist in the template database, and it's still impossible to connect to that database to check when processing the CREATE DATABASE. Also, if the wanted collation wasn't imported by pg_import_system_collations() and isn't the server's default collation, then the user would have to allow connection on the template0 database and create the wanted collation there. It doesn't seem like something that should be recommended for a reasonably standard use case.
Re: ICU for global collation
Julien Rouhaud wrote: > > I guess there's still the possibility of requiring that the ICU db-wide > > collation of the new database does exist in the template database, > > and then the CREATE DATABASE would refer to that collation instead of > > an independent locale string. > > That could work. However if having the collation in the template database a > strict requirement the something should also be done for initdb, and it will > probably be a bigger problem. If CREATE DATABASE referred to a collation in the template db, either that collation already exists, or the user would have to add it to the template db with CREATE COLLATION. initdb already populates the template databases with a full set of ICU collations through pg_import_system_collations(). I don't quite see what change you're seeing that would be needed in initdb. Best regards, -- Daniel Vérité PostgreSQL-powered mailer: https://www.manitou-mail.org Twitter: @DanielVerite
Re: ICU for global collation
On 10.01.22 12:49, Daniel Verite wrote: With the current patch, it's not possible, AFAICS, because the user can't tell that the collation is non-deterministic. Presumably this would require another option to CREATE DATABASE and another column to store that bit of information. Adding this would be easy, but since pattern matching currently does not support nondeterministic collations, if you make a global collation nondeterministic, a lot of system views, psql, pg_dump queries etc. would break, so it's not practical. I view this is an orthogonal project. Once we can support this without breaking system views etc., then it's easy to enable with a new column in pg_database. The "daticucol" column also suggests that we don't expect to add other collation providers in the future. Maybe a pair of columns like (datcollprovider, datcolllocale) would be more future-proof, or a (datcollprovider, datcolllocale, datcollisdeterministic) triplet if non-deterministic collations are allowed. I don't expect many new collation providers. So I don't think an EAV-like storage would be helpful. The other problem is that we don't know what we need. For example, the libc provider needs both a collate and a ctype value, so that wouldn't fit into that scheme nicely. Also, pg_collation has "collversion" to detect a mismatch between the ICU runtime and existing indexes. I don't see that field for the db-wide ICU collation, so maybe we currently miss the capability to detect the mismatch for the db-wide collation? Yeah, I think I need to add a datcollversion field and the associated checks.
Re: ICU for global collation
On 07.01.22 10:03, Julien Rouhaud wrote: I changed the datcollate, datctype, and the new daticucoll fields to type text (from name). That way, the daticucoll field can be set to null if it's not applicable. Also, the limit of 63 characters can actually be a problem if you want to use some combination of the options that ICU locales offer. And for less extreme uses, having variable-length fields will save some storage, since typical locale names are much shorter. I understand the need to have daticucoll as text, however it's not clear to me why this has to be changed for datcollate and datctype? IIUC those will only ever contain libc-based collation and are still mandatory? Right. I just did this for consistency. It would be strange otherwise to have some fields as name and some as text. Arguably, using "name" here was wrong to begin with, since they are not really object names. Maybe there used to be a reason to avoid variable-length fields in pg_database, but there isn't one now AFAICT. - pg_upgrade It checks (in check_locale_and_encoding()) the compatibility for each database, and it looks like the daticucoll field should also be verified. Other than that I don't think there is anything else needed for the pg_upgrade part as everything else should be handled by pg_dump (I didn't look at the changes yet given the below problems). Ok, I have added this and will include it in my next patch submission. - CREATE DATABASE - initdb Ok, some work is needed to make these interfaces behave sensibly in various combinations. I will look into that.
Re: ICU for global collation
On Tue, Jan 11, 2022 at 10:10:25AM +0100, Peter Eisentraut wrote: > > On 10.01.22 07:00, Julien Rouhaud wrote: > > > > So I tried to run Noah's benchmark to see if I could reproduce the slowdown. > > Unfortunately the results I'm getting don't really make sense as removing > > the > > optimisation brings a 15% speedup, and with a few more runs I can see that I > > have about 25% noise, so there isn't much I can do to help. > > Heh, I had that same experience, it actually got faster without the > optimization, but then got lost in the noise on further testing. Ah, so it's not just my machine :) > Looking back at those discussions, I don't think those old test results are > relevant anymore. In the patch that was being tested there, > pg_newlocale_from_collation(), did not contain > > if (collid == DEFAULT_COLLATION_OID) > return (pg_locale_t) 0; > > so the default collation actually went through most or all of the function > and did a lot of work. That would understandably be quite slow. But just > calling a function and returning immediately should not be a problem. > Otherwise, the call to check_collation_set() in varstr_cmp() and elsewhere > would be just as bad. I didn't noticed that. That definitely explain why the performance concern isn't valid anymore. > So, unless there are concerns, I'm going to see about making a patch to call > pg_newlocale_from_collation() even with the default collation. That would > make the actual feature patch quite a bit smaller, since we won't have to > patch every call site of pg_newlocale_from_collation(). +1 for me!
Re: ICU for global collation
On 10.01.22 07:00, Julien Rouhaud wrote: And then I changed in varstr_cmp(): if (collid != DEFAULT_COLLATION_OID) mylocale = pg_newlocale_from_collation(collid); to just mylocale = pg_newlocale_from_collation(collid); I find that the \timing results are indistinguishable. (I used locale "en_US.UTF-8" and made sure that that code path is actually hit.) Does anyone have other insights? Looking at the git history, you added this comment in 414c5a2ea65. After a bit a digging in the lists, I found that you introduced it to fix a reported 13% slowdown in varstr_cmp(): https://www.postgresql.org/message-id/20110129075253.GA18784%40tornado.leadboat.com https://www.postgresql.org/message-id/1296748408.6442.1.camel%40vanquo.pezone.net So I tried to run Noah's benchmark to see if I could reproduce the slowdown. Unfortunately the results I'm getting don't really make sense as removing the optimisation brings a 15% speedup, and with a few more runs I can see that I have about 25% noise, so there isn't much I can do to help. Heh, I had that same experience, it actually got faster without the optimization, but then got lost in the noise on further testing. Looking back at those discussions, I don't think those old test results are relevant anymore. In the patch that was being tested there, pg_newlocale_from_collation(), did not contain if (collid == DEFAULT_COLLATION_OID) return (pg_locale_t) 0; so the default collation actually went through most or all of the function and did a lot of work. That would understandably be quite slow. But just calling a function and returning immediately should not be a problem. Otherwise, the call to check_collation_set() in varstr_cmp() and elsewhere would be just as bad. So, unless there are concerns, I'm going to see about making a patch to call pg_newlocale_from_collation() even with the default collation. That would make the actual feature patch quite a bit smaller, since we won't have to patch every call site of pg_newlocale_from_collation().
Re: ICU for global collation
On Mon, Jan 10, 2022 at 03:45:47PM +0100, Daniel Verite wrote: > > By that I understand that CREATE DATABASE is limited to copying a template > database and then not write anything into it beyond that, as it's > not even connected to it. Yes. > I guess there's still the possibility of requiring that the ICU db-wide > collation of the new database does exist in the template database, > and then the CREATE DATABASE would refer to that collation instead of > an independent locale string. That could work. However if having the collation in the template database a strict requirement the something should also be done for initdb, and it will probably be a bigger problem.
Re: ICU for global collation
Julien Rouhaud wrote: > > The lack of these fields overall suggest the idea that when CREATE > > DATABASE is called with a global ICU collation, what if it somehow > > inserted the collation into pg_collation in the new database? > > Then pg_database would just store the collation oid and no other > > collation-related field would need to be added into it, now > > or in the future. > > I don't think it would be doable given the single-database-per-backend > restriction. By that I understand that CREATE DATABASE is limited to copying a template database and then not write anything into it beyond that, as it's not even connected to it. I guess there's still the possibility of requiring that the ICU db-wide collation of the new database does exist in the template database, and then the CREATE DATABASE would refer to that collation instead of an independent locale string. Best regards, -- Daniel Vérité PostgreSQL-powered mailer: https://www.manitou-mail.org Twitter: @DanielVerite
Re: ICU for global collation
On Mon, Jan 10, 2022 at 12:49:07PM +0100, Daniel Verite wrote: > > The "daticucol" column also suggests that we don't expect to add > other collation providers in the future. Maybe a pair of columns like > (datcollprovider, datcolllocale) would be more future-proof, > or a (datcollprovider, datcolllocale, datcollisdeterministic) > triplet if non-deterministic collations are allowed. I'm not sure about the non-deterministic default collation given the current restrictions with it, but the extra column seems like a good idea. It would require a bit more thinking, as we would need a second collation column in pg_database for any default provider that's not libc. > Also, pg_collation has "collversion" to detect a mismatch between > the ICU runtime and existing indexes. I don't see that field > for the db-wide ICU collation, so maybe we currently miss the capability > to detect the mismatch for the db-wide collation? I don't think that storing a version there will really help. There's no guarantee that any object has been created with the version of the collation that was installed when the database was created. And we would still need to store a version with each underlying object anyway, as rebuilding all broken dependencies can last for a long time, including a server restart. > The lack of these fields overall suggest the idea that when CREATE > DATABASE is called with a global ICU collation, what if it somehow > inserted the collation into pg_collation in the new database? > Then pg_database would just store the collation oid and no other > collation-related field would need to be added into it, now > or in the future. I don't think it would be doable given the single-database-per-backend restriction.
Re: ICU for global collation
Peter Eisentraut wrote: > Unlike in the previous patch, where the ICU > collation name was written in datcollate, there is now a third column > (daticucoll), so we can store all three values. I think some users would want their db-wide ICU collation to be case/accent-insensitive. Postgres users are trained to expect case-sensitive comparisons, but some apps initially made for e.g. MySQL or MS-SQL that use such collations natively would be easier to port to Postgres. IIRC, that was the context for some questions where people were enquiring about db-wide ICU collations. With the current patch, it's not possible, AFAICS, because the user can't tell that the collation is non-deterministic. Presumably this would require another option to CREATE DATABASE and another column to store that bit of information. The "daticucol" column also suggests that we don't expect to add other collation providers in the future. Maybe a pair of columns like (datcollprovider, datcolllocale) would be more future-proof, or a (datcollprovider, datcolllocale, datcollisdeterministic) triplet if non-deterministic collations are allowed. Also, pg_collation has "collversion" to detect a mismatch between the ICU runtime and existing indexes. I don't see that field for the db-wide ICU collation, so maybe we currently miss the capability to detect the mismatch for the db-wide collation? The lack of these fields overall suggest the idea that when CREATE DATABASE is called with a global ICU collation, what if it somehow inserted the collation into pg_collation in the new database? Then pg_database would just store the collation oid and no other collation-related field would need to be added into it, now or in the future. Best regards, -- Daniel Vérité PostgreSQL-powered mailer: https://www.manitou-mail.org Twitter: @DanielVerite
Re: ICU for global collation
On Mon, Jan 10, 2022 at 11:25:08AM +0800, Julien Rouhaud wrote: > On Fri, Jan 07, 2022 at 03:25:28PM +0100, Peter Eisentraut wrote: > > > > I tested this a bit. I used the following setup: > > > > create table t1 (a text); > > insert into t1 select md5(generate_series(1, 1000)::text); > > select count(*) from t1 where a > ''; > > > > And then I changed in varstr_cmp(): > > > > if (collid != DEFAULT_COLLATION_OID) > > mylocale = pg_newlocale_from_collation(collid); > > > > to just > > > > mylocale = pg_newlocale_from_collation(collid); > > > > I find that the \timing results are indistinguishable. (I used locale > > "en_US.UTF-8" and made sure that that code path is actually hit.) > > > > Does anyone have other insights? > > Looking at the git history, you added this comment in 414c5a2ea65. > > After a bit a digging in the lists, I found that you introduced it to fix a > reported 13% slowdown in varstr_cmp(): > https://www.postgresql.org/message-id/20110129075253.GA18784%40tornado.leadboat.com > https://www.postgresql.org/message-id/1296748408.6442.1.camel%40vanquo.pezone.net So I tried to run Noah's benchmark to see if I could reproduce the slowdown. Unfortunately the results I'm getting don't really make sense as removing the optimisation brings a 15% speedup, and with a few more runs I can see that I have about 25% noise, so there isn't much I can do to help.
Re: ICU for global collation
On Fri, Jan 07, 2022 at 03:25:28PM +0100, Peter Eisentraut wrote: > > I tested this a bit. I used the following setup: > > create table t1 (a text); > insert into t1 select md5(generate_series(1, 1000)::text); > select count(*) from t1 where a > ''; > > And then I changed in varstr_cmp(): > > if (collid != DEFAULT_COLLATION_OID) > mylocale = pg_newlocale_from_collation(collid); > > to just > > mylocale = pg_newlocale_from_collation(collid); > > I find that the \timing results are indistinguishable. (I used locale > "en_US.UTF-8" and made sure that that code path is actually hit.) > > Does anyone have other insights? Looking at the git history, you added this comment in 414c5a2ea65. After a bit a digging in the lists, I found that you introduced it to fix a reported 13% slowdown in varstr_cmp(): https://www.postgresql.org/message-id/20110129075253.GA18784%40tornado.leadboat.com https://www.postgresql.org/message-id/1296748408.6442.1.camel%40vanquo.pezone.net
Re: ICU for global collation
Julien Rouhaud wrote: > If you want a database with an ICU default collation the lc_collate > and lc_ctype should inherit what's in the template database and not > what was provided in the LOCALE I think. You could still probably > overload them in some scenario, but without a list of what isn't > ICU-aware I can't really be sure of how often one might have to do > it. I guess we'd need that when creating a database with a different encoding than the template databases, at least. About what's not ICU-aware, I believe the most significant part in core is the Full Text Search parser. It doesn't care about sorting strings, but it relies on the functions from POSIX , which depend on LC_CTYPE (it looks however that this could be improved by following what has been done in backend/regex/regc_pg_locale.c, which has comparable needs and calls ICU functions when applicable). Also, any extension is potentially concerned. Surely many extensions call functions from ctype.h assuming that the current value of LC_CTYPE works with the data they handle. Best regards, -- Daniel Vérité PostgreSQL-powered mailer: https://www.manitou-mail.org Twitter: @DanielVerite
Re: ICU for global collation
On 04.01.22 17:03, Peter Eisentraut wrote: There are really a lot of places with this new code. Maybe it could be some new function/macro to wrap that for the normal case (e.g. not formatting.c)? Right, we could just put this into pg_newlocale_from_collation(), but the comment there says * In fact, they shouldn't call this function at all when they are dealing * with the default locale. That can save quite a bit in hotspots. I don't know how to assess that. I tested this a bit. I used the following setup: create table t1 (a text); insert into t1 select md5(generate_series(1, 1000)::text); select count(*) from t1 where a > ''; And then I changed in varstr_cmp(): if (collid != DEFAULT_COLLATION_OID) mylocale = pg_newlocale_from_collation(collid); to just mylocale = pg_newlocale_from_collation(collid); I find that the \timing results are indistinguishable. (I used locale "en_US.UTF-8" and made sure that that code path is actually hit.) Does anyone have other insights?
Re: ICU for global collation
Hi, I looked a bit more in this patch and I have some additional remarks. On Thu, Dec 30, 2021 at 01:07:21PM +0100, Peter Eisentraut wrote: > > So this is a different approach: If you choose ICU as the default locale for > a database, you still need to specify lc_ctype and lc_collate settings, as > before. Unlike in the previous patch, where the ICU collation name was > written in datcollate, there is now a third column (daticucoll), so we can > store all three values. This fixes the described problem. Other than that, > once you get all the initial settings right, it basically just works: The > places that have ICU support now will use a database-wide ICU collation if > appropriate, the places that don't have ICU support continue to use the > global libc locale settings. So just to confirm a database can now have 2 different *default* collations: a libc-based one for everything that doesn't work with ICU and a ICU-based (if specified) for everything else, and the ICU-based is optional, so if not provided everything works as before, using the libc based default collation. As I mentioned I think this approach is sensible. However, should we document what are the things that are not ICU-aware? > I changed the datcollate, datctype, and the new daticucoll fields to type > text (from name). That way, the daticucoll field can be set to null if it's > not applicable. Also, the limit of 63 characters can actually be a problem > if you want to use some combination of the options that ICU locales offer. > And for less extreme uses, having variable-length fields will save some > storage, since typical locale names are much shorter. I understand the need to have daticucoll as text, however it's not clear to me why this has to be changed for datcollate and datctype? IIUC those will only ever contain libc-based collation and are still mandatory? > > For the same reasons and to keep things consistent, I also changed the > analogous pg_collation fields like that. The respective fields in pg_collation are now nullable, so the changes there sounds ok. Digging a bit more in the patch here are some things that looks problematic. - pg_upgrade It checks (in check_locale_and_encoding()) the compatibility for each database, and it looks like the daticucoll field should also be verified. Other than that I don't think there is anything else needed for the pg_upgrade part as everything else should be handled by pg_dump (I didn't look at the changes yet given the below problems). - CREATE DATABASE There's a new COLLATION_PROVIDER option, but the overall behavior seems quite unintuitive. As far as I can see the idea is to use LOCALE for the ICU default collation, but as it's providing a default for the other values it's quite annoying. For instance: =# CREATE DATABASE db COLLATION_PROVIDER icu LOCALE 'fr-x-icu' LC_COLLATE 'en_GB.UTF-8';; ERROR: 42809: invalid locale name: "fr-x-icu" LOCATION: createdb, dbcommands.c:397 Looking at the code it's actually complaining about LC_CTYPE. If you want a database with an ICU default collation the lc_collate and lc_ctype should inherit what's in the template database and not what was provided in the LOCALE I think. You could still probably overload them in some scenario, but without a list of what isn't ICU-aware I can't really be sure of how often one might have to do it. Now, if I specify everything as needed it looks like it's missing some checks on the ICU default collation when not using template0: =# CREATE DATABASE db COLLATION_PROVIDER icu LOCALE 'en-x-icu' LC_COLLATE 'en_GB.UTF-8' LC_CTYPE 'en_GB.UTF-8';; CREATE DATABASE =# SELECT datname, datcollate, datctype, daticucoll FROM pg_database ; datname | datcollate | datctype | daticucoll ---+-+-+ postgres | en_GB.UTF-8 | en_GB.UTF-8 | fr-x-icu db| en_GB.UTF-8 | en_GB.UTF-8 | en-x-icu template1 | en_GB.UTF-8 | en_GB.UTF-8 | fr-x-icu template0 | en_GB.UTF-8 | en_GB.UTF-8 | fr-x-icu (4 rows) Unless I'm missing something the same concerns about collation incompatibility with objects in the source database should also apply for the ICU collation? While at it, I'm not exactly sure of what the COLLATION_PROVIDER is supposed to mean, as the same commands but with a libc provider is accepted and has the exact same result: =# CREATE DATABASE db2 COLLATION_PROVIDER libc LOCALE 'en-x-icu' LC_COLLATE 'en_GB.UTF-8' LC_CTYPE 'en_GB.UTF-8';; CREATE DATABASE =# SELECT datname, datcollate, datctype, daticucoll FROM pg_database ; datname | datcollate | datctype | daticucoll ---+-+-+ postgres | en_GB.UTF-8 | en_GB.UTF-8 | fr-x-icu db| en_GB.UTF-8 | en_GB.UTF-8 | en-x-icu template1 | en_GB.UTF-8 | en_GB.UTF-8 | fr-x-icu template0 | en_GB.UTF-8 | en_GB.UTF-8 | fr-x-icu db2 | en_GB.UTF-8 | en_GB.UTF-8 | en-x-icu (5 rows) Shouldn't db2 have a NULL daticucoll, and if so also complain about
Re: ICU for global collation
On Thu, Jan 06, 2022 at 01:55:55PM +, Finnerty, Jim wrote: > > I didn't notice anything version-specific about the patch. Would any > modifications be needed to backport it to pg13 and pg14? This is a new feature so it can't be backported. The changes aren't big and mostly touches places that didn't change in a long time so I don't think that it would take much effort if you wanted to backport it on your own forks. > After this patch goes in, the big next thing would be to support > nondeterministic collations for LIKE, ILIKE and pattern matching operators in > general. Is anyone interested in working on that? As far as I know you're the last person that seemed to be working on that topic back in March :)
Re: ICU for global collation
I didn't notice anything version-specific about the patch. Would any modifications be needed to backport it to pg13 and pg14? After this patch goes in, the big next thing would be to support nondeterministic collations for LIKE, ILIKE and pattern matching operators in general. Is anyone interested in working on that? On 1/5/22, 10:36 PM, "Julien Rouhaud" wrote: CAUTION: This email originated from outside of the organization. Do not click links or open attachments unless you can confirm the sender and know the content is safe. On Tue, Jan 04, 2022 at 05:03:10PM +0100, Peter Eisentraut wrote: > On 04.01.22 03:21, Julien Rouhaud wrote: > > > > - if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID) > > > - mylocale = pg_newlocale_from_collation(collid); > > > + if (!lc_collate_is_c(collid)) > > > + { > > > + if (collid != DEFAULT_COLLATION_OID) > > > + mylocale = pg_newlocale_from_collation(collid); > > > + else if (default_locale.provider == COLLPROVIDER_ICU) > > > + mylocale = _locale; > > > + } > > > > There are really a lot of places with this new code. Maybe it could be some > > new function/macro to wrap that for the normal case (e.g. not formatting.c)? > > Right, we could just put this into pg_newlocale_from_collation(), but the > comment there says > > * In fact, they shouldn't call this function at all when they are dealing > * with the default locale. That can save quite a bit in hotspots. > > I don't know how to assess that. > > We could pack this into a macro or inline function if we are concerned about > this. Yes that was my idea, just have a new function (inline function or a macro then since pg_newlocale_from_collation() clearly warns about performance concerns) that have the whole is-not-c-collation-and-is-default-collation-or-icu-collation logic and calls pg_newlocale_from_collation() only when needed.
Re: ICU for global collation
On Tue, Jan 04, 2022 at 05:03:10PM +0100, Peter Eisentraut wrote: > On 04.01.22 03:21, Julien Rouhaud wrote: > > > > - if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID) > > > - mylocale = pg_newlocale_from_collation(collid); > > > + if (!lc_collate_is_c(collid)) > > > + { > > > + if (collid != DEFAULT_COLLATION_OID) > > > + mylocale = pg_newlocale_from_collation(collid); > > > + else if (default_locale.provider == COLLPROVIDER_ICU) > > > + mylocale = _locale; > > > + } > > > > There are really a lot of places with this new code. Maybe it could be some > > new function/macro to wrap that for the normal case (e.g. not formatting.c)? > > Right, we could just put this into pg_newlocale_from_collation(), but the > comment there says > > * In fact, they shouldn't call this function at all when they are dealing > * with the default locale. That can save quite a bit in hotspots. > > I don't know how to assess that. > > We could pack this into a macro or inline function if we are concerned about > this. Yes that was my idea, just have a new function (inline function or a macro then since pg_newlocale_from_collation() clearly warns about performance concerns) that have the whole is-not-c-collation-and-is-default-collation-or-icu-collation logic and calls pg_newlocale_from_collation() only when needed.
Re: ICU for global collation
On 04.01.22 03:21, Julien Rouhaud wrote: @@ -2774,6 +2776,7 @@ dumpDatabase(Archive *fout) appendPQExpBuffer(dbQry, "SELECT tableoid, oid, datname, " "(%s datdba) AS dba, " "pg_encoding_to_char(encoding) AS encoding, " + "datcollprovider, " This needs to be in a new pg 15+ branch, not in the pg 9.3+. ok - if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID) - mylocale = pg_newlocale_from_collation(collid); + if (!lc_collate_is_c(collid)) + { + if (collid != DEFAULT_COLLATION_OID) + mylocale = pg_newlocale_from_collation(collid); + else if (default_locale.provider == COLLPROVIDER_ICU) + mylocale = _locale; + } There are really a lot of places with this new code. Maybe it could be some new function/macro to wrap that for the normal case (e.g. not formatting.c)? Right, we could just put this into pg_newlocale_from_collation(), but the comment there says * In fact, they shouldn't call this function at all when they are dealing * with the default locale. That can save quite a bit in hotspots. I don't know how to assess that. We could pack this into a macro or inline function if we are concerned about this.
Re: ICU for global collation
Hi, On Thu, Dec 30, 2021 at 01:07:21PM +0100, Peter Eisentraut wrote: > > So this is a different approach: If you choose ICU as the default locale for > a database, you still need to specify lc_ctype and lc_collate settings, as > before. Unlike in the previous patch, where the ICU collation name was > written in datcollate, there is now a third column (daticucoll), so we can > store all three values. This fixes the described problem. Other than that, > once you get all the initial settings right, it basically just works: The > places that have ICU support now will use a database-wide ICU collation if > appropriate, the places that don't have ICU support continue to use the > global libc locale settings. That looks sensible to me. > @@ -2774,6 +2776,7 @@ dumpDatabase(Archive *fout) > appendPQExpBuffer(dbQry, "SELECT tableoid, oid, datname, " > "(%s datdba) AS dba, " > > "pg_encoding_to_char(encoding) AS encoding, " > + "datcollprovider, " This needs to be in a new pg 15+ branch, not in the pg 9.3+. > - if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID) > - mylocale = pg_newlocale_from_collation(collid); > + if (!lc_collate_is_c(collid)) > + { > + if (collid != DEFAULT_COLLATION_OID) > + mylocale = pg_newlocale_from_collation(collid); > + else if (default_locale.provider == COLLPROVIDER_ICU) > + mylocale = _locale; > + } There are really a lot of places with this new code. Maybe it could be some new function/macro to wrap that for the normal case (e.g. not formatting.c)?
Re: ICU for global collation
There were a few inquiries about this topic recently, so I dug up the old thread and patch. What we got stuck on last time was that we can't just swap out all locale support in a database for ICU. We still need to set the usual locale environment, otherwise some things that are not ICU aware will break or degrade. I had initially anticipated fixing that by converting everything that uses libc locales to ICU. But that turned out to be tedious and ultimately not very useful as far as the user-facing result is concerned, so I gave up. So this is a different approach: If you choose ICU as the default locale for a database, you still need to specify lc_ctype and lc_collate settings, as before. Unlike in the previous patch, where the ICU collation name was written in datcollate, there is now a third column (daticucoll), so we can store all three values. This fixes the described problem. Other than that, once you get all the initial settings right, it basically just works: The places that have ICU support now will use a database-wide ICU collation if appropriate, the places that don't have ICU support continue to use the global libc locale settings. I changed the datcollate, datctype, and the new daticucoll fields to type text (from name). That way, the daticucoll field can be set to null if it's not applicable. Also, the limit of 63 characters can actually be a problem if you want to use some combination of the options that ICU locales offer. And for less extreme uses, having variable-length fields will save some storage, since typical locale names are much shorter. For the same reasons and to keep things consistent, I also changed the analogous pg_collation fields like that. This also removes some weird code that has to check that colcollate and colctype have to be the same for ICU, so it's overall cleaner.From 4eb9fbac238c1abf481fa43431ecc22e782a5290 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Thu, 30 Dec 2021 12:47:24 +0100 Subject: [PATCH v3] Add option to use ICU as global collation provider This adds the option to use ICU as the default collation provider for either the whole cluster or a database. New options for initdb, createdb, and CREATE DATABASE are used to select this. Discussion: https://www.postgresql.org/message-id/flat/5e756dd6-0e91-d778-96fd-b1bcb06c161a%402ndquadrant.com --- doc/src/sgml/catalogs.sgml| 13 +- doc/src/sgml/ref/create_database.sgml | 16 ++ doc/src/sgml/ref/createdb.sgml| 9 + doc/src/sgml/ref/initdb.sgml | 23 ++ src/backend/access/hash/hashfunc.c| 18 +- src/backend/catalog/pg_collation.c| 24 +- src/backend/commands/collationcmds.c | 120 +++--- src/backend/commands/dbcommands.c | 93 ++-- src/backend/regex/regc_pg_locale.c| 7 +- src/backend/utils/adt/formatting.c| 6 + src/backend/utils/adt/like.c | 20 +- src/backend/utils/adt/like_support.c | 2 + src/backend/utils/adt/pg_locale.c | 223 +++--- src/backend/utils/adt/varchar.c | 22 +- src/backend/utils/adt/varlena.c | 26 +- src/backend/utils/init/postinit.c | 37 ++- src/bin/initdb/Makefile | 2 + src/bin/initdb/initdb.c | 62 - src/bin/initdb/t/001_initdb.pl| 18 +- src/bin/pg_dump/pg_dump.c | 16 ++ src/bin/psql/describe.c | 23 +- src/bin/psql/tab-complete.c | 2 +- src/bin/scripts/Makefile | 2 + src/bin/scripts/createdb.c| 9 + src/bin/scripts/t/020_createdb.pl | 20 +- src/include/catalog/pg_collation.dat | 3 +- src/include/catalog/pg_collation.h| 6 +- src/include/catalog/pg_database.dat | 2 +- src/include/catalog/pg_database.h | 16 +- src/include/utils/pg_locale.h | 6 + .../regress/expected/collate.icu.utf8.out | 10 +- src/test/regress/sql/collate.icu.utf8.sql | 8 +- 32 files changed, 665 insertions(+), 199 deletions(-) diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml index 03e2537b07..89e7279030 100644 --- a/doc/src/sgml/catalogs.sgml +++ b/doc/src/sgml/catalogs.sgml @@ -2368,7 +2368,7 @@ pg_collation Columns - collcollate name + collcollate text LC_COLLATE for this collation object @@ -2377,13 +2377,22 @@ pg_collation Columns - collctype name + collctype text LC_CTYPE for this collation object + + + collicucoll text + + + ICU collation string + + + collversion text diff --git a/doc/src/sgml/ref/create_database.sgml b/doc/src