Re: ICU for global collation

2022-11-01 Thread Michael Paquier
On Fri, Oct 21, 2022 at 05:32:38PM +0300, Marina Polyakova wrote:
> Hello!
> 
> I discovered an interesting behaviour during installcheck runs when the
> cluster was initialized with ICU locale provider:
> 
> $ initdb --locale-provider icu --icu-locale en-US -D data &&
> 2) The option --no-locale in pg_regress is described as "use C locale" [2].
> But in this case the created databases actually use the ICU locale provider
> with the ICU cluster locale from template0 (see
> diff_check_backend_used_provider.patch):
> 
> $ make NO_LOCALE=1 installcheck

Yes, this looks wrong on the ground on what -no-locale is expected to
do, aka use a C locale.  Peter?
--
Michael


signature.asc
Description: PGP signature


Re: ICU for global collation

2022-10-21 Thread Marina Polyakova

Hello!

I discovered an interesting behaviour during installcheck runs when the 
cluster was initialized with ICU locale provider:


$ initdb --locale-provider icu --icu-locale en-US -D data &&
pg_ctl -D data -l logfile start

1) The ECPG tests fail because they use the SQL_ASCII encoding [1], the 
database template0 uses the ICU locale provider and SQL_ASCII is not 
supported by ICU:


$ make -C src/interfaces/ecpg/ installcheck
...
== creating database "ecpg1_regression"   ==
ERROR:  encoding "SQL_ASCII" is not supported with ICU provider
ERROR:  database "ecpg1_regression" does not exist
command failed: "/home/marina/postgresql/master/my/inst/bin/psql" -X -c 
"CREATE DATABASE \"ecpg1_regression\" TEMPLATE=template0 
ENCODING='SQL_ASCII'" -c "ALTER DATABASE \"ecpg1_regression\" SET 
lc_messages TO 'C';ALTER DATABASE \"ecpg1_regression\" SET lc_monetary 
TO 'C';ALTER DATABASE \"ecpg1_regression\" SET lc_numeric TO 'C';ALTER 
DATABASE \"ecpg1_regression\" SET lc_time TO 'C';ALTER DATABASE 
\"ecpg1_regression\" SET bytea_output TO 'hex';ALTER DATABASE 
\"ecpg1_regression\" SET timezone_abbreviations TO 'Default';" 
"postgres"


2) The option --no-locale in pg_regress is described as "use C locale" 
[2]. But in this case the created databases actually use the ICU locale 
provider with the ICU cluster locale from template0 (see 
diff_check_backend_used_provider.patch):


$ make NO_LOCALE=1 installcheck

In regression.diffs:

diff -U3 
/home/marina/postgresql/master/src/test/regress/expected/test_setup.out 
/home/marina/postgresql/master/src/test/regress/results/test_setup.out
--- 
/home/marina/postgresql/master/src/test/regress/expected/test_setup.out	2022-09-27 
05:31:27.674628815 +0300
+++ 
/home/marina/postgresql/master/src/test/regress/results/test_setup.out	2022-10-21 
15:09:31.232992885 +0300

@@ -143,6 +143,798 @@
 \set filename :abs_srcdir '/data/person.data'
 COPY person FROM :'filename';
 VACUUM ANALYZE person;
+NOTICE:  varstrfastcmp_locale sss->collate_c 0 sss->locale 0xefacd0
+NOTICE:  varstrfastcmp_locale sss->locale->provider i
+NOTICE:  varstrfastcmp_locale sss->locale->info.icu.locale en-US
...

The patch diff_fix_pg_regress_create_database.patch fixes both issues 
for me.


[1] 
https://github.com/postgres/postgres/blob/ce20f8b9f4354b46b40fd6ebf7ce5c37d08747e0/src/interfaces/ecpg/test/Makefile#L18
[2] 
https://github.com/postgres/postgres/blob/ce20f8b9f4354b46b40fd6ebf7ce5c37d08747e0/src/test/regress/pg_regress.c#L1992


--
Marina Polyakova
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Companydiff --git a/src/backend/access/hash/hashfunc.c b/src/backend/access/hash/hashfunc.c
index b57ed946c42bb54ede800e95045aa937a8dbad85..b3c0f6f753f8428274389844ccf9778a7ed47ea4 100644
--- a/src/backend/access/hash/hashfunc.c
+++ b/src/backend/access/hash/hashfunc.c
@@ -281,6 +281,14 @@ hashtext(PG_FUNCTION_ARGS)
 	if (!lc_collate_is_c(collid))
 		mylocale = pg_newlocale_from_collation(collid);
 
+	elog(NOTICE, "hashtext lc_collate_is_c(collid) %d mylocale %p", lc_collate_is_c(collid), mylocale);
+	if (mylocale)
+	{
+		elog(NOTICE, "hashtext mylocale->provider %c", mylocale->provider);
+		if (mylocale->provider == COLLPROVIDER_ICU)
+			elog(NOTICE, "hashtext mylocale->info.icu.locale %s", mylocale->info.icu.locale ? mylocale->info.icu.locale : "(null)");
+	}
+
 	if (!mylocale || mylocale->deterministic)
 	{
 		result = hash_any((unsigned char *) VARDATA_ANY(key),
@@ -337,6 +345,14 @@ hashtextextended(PG_FUNCTION_ARGS)
 	if (!lc_collate_is_c(collid))
 		mylocale = pg_newlocale_from_collation(collid);
 
+	elog(NOTICE, "hashtextextended lc_collate_is_c(collid) %d mylocale %p", lc_collate_is_c(collid), mylocale);
+	if (mylocale)
+	{
+		elog(NOTICE, "hashtextextended mylocale->provider %c", mylocale->provider);
+		if (mylocale->provider == COLLPROVIDER_ICU)
+			elog(NOTICE, "hashtextextended mylocale->info.icu.locale %s", mylocale->info.icu.locale ? mylocale->info.icu.locale : "(null)");
+	}
+
 	if (!mylocale || mylocale->deterministic)
 	{
 		result = hash_any_extended((unsigned char *) VARDATA_ANY(key),
diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c
index 02d462a659778016f3c4479d425ba0a84feb6e26..9627c84a7ccfb4c4013556a51c989e9e6d611634 100644
--- a/src/backend/regex/regc_pg_locale.c
+++ b/src/backend/regex/regc_pg_locale.c
@@ -243,6 +243,8 @@ pg_set_regex_collation(Oid collation)
  errhint("Use the COLLATE clause to set the collation explicitly.")));
 	}
 
+	elog(NOTICE, "pg_set_regex_collation lc_ctype_is_c(collid) %d", lc_ctype_is_c(collation));
+
 	if (lc_ctype_is_c(collation))
 	{
 		/* C/POSIX collations use this path regardless of database encoding */
@@ -259,6 +261,14 @@ pg_set_regex_collation(Oid collation)
 		 */
 		pg_regex_locale = pg_newlocale_from_collation(collation);
 
+		elog(NOTICE, "pg_set_regex_collation pg_regex_locale %p", pg_regex_locale);
+		if (pg_regex_locale)
+		{
+			elog(NOTICE, 

Re: ICU for global collation

2022-10-08 Thread Marina Polyakova

On 2022-10-01 15:07, Peter Eisentraut wrote:

On 22.09.22 20:06, Marina Polyakova wrote:

On 2022-09-21 17:53, Peter Eisentraut wrote:

Committed with that test, thanks.  I think that covers all the ICU
issues you reported for PG15 for now?


I thought about the order of the ICU checks - if it is ok to check 
that the selected encoding is supported by ICU after printing all the 
locale & encoding information, why not to move almost all the ICU 
checks here?..


It's possible that we can do better, but I'm not going to add things
like that to PG 15 at this point unless it fixes a faulty behavior.


Will PG 15 always have this order of ICU checks, is the current 
behaviour correct enough? On the other hand, there may be a better fix 
for PG 16+ and not all changes can be backported...


On 2022-09-16 10:56, Peter Eisentraut wrote:

On 15.09.22 17:41, Marina Polyakova wrote:
I agree with you. Here's another version of the patch. The 
locale/encoding checks and reports in initdb have been reordered, 
because now the encoding is set first and only then the ICU locale is 
checked.


I committed something based on the first version of your patch.  This
reordering of the messages here was a little too much surgery for me
at this point.  For instance, there are also messages in #ifdef WIN32
code that would need to be reordered as well.  I kept the overall
structure of the code the same and just inserted the additional
proposed checks.

If you want to pursue the reordering of the checks and messages
overall, a patch for the master branch could be considered.


I've worked on this again (see attached patch) but I'm not sure if the 
messages of encoding mismatches are clear enough without the full locale 
information. For


$ initdb -D data --icu-locale en --locale-provider icu

compare the outputs:

The database cluster will be initialized with this locale configuration:
  provider:icu
  ICU locale:  en
  LC_COLLATE:  de_DE.iso885915@euro
  LC_CTYPE:de_DE.iso885915@euro
  LC_MESSAGES: en_US.utf8
  LC_MONETARY: de_DE.iso885915@euro
  LC_NUMERIC:  de_DE.iso885915@euro
  LC_TIME: de_DE.iso885915@euro
The default database encoding has been set to "UTF8".
initdb: error: encoding mismatch
initdb: detail: The encoding you selected (UTF8) and the encoding that 
the selected locale uses (LATIN9) do not match. This would lead to 
misbehavior in various character string processing functions.
initdb: hint: Rerun initdb and either do not specify an encoding 
explicitly, or choose a matching combination.


and

Encoding "UTF8" implied by locale will be set as the default database 
encoding.

initdb: error: encoding mismatch
initdb: detail: The encoding you selected (UTF8) and the encoding that 
the selected locale uses (LATIN9) do not match. This would lead to 
misbehavior in various character string processing functions.
initdb: hint: Rerun initdb and either do not specify an encoding 
explicitly, or choose a matching combination.


The same without ICU, e.g. for

$ initdb -D data

the output with locale information:

The database cluster will be initialized with this locale configuration:
  provider:libc
  LC_COLLATE:  en_US.utf8
  LC_CTYPE:de_DE.iso885915@euro
  LC_MESSAGES: en_US.utf8
  LC_MONETARY: de_DE.iso885915@euro
  LC_NUMERIC:  de_DE.iso885915@euro
  LC_TIME: de_DE.iso885915@euro
The default database encoding has accordingly been set to "LATIN9".
initdb: error: encoding mismatch
initdb: detail: The encoding you selected (LATIN9) and the encoding that 
the selected locale uses (UTF8) do not match. This would lead to 
misbehavior in various character string processing functions.
initdb: hint: Rerun initdb and either do not specify an encoding 
explicitly, or choose a matching combination.


and the "shorter" version:

Encoding "LATIN9" implied by locale will be set as the default database 
encoding.

initdb: error: encoding mismatch
initdb: detail: The encoding you selected (LATIN9) and the encoding that 
the selected locale uses (UTF8) do not match. This would lead to 
misbehavior in various character string processing functions.
initdb: hint: Rerun initdb and either do not specify an encoding 
explicitly, or choose a matching combination.


BTW, what did you mean that "there are also messages in #ifdef WIN32 
code that would need to be reordered as well"?..


--
Marina Polyakova
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Companydiff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index 96b46cbc020ef8b85b6d54d3d4ca8ad116277832..242d68f58287aeb6f95619c2ce8b78e38433cf18 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -1030,6 +1030,11 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt)
 
 	if (dblocprovider == COLLPROVIDER_ICU)
 	{
+#ifndef USE_ICU
+		ereport(ERROR,
+(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("ICU is not supported in this build")));
+#else
 		if 

Re: ICU for global collation

2022-10-01 Thread Peter Eisentraut

On 22.09.22 20:06, Marina Polyakova wrote:

On 2022-09-21 17:53, Peter Eisentraut wrote:

Committed with that test, thanks.  I think that covers all the ICU
issues you reported for PG15 for now?


I thought about the order of the ICU checks - if it is ok to check that 
the selected encoding is supported by ICU after printing all the locale 
& encoding information, why not to move almost all the ICU checks here?..


It's possible that we can do better, but I'm not going to add things 
like that to PG 15 at this point unless it fixes a faulty behavior.






Re: ICU for global collation

2022-09-22 Thread Marina Polyakova

On 2022-09-21 17:53, Peter Eisentraut wrote:

Committed with that test, thanks.  I think that covers all the ICU
issues you reported for PG15 for now?


I thought about the order of the ICU checks - if it is ok to check that 
the selected encoding is supported by ICU after printing all the locale 
& encoding information, why not to move almost all the ICU checks 
here?..


Examples of the work of the attached patch:

1. ICU locale vs supported encoding:

1.1.

$ initdb --encoding sql-ascii --locale-provider icu hoge
...
initdb: error: encoding mismatch
initdb: detail: The encoding you selected (SQL_ASCII) is not supported 
with the ICU provider.
initdb: hint: Rerun initdb and either do not specify an encoding 
explicitly, or choose a matching combination.


1.2. (like before)

$ initdb --encoding sql-ascii --icu-locale en-US hoge
initdb: error: --icu-locale cannot be specified unless locale provider 
"icu" is chosen


$ createdb --encoding sql-ascii --icu-locale en-US hoge
createdb: error: database creation failed: ERROR:  ICU locale cannot be 
specified unless locale provider is ICU


2. For builds without ICU:

2.1.

$ initdb --locale-provider icu hoge
...
initdb: error: ICU is not supported in this build

$ createdb --locale-provider icu hoge
createdb: error: database creation failed: ERROR:  ICU is not supported 
in this build


2.2. (like before)

$ initdb --icu-locale en-US hoge
initdb: error: --icu-locale cannot be specified unless locale provider 
"icu" is chosen


$ createdb --icu-locale en-US hoge
createdb: error: database creation failed: ERROR:  ICU locale cannot be 
specified unless locale provider is ICU


2.3.

$ createdb --locale-provider icu --icu-locale en-US --encoding sql-ascii 
hoge
createdb: error: database creation failed: ERROR:  ICU is not supported 
in this build


4. About errors in initdb:

4.1. If icu_locale is not specified, but it is required, then we get 
this:


$ initdb --locale-provider icu hoge
The files belonging to this database system will be owned by user 
"marina".

This user must also own the server process.

The database cluster will be initialized with this locale configuration:
  provider:icu
  LC_COLLATE:  en_US.UTF-8
  LC_CTYPE:en_US.UTF-8
  LC_MESSAGES: en_US.UTF-8
  LC_MONETARY: ru_RU.UTF-8
  LC_NUMERIC:  ru_RU.UTF-8
  LC_TIME: ru_RU.UTF-8
The default database encoding has been set to "UTF8".
initdb: error: ICU locale must be specified

Almost the same if ICU is not supported in this build:

$ initdb --locale-provider icu hoge
The files belonging to this database system will be owned by user 
"marina".

This user must also own the server process.

The database cluster will be initialized with this locale configuration:
  provider:icu
  LC_COLLATE:  en_US.UTF-8
  LC_CTYPE:en_US.UTF-8
  LC_MESSAGES: en_US.UTF-8
  LC_MONETARY: ru_RU.UTF-8
  LC_NUMERIC:  ru_RU.UTF-8
  LC_TIME: ru_RU.UTF-8
The default database encoding has been set to "UTF8".
initdb: error: ICU is not supported in this build

4.2. If icu_locale is specified for the wrong provider, the error will 
be at the beginning of the program start as before:


$ initdb --icu-locale en-US hoge
initdb: error: --icu-locale cannot be specified unless locale provider 
"icu" is chosen


--
Marina Polyakova
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Companydiff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index e0753c1badcc299e2bac45f3bdd2f23f59d70cbc..2589a2523e07c9543c99c7d7b446438d62382b89 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -1030,6 +1030,11 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt)
 
 	if (dblocprovider == COLLPROVIDER_ICU)
 	{
+#ifndef USE_ICU
+		ereport(ERROR,
+(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("ICU is not supported in this build")));
+#else
 		if (!(is_encoding_supported_by_icu(encoding)))
 			ereport(ERROR,
 	(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
@@ -1046,6 +1051,7 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt)
 	 errmsg("ICU locale must be specified")));
 
 		check_icu_locale(dbiculocale);
+#endif
 	}
 	else
 	{
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 2b42d9ccd8b751bcb5cda3a1f4c7803a68bc0a4a..743f11e1d1fd62d500fc2846976472d13e08046b 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -1945,15 +1945,12 @@ icu_set_collation_attributes(UCollator *collator, const char *loc)
 	}
 }
 
-#endif			/* USE_ICU */
-
 /*
  * Check if the given locale ID is valid, and ereport(ERROR) if it isn't.
  */
 void
 check_icu_locale(const char *icu_locale)
 {
-#ifdef USE_ICU
 	UCollator  *collator;
 	UErrorCode	status;
 
@@ -1967,13 +1964,10 @@ check_icu_locale(const char *icu_locale)
 	if (U_ICU_VERSION_MAJOR_NUM < 54)
 		icu_set_collation_attributes(collator, icu_locale);
 	ucol_close(collator);
-#else
-	ereport(ERROR,
-			

Re: ICU for global collation

2022-09-21 Thread Peter Eisentraut

On 21.09.22 08:50, Marina Polyakova wrote:

On 2022-09-20 12:59, Peter Eisentraut wrote:

On 17.09.22 10:33, Marina Polyakova wrote:

3.

The locale provider is ICU, but it has not yet been set from the 
template database:



$ initdb --locale-provider icu --icu-locale en-US -D data &&
pg_ctl -D data -l logfile start &&
createdb --icu-locale ru-RU --template template0 mydb
...
createdb: error: database creation failed: ERROR:  ICU locale cannot be
specified unless locale provider is ICU


Please see attached patch for a fix.  Does that work for you?


Yes, it works. The following test checks this fix:


Committed with that test, thanks.  I think that covers all the ICU 
issues you reported for PG15 for now?






Re: ICU for global collation

2022-09-21 Thread Marina Polyakova

On 2022-09-20 12:59, Peter Eisentraut wrote:

On 17.09.22 10:33, Marina Polyakova wrote:

3.

The locale provider is ICU, but it has not yet been set from the 
template database:



$ initdb --locale-provider icu --icu-locale en-US -D data &&
pg_ctl -D data -l logfile start &&
createdb --icu-locale ru-RU --template template0 mydb
...
createdb: error: database creation failed: ERROR:  ICU locale cannot 
be

specified unless locale provider is ICU


Please see attached patch for a fix.  Does that work for you?


Yes, it works. The following test checks this fix:

diff --git a/src/bin/scripts/t/020_createdb.pl 
b/src/bin/scripts/t/020_createdb.pl
index 
b87d8fc63b5246b02bcd4499aae815269b60df7c..c2464a99618cd7ca5616cc21121e1e4379b52baf 
100644

--- a/src/bin/scripts/t/020_createdb.pl
+++ b/src/bin/scripts/t/020_createdb.pl
@@ -71,6 +71,14 @@ if ($ENV{with_icu} eq 'yes')
$node2->command_ok(
 		[ 'createdb', '-T', 'template0', '--locale-provider=libc', 'foobar55' 
],
 		'create database with libc provider from template database with icu 
provider');

+
+   $node2->command_ok(
+   [
+   'createdb', '-T', 'template0', '--icu-locale',
+   'en-US', 'foobar56'
+   ],
+		'create database with icu locale from template database with icu 
provider'

+   );
 }
 else
 {

--
Marina Polyakova
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company




Re: ICU for global collation

2022-09-20 Thread Peter Eisentraut

On 17.09.22 10:33, Marina Polyakova wrote:
Thanks to Kyotaro Horiguchi review we found out that there're 
interesting cases due to the order of some ICU checks:


1. ICU locale vs supported encoding:

1.1.

On 2022-09-15 09:52, Kyotaro Horiguchi wrote:

If I executed initdb as follows, I would be told to specify
--icu-locale option.


$ initdb --encoding sql-ascii --locale-provider icu hoge
...
initdb: error: ICU locale must be specified


However, when I reran the command, it complains about incompatible
encoding this time.  I think it's more user-friendly to check for the
encoding compatibility before the check for missing --icu-locale
option.


This a valid point, but it would require quite a bit of work to move all 
those checks around and re-verify the result, so I don't want to do it 
in PG15.



1.2. (ok?)

$ initdb --encoding sql-ascii --icu-locale en-US hoge
initdb: error: --icu-locale cannot be specified unless locale provider 
"icu" is chosen


$ initdb --encoding sql-ascii --icu-locale en-US --locale-provider icu hoge
...
initdb: error: encoding mismatch
initdb: detail: The encoding you selected (SQL_ASCII) is not supported 
with the ICU provider.
initdb: hint: Rerun initdb and either do not specify an encoding 
explicitly, or choose a matching combination.


$ createdb --encoding sql-ascii --icu-locale en-US hoge
createdb: error: database creation failed: ERROR:  ICU locale cannot be 
specified unless locale provider is ICU
$ createdb --encoding sql-ascii --icu-locale en-US --locale-provider icu 
hoge
createdb: error: database creation failed: ERROR:  encoding "SQL_ASCII" 
is not supported with ICU provider


I don't see a problem here.


2. For builds without ICU:

2.1.

$ initdb --locale-provider icu hoge
...
initdb: error: ICU locale must be specified

$ initdb --locale-provider icu --icu-locale en-US hoge
...
initdb: error: ICU is not supported in this build

$ createdb --locale-provider icu hoge
createdb: error: database creation failed: ERROR:  ICU locale must be 
specified


$ createdb --locale-provider icu --icu-locale en-US hoge
createdb: error: database creation failed: ERROR:  ICU is not supported 
in this build


IMO, it would be more user-friendly to inform an unsupported build in 
the first runs too..


Again, this would require reorganizing a bunch of code to get some 
cosmetic benefit, which isn't a good idea now for PG15.



2.2. (ok?)
2.3.


same here


3.

The locale provider is ICU, but it has not yet been set from the 
template database:



$ initdb --locale-provider icu --icu-locale en-US -D data &&
pg_ctl -D data -l logfile start &&
createdb --icu-locale ru-RU --template template0 mydb
...
createdb: error: database creation failed: ERROR:  ICU locale cannot be
specified unless locale provider is ICU


Please see attached patch for a fix.  Does that work for you?From 5bb07e390baf8a471f7d30582469268c8a6d71ca Mon Sep 17 00:00:00 2001
From: Peter Eisentraut 
Date: Tue, 20 Sep 2022 05:50:41 -0400
Subject: [PATCH] Improve ICU option handling in CREATE DATABASE

We check that the ICU locale is only specified if the ICU locale
provider is selected.  But we did that too early.  We need to wait
until we load the settings of the template database, since that could
also set what the locale provider is.

Reported-by: Marina Polyakova 
Discussion: 
https://www.postgresql.org/message-id/9ba4cd1ea6ed6b7b15c0ff15e6f54...@postgrespro.ru
---
 src/backend/commands/dbcommands.c | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/backend/commands/dbcommands.c 
b/src/backend/commands/dbcommands.c
index cba929b7f998..5dfec5c6b056 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -907,10 +907,6 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt)
 errmsg("unrecognized locale provider: 
%s",
locproviderstr)));
}
-   if (diculocale && dblocprovider != COLLPROVIDER_ICU)
-   ereport(ERROR,
-   (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
-errmsg("ICU locale cannot be specified unless 
locale provider is ICU")));
if (distemplate && distemplate->arg)
dbistemplate = defGetBoolean(distemplate);
if (dallowconnections && dallowconnections->arg)
@@ -1050,6 +1046,13 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt)
 
check_icu_locale(dbiculocale);
}
+   else
+   {
+   if (dbiculocale)
+   ereport(ERROR,
+   
(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+errmsg("ICU locale cannot be specified 
unless locale provider is ICU")));
+   }
 
/*
 * Check that the new encoding and locale settings match the source
-- 
2.37.3



Re: ICU for global collation

2022-09-17 Thread Marina Polyakova

On 2022-09-16 10:56, Peter Eisentraut wrote:

On 15.09.22 17:41, Marina Polyakova wrote:
I agree with you. Here's another version of the patch. The 
locale/encoding checks and reports in initdb have been reordered, 
because now the encoding is set first and only then the ICU locale is 
checked.


I committed something based on the first version of your patch.  This
reordering of the messages here was a little too much surgery for me
at this point.  For instance, there are also messages in #ifdef WIN32
code that would need to be reordered as well.  I kept the overall
structure of the code the same and just inserted the additional
proposed checks.

If you want to pursue the reordering of the checks and messages
overall, a patch for the master branch could be considered.


Thank you! I already wrote about the order of the ICU checks in 
initdb/create database, they were the only reason to propose such 
changes...


--
Marina Polyakova
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company




Re: ICU for global collation

2022-09-17 Thread Marina Polyakova
Thanks to Kyotaro Horiguchi review we found out that there're 
interesting cases due to the order of some ICU checks:


1. ICU locale vs supported encoding:

1.1.

On 2022-09-15 09:52, Kyotaro Horiguchi wrote:

If I executed initdb as follows, I would be told to specify
--icu-locale option.


$ initdb --encoding sql-ascii --locale-provider icu hoge
...
initdb: error: ICU locale must be specified


However, when I reran the command, it complains about incompatible
encoding this time.  I think it's more user-friendly to check for the
encoding compatibility before the check for missing --icu-locale
option.


1.2. (ok?)

$ initdb --encoding sql-ascii --icu-locale en-US hoge
initdb: error: --icu-locale cannot be specified unless locale provider 
"icu" is chosen


$ initdb --encoding sql-ascii --icu-locale en-US --locale-provider icu 
hoge

...
initdb: error: encoding mismatch
initdb: detail: The encoding you selected (SQL_ASCII) is not supported 
with the ICU provider.
initdb: hint: Rerun initdb and either do not specify an encoding 
explicitly, or choose a matching combination.


$ createdb --encoding sql-ascii --icu-locale en-US hoge
createdb: error: database creation failed: ERROR:  ICU locale cannot be 
specified unless locale provider is ICU
$ createdb --encoding sql-ascii --icu-locale en-US --locale-provider icu 
hoge
createdb: error: database creation failed: ERROR:  encoding "SQL_ASCII" 
is not supported with ICU provider


2. For builds without ICU:

2.1.

$ initdb --locale-provider icu hoge
...
initdb: error: ICU locale must be specified

$ initdb --locale-provider icu --icu-locale en-US hoge
...
initdb: error: ICU is not supported in this build

$ createdb --locale-provider icu hoge
createdb: error: database creation failed: ERROR:  ICU locale must be 
specified


$ createdb --locale-provider icu --icu-locale en-US hoge
createdb: error: database creation failed: ERROR:  ICU is not supported 
in this build


IMO, it would be more user-friendly to inform an unsupported build in 
the first runs too..


2.2. (ok?)

$ initdb --icu-locale en-US hoge
initdb: error: --icu-locale cannot be specified unless locale provider 
"icu" is chosen

$ initdb --icu-locale en-US --locale-provider icu hoge
...
initdb: error: ICU is not supported in this build

$ createdb --icu-locale en-US hoge
createdb: error: database creation failed: ERROR:  ICU locale cannot be 
specified unless locale provider is ICU

$ createdb --icu-locale en-US --locale-provider icu hoge
createdb: error: database creation failed: ERROR:  ICU is not supported 
in this build


2.3.

$ createdb --locale-provider icu --icu-locale en-US --encoding sql-ascii 
hoge
createdb: error: database creation failed: ERROR:  encoding "SQL_ASCII" 
is not supported with ICU provider

$ createdb --locale-provider icu --icu-locale en-US --encoding utf8 hoge
createdb: error: database creation failed: ERROR:  ICU is not supported 
in this build


IMO, it would be more user-friendly to inform an unsupported build in 
the first run too..


3.

The locale provider is ICU, but it has not yet been set from the 
template database:



$ initdb --locale-provider icu --icu-locale en-US -D data &&
pg_ctl -D data -l logfile start &&
createdb --icu-locale ru-RU --template template0 mydb
...
createdb: error: database creation failed: ERROR:  ICU locale cannot be
specified unless locale provider is ICU


--
Marina Polyakova
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company




Re: ICU for global collation

2022-09-17 Thread Marina Polyakova

On 2022-09-16 11:11, Kyotaro Horiguchi wrote:

At Fri, 16 Sep 2022 09:49:28 +0300, Marina Polyakova
 wrote in

In continuation of options check: AFAICS the following checks in
initdb

if (locale_provider == COLLPROVIDER_ICU)
{
if (!icu_locale)
pg_fatal("ICU locale must be specified");

/*
 * In supported builds, the ICU locale ID will be checked by the
 * backend during post-bootstrap initialization.
 */
#ifndef USE_ICU
pg_fatal("ICU is not supported in this build");
#endif
}

are executed approximately when they are executed in create database
after getting all the necessary data from the template database:


initdb doesn't work that way, but anyway, I realized that I am
proposing to move that code in setlocales() to the caller function as
the result. I don't think setlocales() is the place for the code
because icu locale has no business with what the function does.  That
being said there's no obvious reason we *need* to move the code out to
its caller.


Excuse me, but could you explain your last sentence in more detail? I 
read that this code is not for setlocales and then - that it should not 
moved from here, so I'm confused...



+errmsg("encoding \"%s\" is not supported 
with ICU provider",

+   pg_log_error("encoding \"%s\" is not supported with ICU 
provider",
+pg_encoding_to_char(encodingid));

I might be wrong, but the messages look wrong to me.  The alternatives
below might work.

"encoding \"%s\" is not supported by ICU"
"encoding \"%s\" cannot be used for/with ICU locales"


The message indicates that the selected encoding cannot be used with the 
ICU provider because it does not support it. But if the text of the 
message becomes better and clearer, I will only be glad.



+   pg_log_error_hint("Rerun %s and choose a matching combination.",
+ progname);

This doesn't seem to provide users with useful information.


It was commited in more verbose form:

pg_log_error_hint("Rerun %s and either do not specify an encoding 
explicitly, "

  "or choose a matching combination.",

--
Marina Polyakova
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company




Re: ICU for global collation

2022-09-17 Thread Marina Polyakova

On 2022-09-16 10:57, Peter Eisentraut wrote:

On 16.09.22 09:31, Marina Polyakova wrote:
IMO it is hardly understantable from the program output either - it 
looks like I manually chose the encoding UTF8. Maybe first inform 
about selected encoding?..


Yes, I included something like that in the patch just committed.


diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index 
6aeec8d426c52414b827686781c245291f27ed1f..348bbbeba0f5bc7ff601912bf883510d580b814c 
100644

--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -2310,7 +2310,11 @@ setup_locale_encoding(void)
  }

  if (!encoding && locale_provider == COLLPROVIDER_ICU)
+    {
  encodingid = PG_UTF8;
+    printf(_("The default database encoding has been set to 
\"%s\" for a better experience with the ICU provider.\n"),

+   pg_encoding_to_char(encodingid));
+    }
  else if (!encoding)
  {
  int    ctype_enc;


Thank you!

--
Marina Polyakova
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company




Re: ICU for global collation

2022-09-16 Thread Peter Eisentraut

On 16.09.22 08:49, Marina Polyakova wrote:
But perhaps the check that --icu-locale cannot be specified unless 
locale provider icu is chosen should also be moved here? So all these 
checks will be in one place and it will use the provider from the 
template database (which could be icu):


$ initdb --locale-provider icu --icu-locale en-US -D data &&
pg_ctl -D data -l logfile start &&
createdb --icu-locale ru-RU --template template0 mydb
...
createdb: error: database creation failed: ERROR:  ICU locale cannot be 
specified unless locale provider is ICU


Can you be more specific about what you are proposing here?  I'm not 
following.






Re: ICU for global collation

2022-09-16 Thread Kyotaro Horiguchi
At Fri, 16 Sep 2022 09:56:31 +0200, Peter Eisentraut 
 wrote in 
> On 15.09.22 17:41, Marina Polyakova wrote:
> > I agree with you. Here's another version of the patch. The
> > locale/encoding checks and reports in initdb have been reordered,
> > because now the encoding is set first and only then the ICU locale is
> > checked.
> 
> I committed something based on the first version of your patch.  This
> reordering of the messages here was a little too much surgery for me
> at this point.  For instance, there are also messages in #ifdef WIN32
> code that would need to be reordered as well.  I kept the overall
> structure of the code the same and just inserted the additional
> proposed checks.

Yeah, as I sent just before, I reached the same conclusion.

> If you want to pursue the reordering of the checks and messages
> overall, a patch for the master branch could be considered.


regards.

-- 
Kyotaro Horiguchi
NTT Open Source Software Center




Re: ICU for global collation

2022-09-16 Thread Kyotaro Horiguchi
At Fri, 16 Sep 2022 09:49:28 +0300, Marina Polyakova 
 wrote in 
> On 2022-09-15 09:52, Kyotaro Horiguchi wrote:
> > However, when I reran the command, it complains about incompatible
> > encoding this time.  I think it's more user-friendly to check for the
> > encoding compatibility before the check for missing --icu-locale
> > option.
> > regards.
> 
> In continuation of options check: AFAICS the following checks in
> initdb
> 
>   if (locale_provider == COLLPROVIDER_ICU)
>   {
>   if (!icu_locale)
>   pg_fatal("ICU locale must be specified");
> 
>   /*
>* In supported builds, the ICU locale ID will be checked by the
>* backend during post-bootstrap initialization.
>*/
> #ifndef USE_ICU
>   pg_fatal("ICU is not supported in this build");
> #endif
>   }
> 
> are executed approximately when they are executed in create database
> after getting all the necessary data from the template database:

initdb doesn't work that way, but anyway, I realized that I am
proposing to move that code in setlocales() to the caller function as
the result. I don't think setlocales() is the place for the code
because icu locale has no business with what the function does.  That
being said there's no obvious reason we *need* to move the code out to
its caller.

> if (dblocprovider == COLLPROVIDER_ICU)
> {
>   /*
>* This would happen if template0 uses the libc provider but the new
>* database uses icu.
>*/
>   if (!dbiculocale)
>   ereport(ERROR,
>   (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
>errmsg("ICU locale must be specified")));
> }
> 
> if (dblocprovider == COLLPROVIDER_ICU)
>   check_icu_locale(dbiculocale);
> 
> But perhaps the check that --icu-locale cannot be specified unless
> locale provider icu is chosen should also be moved here? So all these
> checks will be in one place and it will use the provider from the
> template database (which could be icu):
> 
> $ initdb --locale-provider icu --icu-locale en-US -D data &&
> pg_ctl -D data -l logfile start &&
> createdb --icu-locale ru-RU --template template0 mydb
> ...
> createdb: error: database creation failed: ERROR: ICU locale cannot be
> specified unless locale provider is ICU

And, I realized that this causes bigger churn than I thought. So, I'm
sorry but I withdraw the comment.

Thus the first proposed patch will be more or less the direction we
would go. And the patch looks good to me as a whole.

+errmsg("encoding \"%s\" is not 
supported with ICU provider",

+   pg_log_error("encoding \"%s\" is not supported with ICU 
provider",
+pg_encoding_to_char(encodingid));

I might be wrong, but the messages look wrong to me.  The alternatives
below might work.

"encoding \"%s\" is not supported by ICU"
"encoding \"%s\" cannot be used for/with ICU locales"

+   pg_log_error_hint("Rerun %s and choose a matching combination.",
+ progname);

This doesn't seem to provide users with useful information.

regards.

-- 
Kyotaro Horiguchi
NTT Open Source Software Center




Re: ICU for global collation

2022-09-16 Thread Peter Eisentraut

On 16.09.22 09:31, Marina Polyakova wrote:
IMO it is hardly understantable from the program output either - it 
looks like I manually chose the encoding UTF8. Maybe first inform about 
selected encoding?..


Yes, I included something like that in the patch just committed.


diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index 
6aeec8d426c52414b827686781c245291f27ed1f..348bbbeba0f5bc7ff601912bf883510d580b814c 100644

--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -2310,7 +2310,11 @@ setup_locale_encoding(void)
  }

  if (!encoding && locale_provider == COLLPROVIDER_ICU)
+    {
  encodingid = PG_UTF8;
+    printf(_("The default database encoding has been set to \"%s\" 
for a better experience with the ICU provider.\n"),

+   pg_encoding_to_char(encodingid));
+    }
  else if (!encoding)
  {
  int    ctype_enc;






Re: ICU for global collation

2022-09-16 Thread Peter Eisentraut

On 15.09.22 17:41, Marina Polyakova wrote:
I agree with you. Here's another version of the patch. The 
locale/encoding checks and reports in initdb have been reordered, 
because now the encoding is set first and only then the ICU locale is 
checked.


I committed something based on the first version of your patch.  This 
reordering of the messages here was a little too much surgery for me at 
this point.  For instance, there are also messages in #ifdef WIN32 code 
that would need to be reordered as well.  I kept the overall structure 
of the code the same and just inserted the additional proposed checks.


If you want to pursue the reordering of the checks and messages overall, 
a patch for the master branch could be considered.





Re: ICU for global collation

2022-09-16 Thread Marina Polyakova

On 2022-09-16 07:55, Kyotaro Horiguchi wrote:

At Thu, 15 Sep 2022 18:41:31 +0300, Marina Polyakova
 wrote in

P.S. While working on the patch, I discovered that UTF8 encoding is
always used for the ICU provider in initdb unless it is explicitly
specified by the user:

if (!encoding && locale_provider == COLLPROVIDER_ICU)
encodingid = PG_UTF8;

IMO this creates additional errors for locales with other encodings:

$ initdb --locale de_DE.iso885915@euro --locale-provider icu
--icu-locale de-DE
...
initdb: error: encoding mismatch
initdb: detail: The encoding you selected (UTF8) and the encoding that
the selected locale uses (LATIN9) do not match. This would lead to
misbehavior in various character string processing functions.
initdb: hint: Rerun initdb and either do not specify an encoding
explicitly, or choose a matching combination.

And ICU supports many encodings, see the contents of pg_enc2icu_tbl in
encnames.c...


It seems to me the best default that fits almost all cases using icu
locales.

So, we need to specify encoding explicitly in that case.

$ initdb --encoding iso-8859-15 --locale de_DE.iso885915@euro
--locale-provider icu --icu-locale de-DE

However, I think it is hardly understantable from the documentation.

(I checked this using euc-jp [1] so it might be wrong..)

[1] initdb --encoding euc-jp --locale ja_JP.eucjp --locale-provider
icu --icu-locale ja-x-icu

regards.


Thank you!

IMO it is hardly understantable from the program output either - it 
looks like I manually chose the encoding UTF8. Maybe first inform about 
selected encoding?..


diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index 
6aeec8d426c52414b827686781c245291f27ed1f..348bbbeba0f5bc7ff601912bf883510d580b814c 
100644

--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -2310,7 +2310,11 @@ setup_locale_encoding(void)
}

if (!encoding && locale_provider == COLLPROVIDER_ICU)
+   {
encodingid = PG_UTF8;
+		printf(_("The default database encoding has been set to \"%s\" for a 
better experience with the ICU provider.\n"),

+  pg_encoding_to_char(encodingid));
+   }
else if (!encoding)
{
int ctype_enc;

ISTM that such choices (e.g. UTF8 for Windows in some cases) are 
described in the documentation [1] as


By default, initdb uses the locale provider libc, takes the locale 
settings from the environment, and determines the encoding from the 
locale settings. This is almost always sufficient, unless there are 
special requirements.


[1] https://www.postgresql.org/docs/devel/app-initdb.html

--
Marina Polyakova
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company




Re: ICU for global collation

2022-09-16 Thread Marina Polyakova

On 2022-09-15 09:52, Kyotaro Horiguchi wrote:

If I executed initdb as follows, I would be told to specify
--icu-locale option.


$ initdb --encoding sql-ascii --locale-provider icu hoge
...
initdb: error: ICU locale must be specified


However, when I reran the command, it complains about incompatible
encoding this time.  I think it's more user-friendly to check for the
encoding compatibility before the check for missing --icu-locale
option.

regards.


In continuation of options check: AFAICS the following checks in initdb

if (locale_provider == COLLPROVIDER_ICU)
{
if (!icu_locale)
pg_fatal("ICU locale must be specified");

/*
 * In supported builds, the ICU locale ID will be checked by the
 * backend during post-bootstrap initialization.
 */
#ifndef USE_ICU
pg_fatal("ICU is not supported in this build");
#endif
}

are executed approximately when they are executed in create database 
after getting all the necessary data from the template database:


if (dblocprovider == COLLPROVIDER_ICU)
{
/*
 * This would happen if template0 uses the libc provider but the new
 * database uses icu.
 */
if (!dbiculocale)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 errmsg("ICU locale must be specified")));
}

if (dblocprovider == COLLPROVIDER_ICU)
check_icu_locale(dbiculocale);

But perhaps the check that --icu-locale cannot be specified unless 
locale provider icu is chosen should also be moved here? So all these 
checks will be in one place and it will use the provider from the 
template database (which could be icu):


$ initdb --locale-provider icu --icu-locale en-US -D data &&
pg_ctl -D data -l logfile start &&
createdb --icu-locale ru-RU --template template0 mydb
...
createdb: error: database creation failed: ERROR:  ICU locale cannot be 
specified unless locale provider is ICU


--
Marina Polyakova
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company




Re: ICU for global collation

2022-09-15 Thread Kyotaro Horiguchi
At Thu, 15 Sep 2022 18:41:31 +0300, Marina Polyakova 
 wrote in 
> P.S. While working on the patch, I discovered that UTF8 encoding is
> always used for the ICU provider in initdb unless it is explicitly
> specified by the user:
> 
> if (!encoding && locale_provider == COLLPROVIDER_ICU)
>   encodingid = PG_UTF8;
> 
> IMO this creates additional errors for locales with other encodings:
> 
> $ initdb --locale de_DE.iso885915@euro --locale-provider icu
> --icu-locale de-DE
> ...
> initdb: error: encoding mismatch
> initdb: detail: The encoding you selected (UTF8) and the encoding that
> the selected locale uses (LATIN9) do not match. This would lead to
> misbehavior in various character string processing functions.
> initdb: hint: Rerun initdb and either do not specify an encoding
> explicitly, or choose a matching combination.
> 
> And ICU supports many encodings, see the contents of pg_enc2icu_tbl in
> encnames.c...

It seems to me the best default that fits almost all cases using icu
locales.

So, we need to specify encoding explicitly in that case.

$ initdb --encoding iso-8859-15 --locale de_DE.iso885915@euro --locale-provider 
icu --icu-locale de-DE

However, I think it is hardly understantable from the documentation.

(I checked this using euc-jp [1] so it might be wrong..)

[1] initdb --encoding euc-jp --locale ja_JP.eucjp --locale-provider icu 
--icu-locale ja-x-icu

regards.

-- 
Kyotaro Horiguchi
NTT Open Source Software Center




Re: ICU for global collation

2022-09-15 Thread Michael Paquier
On Wed, Sep 14, 2022 at 05:19:34PM +0300, Marina Polyakova wrote:
> I was surprised that it is allowed to create clusters/databases where the
> default ICU collations do not actually work due to unsupported encodings:
> 
> $ initdb --encoding SQL_ASCII --locale-provider icu --icu-locale en-US -D
> data &&
> pg_ctl -D data -l logfile start &&
> psql -c "SELECT 'a' < 'b'" template1
> ...
> waiting for server to start done
> server started
> ERROR:  encoding "SQL_ASCII" not supported by ICU
> 
> $ createdb --encoding SQL_ASCII --locale-provider icu --icu-locale en-US
> --template template0 mydb &&
> psql -c "SELECT 'a' < 'b'" mydb
> ERROR:  encoding "SQL_ASCII" not supported by ICU
> 
> The patch diff_check_icu_encoding.patch prohibits the creation of such
> objects...

Agreed that it is a bit confusing to get this type of error after the
database has been created when querying it due to a mix of unsupported
options.  Peter?
--
Michael


signature.asc
Description: PGP signature


Re: ICU for global collation

2022-09-15 Thread Marina Polyakova

On 2022-09-15 09:52, Kyotaro Horiguchi wrote:

If I executed initdb as follows, I would be told to specify
--icu-locale option.


$ initdb --encoding sql-ascii --locale-provider icu hoge
...
initdb: error: ICU locale must be specified


However, when I reran the command, it complains about incompatible
encoding this time.  I think it's more user-friendly to check for the
encoding compatibility before the check for missing --icu-locale
option.

regards.


I agree with you. Here's another version of the patch. The 
locale/encoding checks and reports in initdb have been reordered, 
because now the encoding is set first and only then the ICU locale is 
checked.


P.S. While working on the patch, I discovered that UTF8 encoding is 
always used for the ICU provider in initdb unless it is explicitly 
specified by the user:


if (!encoding && locale_provider == COLLPROVIDER_ICU)
encodingid = PG_UTF8;

IMO this creates additional errors for locales with other encodings:

$ initdb --locale de_DE.iso885915@euro --locale-provider icu 
--icu-locale de-DE

...
initdb: error: encoding mismatch
initdb: detail: The encoding you selected (UTF8) and the encoding that 
the selected locale uses (LATIN9) do not match. This would lead to 
misbehavior in various character string processing functions.
initdb: hint: Rerun initdb and either do not specify an encoding 
explicitly, or choose a matching combination.


And ICU supports many encodings, see the contents of pg_enc2icu_tbl in 
encnames.c...


--
Marina Polyakova
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Companydiff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index 6ff48bb18f3639ae45d9528b32df51a4aebc60c0..f248ad42b77c8c0cf2089963d4357b120914ce20 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -1034,6 +1034,12 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt)
 
 	if (dblocprovider == COLLPROVIDER_ICU)
 	{
+		if (!(is_encoding_supported_by_icu(encoding)))
+			ereport(ERROR,
+	(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+	 errmsg("encoding \"%s\" is not supported with ICU provider",
+			pg_encoding_to_char(encoding;
+
 		/*
 		 * This would happen if template0 uses the libc provider but the new
 		 * database uses icu.
@@ -1042,10 +1048,9 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt)
 			ereport(ERROR,
 	(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 	 errmsg("ICU locale must be specified")));
-	}
 
-	if (dblocprovider == COLLPROVIDER_ICU)
 		check_icu_locale(dbiculocale);
+	}
 
 	/*
 	 * Check that the new encoding and locale settings match the source
diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index 6aeec8d426c52414b827686781c245291f27ed1f..999e7c2bf8dc707063eddf19a5c27eed03d3ca09 100644
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -2092,20 +2092,30 @@ setlocales(void)
 	check_locale_name(LC_CTYPE, lc_messages, );
 	lc_messages = canonname;
 #endif
+}
 
-	if (locale_provider == COLLPROVIDER_ICU)
+static void
+check_icu_locale_encoding(void)
+{
+	if (!(is_encoding_supported_by_icu(encodingid)))
 	{
-		if (!icu_locale)
-			pg_fatal("ICU locale must be specified");
+		pg_log_error("encoding \"%s\" is not supported with ICU provider",
+	 pg_encoding_to_char(encodingid));
+		pg_log_error_hint("Rerun %s and choose a matching combination.",
+		  progname);
+		exit(1);
+	}
 
-		/*
-		 * In supported builds, the ICU locale ID will be checked by the
-		 * backend during post-bootstrap initialization.
-		 */
+	if (!icu_locale)
+		pg_fatal("ICU locale must be specified");
+
+	/*
+	 * In supported builds, the ICU locale ID will be checked by the backend
+	 * during post-bootstrap initialization.
+	 */
 #ifndef USE_ICU
-		pg_fatal("ICU is not supported in this build");
+	pg_fatal("ICU is not supported in this build");
 #endif
-	}
 }
 
 /*
@@ -2281,34 +2291,6 @@ setup_locale_encoding(void)
 {
 	setlocales();
 
-	if (locale_provider == COLLPROVIDER_LIBC &&
-		strcmp(lc_ctype, lc_collate) == 0 &&
-		strcmp(lc_ctype, lc_time) == 0 &&
-		strcmp(lc_ctype, lc_numeric) == 0 &&
-		strcmp(lc_ctype, lc_monetary) == 0 &&
-		strcmp(lc_ctype, lc_messages) == 0 &&
-		(!icu_locale || strcmp(lc_ctype, icu_locale) == 0))
-		printf(_("The database cluster will be initialized with locale \"%s\".\n"), lc_ctype);
-	else
-	{
-		printf(_("The database cluster will be initialized with this locale configuration:\n"));
-		printf(_("  provider:%s\n"), collprovider_name(locale_provider));
-		if (icu_locale)
-			printf(_("  ICU locale:  %s\n"), icu_locale);
-		printf(_("  LC_COLLATE:  %s\n"
- "  LC_CTYPE:%s\n"
- "  LC_MESSAGES: %s\n"
- "  LC_MONETARY: %s\n"
- "  LC_NUMERIC:  %s\n"
- "  LC_TIME: %s\n"),
-			   lc_collate,
-			   lc_ctype,
-			   lc_messages,
-			   lc_monetary,
-			   lc_numeric,
-			   lc_time);
-	}
-
 	if (!encoding && locale_provider == COLLPROVIDER_ICU)
 		encodingid = 

Re: ICU for global collation

2022-09-15 Thread Kyotaro Horiguchi
At Wed, 14 Sep 2022 17:19:34 +0300, Marina Polyakova 
 wrote in 
> Hello!
> 
> I was surprised that it is allowed to create clusters/databases where
> the default ICU collations do not actually work due to unsupported
> encodings:
> 
> $ initdb --encoding SQL_ASCII --locale-provider icu --icu-locale en-US
> -D data &&
> pg_ctl -D data -l logfile start &&
> psql -c "SELECT 'a' < 'b'" template1
> ...
> waiting for server to start done
> server started
> ERROR:  encoding "SQL_ASCII" not supported by ICU

Indeed. If I did the following, the direction of the patch looks fine
to me.

If I executed initdb as follows, I would be told to specify
--icu-locale option.

> $ initdb --encoding sql-ascii --locale-provider icu hoge
> ...
> initdb: error: ICU locale must be specified

However, when I reran the command, it complains about incompatible
encoding this time.  I think it's more user-friendly to check for the
encoding compatibility before the check for missing --icu-locale
option.

regards.

-- 
Kyotaro Horiguchi
NTT Open Source Software Center




Re: ICU for global collation

2022-09-14 Thread Marina Polyakova

Hello!

I was surprised that it is allowed to create clusters/databases where 
the default ICU collations do not actually work due to unsupported 
encodings:


$ initdb --encoding SQL_ASCII --locale-provider icu --icu-locale en-US 
-D data &&

pg_ctl -D data -l logfile start &&
psql -c "SELECT 'a' < 'b'" template1
...
waiting for server to start done
server started
ERROR:  encoding "SQL_ASCII" not supported by ICU

$ createdb --encoding SQL_ASCII --locale-provider icu --icu-locale en-US 
--template template0 mydb &&

psql -c "SELECT 'a' < 'b'" mydb
ERROR:  encoding "SQL_ASCII" not supported by ICU

The patch diff_check_icu_encoding.patch prohibits the creation of such 
objects...


--
Marina Polyakova
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Companydiff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index 6ff48bb18f3639ae45d9528b32df51a4aebc60c0..07758d15e8613d5a049537ddf2c5992e57ad6424 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -1042,11 +1042,16 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt)
 			ereport(ERROR,
 	(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 	 errmsg("ICU locale must be specified")));
-	}
 
-	if (dblocprovider == COLLPROVIDER_ICU)
 		check_icu_locale(dbiculocale);
 
+		if (!(is_encoding_supported_by_icu(encoding)))
+			ereport(ERROR,
+	(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+	 errmsg("encoding \"%s\" is not supported with ICU provider",
+			pg_encoding_to_char(encoding;
+	}
+
 	/*
 	 * Check that the new encoding and locale settings match the source
 	 * database.  We insist on this because we simply copy the source data ---
diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index e00837ecacf4885cf2a176168c283f3e67c6eb53..8a762ced8340c9d8256f7832a4c19a43f1d5538a 100644
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -2362,6 +2362,16 @@ setup_locale_encoding(void)
 	if (!check_locale_encoding(lc_ctype, encodingid) ||
 		!check_locale_encoding(lc_collate, encodingid))
 		exit(1);/* check_locale_encoding printed the error */
+
+	if (locale_provider == COLLPROVIDER_ICU &&
+		!(is_encoding_supported_by_icu(encodingid)))
+	{
+		pg_log_error("encoding \"%s\" is not supported with ICU provider",
+	 pg_encoding_to_char(encodingid));
+		pg_log_error_hint("Rerun %s and choose a matching combination.",
+		  progname);
+		exit(1);
+	}
 }
 
 
diff --git a/src/bin/initdb/t/001_initdb.pl b/src/bin/initdb/t/001_initdb.pl
index a37f6dd9b334b6ee22d9fdd4d51422795cb54a39..e4bb3d0cdd9c23729c5fb97886374f8df558f239 100644
--- a/src/bin/initdb/t/001_initdb.pl
+++ b/src/bin/initdb/t/001_initdb.pl
@@ -118,6 +118,15 @@ if ($ENV{with_icu} eq 'yes')
 		],
 		qr/FATAL:  could not open collator for locale/,
 		'fails for invalid ICU locale');
+
+	command_fails_like(
+		[
+			'initdb','--no-sync',
+			'--locale-provider=icu', '--icu-locale=en',
+			'--encoding=SQL_ASCII',  "$tempdir/dataX"
+		],
+		qr/error: encoding "SQL_ASCII" is not supported with ICU provider/,
+		'encoding "SQL_ASCII" is not supported with ICU provider');
 }
 else
 {
diff --git a/src/bin/scripts/t/020_createdb.pl b/src/bin/scripts/t/020_createdb.pl
index e91c1d013d08d8bd1e3a92f2aba958c5c7713ca6..eaab3caa32669ead068719d98bb953c5c6ff5a17 100644
--- a/src/bin/scripts/t/020_createdb.pl
+++ b/src/bin/scripts/t/020_createdb.pl
@@ -50,6 +50,16 @@ if ($ENV{with_icu} eq 'yes')
 		],
 		'fails for invalid ICU locale');
 
+	$node->command_fails_like(
+		[
+			'createdb','-T',
+			'template0',   '--locale-provider=icu',
+			'--icu-locale=en', '--encoding=SQL_ASCII',
+			'foobarX'
+		],
+		qr/ERROR:  encoding "SQL_ASCII" is not supported with ICU provider/,
+		'encoding "SQL_ASCII" is not supported with ICU provider');
+
 	# additional node, which uses the icu provider
 	my $node2 = PostgreSQL::Test::Cluster->new('icu');
 	$node2->init(extra => ['--locale-provider=icu', '--icu-locale=en']);


Re: ICU for global collation

2022-09-13 Thread Marina Polyakova

On 2022-09-13 15:41, Peter Eisentraut wrote:

On 13.09.22 07:34, Marina Polyakova wrote:
I agree with you that it is more comfortable and more similar to what 
has already been done in initdb. IMO it would be easier to do it like 
this:


diff --git a/src/bin/scripts/createdb.c b/src/bin/scripts/createdb.c
index 
e523e58b2189275dc603a06324a2f28b0f49d8b7..a1482df3d981a680dd3322052e7c03ddacc8dc26 
100644

--- a/src/bin/scripts/createdb.c
+++ b/src/bin/scripts/createdb.c
@@ -161,12 +161,10 @@ main(int argc, char *argv[])

  if (locale)
  {
-    if (lc_ctype)
-    pg_fatal("only one of --locale and --lc-ctype can be 
specified");

-    if (lc_collate)
-    pg_fatal("only one of --locale and --lc-collate can be 
specified");

-    lc_ctype = locale;
-    lc_collate = locale;
+    if (!lc_ctype)
+    lc_ctype = locale;
+    if (!lc_collate)
+    lc_collate = locale;
  }

  if (encoding)


done that way


Thank you!


BTW it's somewhat crummy that it uses a string comparison, so if you
write "UTF8" without a dash, it says this; it took me a few minutes 
to

see the difference...

postgres=# create database a LC_COLLATE "en_US.UTF8" LC_CTYPE
"en_US.UTF8" LOCALE "en_US.UTF8";
ERROR:  new collation (en_US.UTF8) is incompatible with the collation
of the template database (en_US.UTF-8)


Perhaps we could check the locale itself with the function 
normalize_libc_locale_name (collationcmds.c). But ISTM that the 
current check is a safety net in case the function 
pg_get_encoding_from_locale (chklocale.c) returns -1 or 
PG_SQL_ASCII...


This is not new behavior in PG15, is it?


No, it has always existed [1] AFAICS..

[1] 
https://github.com/postgres/postgres/commit/61d967498802ab86d8897cb3c61740d7e9d712f6


--
Marina Polyakova
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company




Re: ICU for global collation

2022-09-13 Thread Peter Eisentraut

On 13.09.22 07:34, Marina Polyakova wrote:
I agree with you that it is more comfortable and more similar to what 
has already been done in initdb. IMO it would be easier to do it like this:


diff --git a/src/bin/scripts/createdb.c b/src/bin/scripts/createdb.c
index 
e523e58b2189275dc603a06324a2f28b0f49d8b7..a1482df3d981a680dd3322052e7c03ddacc8dc26 100644

--- a/src/bin/scripts/createdb.c
+++ b/src/bin/scripts/createdb.c
@@ -161,12 +161,10 @@ main(int argc, char *argv[])

  if (locale)
  {
-    if (lc_ctype)
-    pg_fatal("only one of --locale and --lc-ctype can be 
specified");

-    if (lc_collate)
-    pg_fatal("only one of --locale and --lc-collate can be 
specified");

-    lc_ctype = locale;
-    lc_collate = locale;
+    if (!lc_ctype)
+    lc_ctype = locale;
+    if (!lc_collate)
+    lc_collate = locale;
  }

  if (encoding)


done that way

Should we change the behaviour of createdb and CREATE DATABASE in 
previous major versions?..


I don't see a need for that.


BTW it's somewhat crummy that it uses a string comparison, so if you
write "UTF8" without a dash, it says this; it took me a few minutes to
see the difference...

postgres=# create database a LC_COLLATE "en_US.UTF8" LC_CTYPE
"en_US.UTF8" LOCALE "en_US.UTF8";
ERROR:  new collation (en_US.UTF8) is incompatible with the collation
of the template database (en_US.UTF-8)


Perhaps we could check the locale itself with the function 
normalize_libc_locale_name (collationcmds.c). But ISTM that the current 
check is a safety net in case the function pg_get_encoding_from_locale 
(chklocale.c) returns -1 or PG_SQL_ASCII...


This is not new behavior in PG15, is it?





Re: ICU for global collation

2022-09-12 Thread Marina Polyakova

On 2022-09-09 19:46, Justin Pryzby wrote:

In pg14:
|postgres=# create database a LC_COLLATE C LC_CTYPE C LOCALE C;
|ERROR:  conflicting or redundant options
|DETAIL:  LOCALE cannot be specified together with LC_COLLATE or 
LC_CTYPE.


In pg15:
|postgres=# create database a LC_COLLATE "en_US.UTF-8" LC_CTYPE
"en_US.UTF-8" LOCALE "en_US.UTF-8" ;
|CREATE DATABASE

f2553d430 actually relaxed the restriction by removing this check:

-   if (dlocale && (dcollate || dctype))
-   ereport(ERROR,
-   (errcode(ERRCODE_SYNTAX_ERROR),
-errmsg("conflicting or redundant 
options"),

-errdetail("LOCALE cannot be specified
together with LC_COLLATE or LC_CTYPE.")));

But isn't the right fix to do the corresponding thing in createdb
(relaxing the frontend restriction rather than reverting its relaxation
in the backend).

diff --git a/src/bin/scripts/createdb.c b/src/bin/scripts/createdb.c
index e523e58b218..5b80e56dfd9 100644
--- a/src/bin/scripts/createdb.c
+++ b/src/bin/scripts/createdb.c
@@ -159,15 +159,10 @@ main(int argc, char *argv[])
exit(1);
}

-   if (locale)
-   {
-   if (lc_ctype)
-   pg_fatal("only one of --locale and --lc-ctype can be 
specified");
-   if (lc_collate)
-   pg_fatal("only one of --locale and --lc-collate can be 
specified");
+   if (locale && !lc_ctype)
lc_ctype = locale;
+   if (locale && !lc_collate)
lc_collate = locale;
-   }

if (encoding)
{


I agree with you that it is more comfortable and more similar to what 
has already been done in initdb. IMO it would be easier to do it like 
this:


diff --git a/src/bin/scripts/createdb.c b/src/bin/scripts/createdb.c
index 
e523e58b2189275dc603a06324a2f28b0f49d8b7..a1482df3d981a680dd3322052e7c03ddacc8dc26 
100644

--- a/src/bin/scripts/createdb.c
+++ b/src/bin/scripts/createdb.c
@@ -161,12 +161,10 @@ main(int argc, char *argv[])

if (locale)
{
-   if (lc_ctype)
-   pg_fatal("only one of --locale and --lc-ctype can be 
specified");
-   if (lc_collate)
-   pg_fatal("only one of --locale and --lc-collate can be 
specified");
-   lc_ctype = locale;
-   lc_collate = locale;
+   if (!lc_ctype)
+   lc_ctype = locale;
+   if (!lc_collate)
+   lc_collate = locale;
}

if (encoding)

Should we change the behaviour of createdb and CREATE DATABASE in 
previous major versions?..



BTW it's somewhat crummy that it uses a string comparison, so if you
write "UTF8" without a dash, it says this; it took me a few minutes to
see the difference...

postgres=# create database a LC_COLLATE "en_US.UTF8" LC_CTYPE
"en_US.UTF8" LOCALE "en_US.UTF8";
ERROR:  new collation (en_US.UTF8) is incompatible with the collation
of the template database (en_US.UTF-8)


Perhaps we could check the locale itself with the function 
normalize_libc_locale_name (collationcmds.c). But ISTM that the current 
check is a safety net in case the function pg_get_encoding_from_locale 
(chklocale.c) returns -1 or PG_SQL_ASCII...


--
Marina Polyakova
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company




Re: ICU for global collation

2022-09-09 Thread Justin Pryzby
In pg14:
|postgres=# create database a LC_COLLATE C LC_CTYPE C LOCALE C;
|ERROR:  conflicting or redundant options
|DETAIL:  LOCALE cannot be specified together with LC_COLLATE or LC_CTYPE.

In pg15:
|postgres=# create database a LC_COLLATE "en_US.UTF-8" LC_CTYPE "en_US.UTF-8" 
LOCALE "en_US.UTF-8" ;
|CREATE DATABASE

f2553d430 actually relaxed the restriction by removing this check:

-   if (dlocale && (dcollate || dctype))
-   ereport(ERROR,
-   (errcode(ERRCODE_SYNTAX_ERROR),
-errmsg("conflicting or redundant options"),
-errdetail("LOCALE cannot be specified together 
with LC_COLLATE or LC_CTYPE.")));

But isn't the right fix to do the corresponding thing in createdb
(relaxing the frontend restriction rather than reverting its relaxation
in the backend).

diff --git a/src/bin/scripts/createdb.c b/src/bin/scripts/createdb.c
index e523e58b218..5b80e56dfd9 100644
--- a/src/bin/scripts/createdb.c
+++ b/src/bin/scripts/createdb.c
@@ -159,15 +159,10 @@ main(int argc, char *argv[])
exit(1);
}
 
-   if (locale)
-   {
-   if (lc_ctype)
-   pg_fatal("only one of --locale and --lc-ctype can be 
specified");
-   if (lc_collate)
-   pg_fatal("only one of --locale and --lc-collate can be 
specified");
+   if (locale && !lc_ctype)
lc_ctype = locale;
+   if (locale && !lc_collate)
lc_collate = locale;
-   }
 
if (encoding)
{


BTW it's somewhat crummy that it uses a string comparison, so if you
write "UTF8" without a dash, it says this; it took me a few minutes to
see the difference...

postgres=# create database a LC_COLLATE "en_US.UTF8" LC_CTYPE "en_US.UTF8" 
LOCALE "en_US.UTF8";
ERROR:  new collation (en_US.UTF8) is incompatible with the collation of the 
template database (en_US.UTF-8)




Re: ICU for global collation

2022-09-05 Thread Marina Polyakova

Hello!

IMO after adding ICU for global collations [1] the behaviour of createdb 
and CREATE DATABASE is a bit inconsistent when both locale and 
lc_collate (or locale and lc_ctype) options are used:


$ createdb mydb --locale C --lc-collate C --template template0
createdb: error: only one of --locale and --lc-collate can be specified
$ psql -c "create database mydb locale = 'C' lc_collate = 'C' template = 
'template0'" postgres

CREATE DATABASE

From the CREATE DATABASE documentation [2]:

locale
This is a shortcut for setting LC_COLLATE and LC_CTYPE at once. If you 
specify this, you cannot specify either of those parameters.


The patch diff_return_back_create_database_error.patch returns back the 
removed code for CREATE DATABASE so it behaves like createdb as 
before...


[1] 
https://github.com/postgres/postgres/commit/f2553d43060edb210b36c63187d52a632448e1d2

[2] https://www.postgresql.org/docs/devel/sql-createdatabase.html

--
Marina Polyakova
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Companydiff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index 6ff48bb18f3639ae45d9528b32df51a4aebc60c0..0a22cace11d9df6b5fc085bfd7b86319f4b13165 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -851,6 +851,12 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt)
 	 parser_errposition(pstate, defel->location)));
 	}
 
+	if (dlocale && (dcollate || dctype))
+		ereport(ERROR,
+(errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("conflicting or redundant options"),
+ errdetail("LOCALE cannot be specified together with LC_COLLATE or LC_CTYPE.")));
+
 	if (downer && downer->arg)
 		dbowner = defGetString(downer);
 	if (dtemplate && dtemplate->arg)


Re: ICU for global collation

2022-08-30 Thread Michael Paquier
On Wed, Aug 24, 2022 at 08:28:24PM +0200, Peter Eisentraut wrote:
> Committed, thanks.
> 
> (This should conclude all the issues discussed in this thread recently.)

Please note that this open item was still listed as open.  I have
closed it now.
--
Michael


signature.asc
Description: PGP signature


Re: ICU for global collation

2022-08-24 Thread Peter Eisentraut

On 24.08.22 10:59, Julien Rouhaud wrote:

I have not tested the patch in details but
this looks rather sane to me on a quick read.  Peter?

Patch looks good to me too.


Committed, thanks.

(This should conclude all the issues discussed in this thread recently.)




Re: ICU for global collation

2022-08-24 Thread Julien Rouhaud
On Wed, Aug 24, 2022 at 01:38:44PM +0900, Michael Paquier wrote:
> On Tue, Aug 23, 2022 at 08:59:02PM +0300, Marina Polyakova wrote:
> > My colleague Andrew Bille found another bug in master
> > (b4e936859dc441102eb0b6fb7a104f3948c90490) and REL_15_STABLE
> > (2c63b0930aee1bb5c265fad4a65c9d0b62b1f9da): pg_collation.colliculocale is
> > not dumped. See check_icu_locale.sh:
> > 
> > In the old cluster:
> > SELECT collname, colliculocale FROM pg_collation WHERE collname =
> > 'testcoll_backwards'
> >   collname  |   colliculocale
> > +---
> >  testcoll_backwards | @colBackwards=yes
> > (1 row)
> > 
> > In the new cluster:
> > SELECT collname, colliculocale FROM pg_collation WHERE collname =
> > 'testcoll_backwards'
> >   collname  | colliculocale
> > +---
> >  testcoll_backwards |
> > (1 row)
> > 
> > diff_dump_colliculocale.patch works for me.
> 
> Ugh.  Good catch, again!

+1

> I have not tested the patch in details but
> this looks rather sane to me on a quick read.  Peter?

Patch looks good to me too.




Re: ICU for global collation

2022-08-23 Thread Michael Paquier
On Tue, Aug 23, 2022 at 08:59:02PM +0300, Marina Polyakova wrote:
> My colleague Andrew Bille found another bug in master
> (b4e936859dc441102eb0b6fb7a104f3948c90490) and REL_15_STABLE
> (2c63b0930aee1bb5c265fad4a65c9d0b62b1f9da): pg_collation.colliculocale is
> not dumped. See check_icu_locale.sh:
> 
> In the old cluster:
> SELECT collname, colliculocale FROM pg_collation WHERE collname =
> 'testcoll_backwards'
>   collname  |   colliculocale
> +---
>  testcoll_backwards | @colBackwards=yes
> (1 row)
> 
> In the new cluster:
> SELECT collname, colliculocale FROM pg_collation WHERE collname =
> 'testcoll_backwards'
>   collname  | colliculocale
> +---
>  testcoll_backwards |
> (1 row)
> 
> diff_dump_colliculocale.patch works for me.

Ugh.  Good catch, again!  I have not tested the patch in details but
this looks rather sane to me on a quick read.  Peter?
--
Michael


signature.asc
Description: PGP signature


Re: ICU for global collation

2022-08-23 Thread Marina Polyakova
My colleague Andrew Bille found another bug in master 
(b4e936859dc441102eb0b6fb7a104f3948c90490) and REL_15_STABLE 
(2c63b0930aee1bb5c265fad4a65c9d0b62b1f9da): pg_collation.colliculocale 
is not dumped. See check_icu_locale.sh:


In the old cluster:
SELECT collname, colliculocale FROM pg_collation WHERE collname = 
'testcoll_backwards'

  collname  |   colliculocale
+---
 testcoll_backwards | @colBackwards=yes
(1 row)

In the new cluster:
SELECT collname, colliculocale FROM pg_collation WHERE collname = 
'testcoll_backwards'

  collname  | colliculocale
+---
 testcoll_backwards |
(1 row)

diff_dump_colliculocale.patch works for me.

--
Marina Polyakova
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Companyinitdb -D data_old &&
pg_ctl -D data_old -l logfile_old start &&
psql -ac "CREATE COLLATION testcoll_backwards (provider = icu, locale = 
'@colBackwards=yes')" postgres &&
echo "In the old cluster:" &&
psql -ac "SELECT collname, colliculocale FROM pg_collation WHERE collname = 
'testcoll_backwards'" postgres &&
pg_dump postgres > dump_postgres.sql &&
pg_ctl -D data_old stop &&
initdb -D data_new &&
pg_ctl -D data_new -l logfile_new start &&
psql -v ON_ERROR_STOP=1 -f dump_postgres.sql postgres &&
echo "In the new cluster:" &&
psql -ac "SELECT collname, colliculocale FROM pg_collation WHERE collname = 
'testcoll_backwards'" postgres &&
pg_ctl -D data_new stop
diff --git a/src/bin/pg_dump/Makefile b/src/bin/pg_dump/Makefile
index 2f524b09bf53a55037013d78148f8cbca4fa7eee..9dc5a784dd2d1ce58a6284f19c54730364779c4d 100644
--- a/src/bin/pg_dump/Makefile
+++ b/src/bin/pg_dump/Makefile
@@ -17,6 +17,7 @@ top_builddir = ../../..
 include $(top_builddir)/src/Makefile.global
 
 export GZIP_PROGRAM=$(GZIP)
+export with_icu
 
 override CPPFLAGS := -I$(libpq_srcdir) $(CPPFLAGS)
 LDFLAGS_INTERNAL += -L$(top_builddir)/src/fe_utils -lpgfeutils $(libpq_pgport)
diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c
index 2c6891573296b395c5bd6b627d061de657355e9c..e8e8f69e30c8a787e5dff157f5d1619917638d7d 100644
--- a/src/bin/pg_dump/pg_dump.c
+++ b/src/bin/pg_dump/pg_dump.c
@@ -13084,9 +13084,11 @@ dumpCollation(Archive *fout, const CollInfo *collinfo)
 	int			i_collisdeterministic;
 	int			i_collcollate;
 	int			i_collctype;
+	int			i_colliculocale;
 	const char *collprovider;
 	const char *collcollate;
 	const char *collctype;
+	const char *colliculocale;
 
 	/* Do nothing in data-only dump */
 	if (dopt->dataOnly)
@@ -13117,6 +13119,13 @@ dumpCollation(Archive *fout, const CollInfo *collinfo)
 		appendPQExpBufferStr(query,
 			 "true AS collisdeterministic, ");
 
+	if (fout->remoteVersion >= 15)
+		appendPQExpBufferStr(query,
+			 "colliculocale, ");
+	else
+		appendPQExpBufferStr(query,
+			 "NULL AS colliculocale, ");
+
 	appendPQExpBuffer(query,
 	  "collcollate, "
 	  "collctype "
@@ -13130,10 +13139,24 @@ dumpCollation(Archive *fout, const CollInfo *collinfo)
 	i_collisdeterministic = PQfnumber(res, "collisdeterministic");
 	i_collcollate = PQfnumber(res, "collcollate");
 	i_collctype = PQfnumber(res, "collctype");
+	i_colliculocale = PQfnumber(res, "colliculocale");
 
 	collprovider = PQgetvalue(res, 0, i_collprovider);
-	collcollate = PQgetvalue(res, 0, i_collcollate);
-	collctype = PQgetvalue(res, 0, i_collctype);
+
+	if (!PQgetisnull(res, 0, i_collcollate))
+		collcollate = PQgetvalue(res, 0, i_collcollate);
+	else
+		collcollate = NULL;
+
+	if (!PQgetisnull(res, 0, i_collctype))
+		collctype = PQgetvalue(res, 0, i_collctype);
+	else
+		collctype = NULL;
+
+	if (!PQgetisnull(res, 0, i_colliculocale))
+		colliculocale = PQgetvalue(res, 0, i_colliculocale);
+	else
+		colliculocale = NULL;
 
 	appendPQExpBuffer(delq, "DROP COLLATION %s;\n",
 	  fmtQualifiedDumpable(collinfo));
@@ -13156,17 +13179,28 @@ dumpCollation(Archive *fout, const CollInfo *collinfo)
 	if (strcmp(PQgetvalue(res, 0, i_collisdeterministic), "f") == 0)
 		appendPQExpBufferStr(q, ", deterministic = false");
 
-	if (strcmp(collcollate, collctype) == 0)
+	if (colliculocale != NULL)
 	{
 		appendPQExpBufferStr(q, ", locale = ");
-		appendStringLiteralAH(q, collcollate, fout);
+		appendStringLiteralAH(q, colliculocale, fout);
 	}
 	else
 	{
-		appendPQExpBufferStr(q, ", lc_collate = ");
-		appendStringLiteralAH(q, collcollate, fout);
-		appendPQExpBufferStr(q, ", lc_ctype = ");
-		appendStringLiteralAH(q, collctype, fout);
+		Assert(collcollate != NULL);
+		Assert(collctype != NULL);
+
+		if (strcmp(collcollate, collctype) == 0)
+		{
+			appendPQExpBufferStr(q, ", locale = ");
+			appendStringLiteralAH(q, collcollate, fout);
+		}
+		else
+		{
+			appendPQExpBufferStr(q, ", lc_collate = ");
+			appendStringLiteralAH(q, collcollate, fout);
+			appendPQExpBufferStr(q, ", lc_ctype = ");
+			appendStringLiteralAH(q, collctype, fout);
+		}
 	}
 
 	/*
diff --git a/src/bin/pg_dump/t/002_pg_dump.pl 

Re: ICU for global collation

2022-08-22 Thread Michael Paquier
On Mon, Aug 22, 2022 at 04:10:59PM +0200, Peter Eisentraut wrote:
> I think this piece of code was left over from some earlier attempts to
> specify the libc locale and the icu locale with one option, which never
> really worked well.  The CREATE DATABASE man page does not mention that
> LOCALE provides the default for ICU_LOCALE.  Hence, I think we should delete
> this.

As of 36f729e, is there anything left to address on this thread or
should this open item be closed?
--
Michael


signature.asc
Description: PGP signature


Re: ICU for global collation

2022-08-22 Thread Marina Polyakova

On 2022-08-22 17:10, Peter Eisentraut wrote:

On 15.08.22 14:06, Marina Polyakova wrote:
1.1) It looks like there's a bug in the function get_db_infos 
(src/bin/pg_upgrade/info.c), where the version of the old cluster is 
always checked:


if (GET_MAJOR_VERSION(old_cluster.major_version) < 1500)
 snprintf(query + strlen(query), sizeof(query) - strlen(query),
  "'c' AS datlocprovider, NULL AS daticulocale, ");
else
 snprintf(query + strlen(query), sizeof(query) - strlen(query),
  "datlocprovider, daticulocale, ");

With the simple patch

diff --git a/src/bin/pg_upgrade/info.c b/src/bin/pg_upgrade/info.c
index 
df374ce4b362b4c6c87fc1fd0e476e5d6d353d9e..53ea348e211d3ac38334292bc16cb814bc13bb87 
100644

--- a/src/bin/pg_upgrade/info.c
+++ b/src/bin/pg_upgrade/info.c
@@ -319,7 +319,7 @@ get_db_infos(ClusterInfo *cluster)

  snprintf(query, sizeof(query),
   "SELECT d.oid, d.datname, d.encoding, d.datcollate, 
d.datctype, ");

-    if (GET_MAJOR_VERSION(old_cluster.major_version) < 1500)
+    if (GET_MAJOR_VERSION(cluster->major_version) < 1500)
  snprintf(query + strlen(query), sizeof(query) - 
strlen(query),

   "'c' AS datlocprovider, NULL AS daticulocale, ");
  else


fixed

1.2) It looks like the mentioned asserion in dbcommands.c conflicts 
with the following lines earlier:


if (dbiculocale == NULL)
 dbiculocale = src_iculocale;


fixed

I'm wondering if this is not a fully-supported feature (because 
createdb creates an SQL command with LC_COLLATE and LC_CTYPE options 
instead of LOCALE option) or is it a bug in CREATE DATABASE?.. From 
src/backend/commands/dbcommands.c:


if (dblocprovider == COLLPROVIDER_ICU && !dbiculocale)
{
 if (dlocale && dlocale->arg)
     dbiculocale = defGetString(dlocale);
}


I think this piece of code was left over from some earlier attempts to
specify the libc locale and the icu locale with one option, which
never really worked well.  The CREATE DATABASE man page does not
mention that LOCALE provides the default for ICU_LOCALE.  Hence, I
think we should delete this.


Thank you!

--
Marina Polyakova
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company




Re: ICU for global collation

2022-08-22 Thread Peter Eisentraut

On 15.08.22 14:06, Marina Polyakova wrote:
1.1) It looks like there's a bug in the function get_db_infos 
(src/bin/pg_upgrade/info.c), where the version of the old cluster is 
always checked:


if (GET_MAJOR_VERSION(old_cluster.major_version) < 1500)
 snprintf(query + strlen(query), sizeof(query) - strlen(query),
  "'c' AS datlocprovider, NULL AS daticulocale, ");
else
 snprintf(query + strlen(query), sizeof(query) - strlen(query),
  "datlocprovider, daticulocale, ");

With the simple patch

diff --git a/src/bin/pg_upgrade/info.c b/src/bin/pg_upgrade/info.c
index 
df374ce4b362b4c6c87fc1fd0e476e5d6d353d9e..53ea348e211d3ac38334292bc16cb814bc13bb87 
100644

--- a/src/bin/pg_upgrade/info.c
+++ b/src/bin/pg_upgrade/info.c
@@ -319,7 +319,7 @@ get_db_infos(ClusterInfo *cluster)

  snprintf(query, sizeof(query),
   "SELECT d.oid, d.datname, d.encoding, d.datcollate, 
d.datctype, ");

-    if (GET_MAJOR_VERSION(old_cluster.major_version) < 1500)
+    if (GET_MAJOR_VERSION(cluster->major_version) < 1500)
  snprintf(query + strlen(query), sizeof(query) - strlen(query),
   "'c' AS datlocprovider, NULL AS daticulocale, ");
  else


fixed

1.2) It looks like the mentioned asserion in dbcommands.c conflicts with 
the following lines earlier:


if (dbiculocale == NULL)
 dbiculocale = src_iculocale;


fixed

I'm wondering if this is not a fully-supported feature (because createdb 
creates an SQL command with LC_COLLATE and LC_CTYPE options instead of 
LOCALE option) or is it a bug in CREATE DATABASE?.. From 
src/backend/commands/dbcommands.c:


if (dblocprovider == COLLPROVIDER_ICU && !dbiculocale)
{
 if (dlocale && dlocale->arg)
     dbiculocale = defGetString(dlocale);
}


I think this piece of code was left over from some earlier attempts to 
specify the libc locale and the icu locale with one option, which never 
really worked well.  The CREATE DATABASE man page does not mention that 
LOCALE provides the default for ICU_LOCALE.  Hence, I think we should 
delete this.





Re: ICU for global collation

2022-08-20 Thread Marina Polyakova

On 2022-08-17 19:53, Julien Rouhaud wrote:

Good catch.  There's unfortunately not a lot of regression tests for
ICU-initialized clusters.  I'm wondering if the build-farm client could 
be
taught about the locale provider rather than assuming libc.  Clearly 
that
wouldn't have caught this issue, but it should still increase the 
coverage a

bit (I'm thinking of the recent problem with the abbreviated keys).


Looking at installchecks with different locales (e.g. see [1] with 
sv_SE.UTF-8) - why not?..



diff --git a/src/backend/commands/dbcommands.c
b/src/backend/commands/dbcommands.c
index 
b31a30550b025d48ba3cc250dc4c15f41f9a80be..17a2942341e528c01182fb6d4878580f2706bec9

100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -1048,6 +1048,8 @@ createdb(ParseState *pstate, const CreatedbStmt 
*stmt)


(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 errmsg("ICU locale must be 
specified")));
}
+   else
+   dbiculocale = NULL;

if (dblocprovider == COLLPROVIDER_ICU)
check_icu_locale(dbiculocale);


I think it would be better to do that in the various variable 
initialization.

Maybe switch the dblocprovider and dbiculocale initialization, and only
initialize dbiculocale if dblocprovider is COLLPROVIDER_ICU.


diff --git a/src/backend/commands/dbcommands.c 
b/src/backend/commands/dbcommands.c
index 
b31a30550b025d48ba3cc250dc4c15f41f9a80be..883f381f3453142790f728a3725586cebe2e2f20 
100644

--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -1012,10 +1012,10 @@ createdb(ParseState *pstate, const CreatedbStmt 
*stmt)

dbcollate = src_collate;
if (dbctype == NULL)
dbctype = src_ctype;
-   if (dbiculocale == NULL)
-   dbiculocale = src_iculocale;
if (dblocprovider == '\0')
dblocprovider = src_locprovider;
+   if (dbiculocale == NULL && dblocprovider == COLLPROVIDER_ICU)
+   dbiculocale = src_iculocale;

/* Some encodings are client only */
if (!PG_VALID_BE_ENCODING(encoding))

Then it seemed to me that it was easier to first get all the parameters 
from the template database as usual and then process them as needed. But 
with your suggestion the failed assertion will check the code above more 
accurately...


This discrepancy between createdb and CREATE DATABASE looks like an 
oversight,

as createdb indeed interprets --locale as:

if (locale)
{
if (lc_ctype)
pg_fatal("only one of --locale and --lc-ctype can be 
specified");
if (lc_collate)
pg_fatal("only one of --locale and --lc-collate can be 
specified");
lc_ctype = locale;
lc_collate = locale;
}

AFAIK the fallback in the CREATE DATABASE case is expected as POSIX 
locale

names should be accepted by icu, so this should work for createdb too.


Oh, great, thanks!


> > I spent some time looking at the ICU api trying to figure out if using a
> > posix locale name (e.g. en_US) was actually compatible with an ICU locale 
name.
> > It seems that ICU accept any of 'en-us', 'en-US', 'en_us' or 'en_US' as the
> > same locale, but I might be wrong.  I also didn't find a way to figure out 
how
> > to ask ICU if the locale identifier passed is complete garbage or not.  One
> > sure thing is that the system collation we import are of the form 'en-us', 
so
> > it seems weird to have this form in pg_collation and by default another 
form in
> > pg_database.
>
> Yeah it seems to be inconsistent about that.  The locale ID documentation
> appears to indicate that "en_US" is the canonical form, but when you ask it
> to list all the locales it knows about it returns "en-US".

Yeah I saw that too when checking is POSIX locale names were valid, and 
that's

not great.


I'm sorry but IIUC pg_import_system_collations uses uloc_getAvailable to 
get the locale ID and then specifically calls uloc_toLanguageTag?..


I don't think that initdb --collation-provider icu should be allowed 
without

--icu-locale, same for --collation-provider libc *with* --icu-locale.



> initdb has some specific processing to transform the default libc locale to
> something more appropriate, but as far as I can see creatdb / CREATE DATABASE
> aren't doing that.  It seems inconsistent, and IMHO another reason why
> defaulting to the libc locale looks like a bad idea.

This has all been removed.  The separate ICU locale option should now 
be

required everywhere (initdb, createdb, CREATE DATABASE).


If it's a feature and not a bug in CREATE DATABASE, why should not it 
work in initdb too? Here we define locale/lc_collate/lc_ctype for the 
first 3 databases in the cluster in much the same way...


P.S. FYI there seems to be a bug for very old ICU versions: in master 

Re: ICU for global collation

2022-08-17 Thread Julien Rouhaud
Hi,

On Mon, Aug 15, 2022 at 03:06:32PM +0300, Marina Polyakova wrote:
>
> 1.1) It looks like there's a bug in the function get_db_infos
> (src/bin/pg_upgrade/info.c), where the version of the old cluster is always
> checked:
>
> if (GET_MAJOR_VERSION(old_cluster.major_version) < 1500)
>   snprintf(query + strlen(query), sizeof(query) - strlen(query),
>"'c' AS datlocprovider, NULL AS daticulocale, ");
> else
>   snprintf(query + strlen(query), sizeof(query) - strlen(query),
>"datlocprovider, daticulocale, ");
>
> With the simple patch
>
> diff --git a/src/bin/pg_upgrade/info.c b/src/bin/pg_upgrade/info.c
> index 
> df374ce4b362b4c6c87fc1fd0e476e5d6d353d9e..53ea348e211d3ac38334292bc16cb814bc13bb87
> 100644
> --- a/src/bin/pg_upgrade/info.c
> +++ b/src/bin/pg_upgrade/info.c
> @@ -319,7 +319,7 @@ get_db_infos(ClusterInfo *cluster)
>
>   snprintf(query, sizeof(query),
>"SELECT d.oid, d.datname, d.encoding, d.datcollate, 
> d.datctype, ");
> - if (GET_MAJOR_VERSION(old_cluster.major_version) < 1500)
> + if (GET_MAJOR_VERSION(cluster->major_version) < 1500)
>   snprintf(query + strlen(query), sizeof(query) - strlen(query),
>"'c' AS datlocprovider, NULL AS daticulocale, 
> ");
>   else
>
> I got the expected error during the upgrade:
>
> locale providers for database "template1" do not match:  old "libc", new
> "icu"
> Failure, exiting

Good catch.  There's unfortunately not a lot of regression tests for
ICU-initialized clusters.  I'm wondering if the build-farm client could be
taught about the locale provider rather than assuming libc.  Clearly that
wouldn't have caught this issue, but it should still increase the coverage a
bit (I'm thinking of the recent problem with the abbreviated keys).

> 1.2) It looks like the mentioned asserion in dbcommands.c conflicts with the
> following lines earlier:
>
> if (dbiculocale == NULL)
>   dbiculocale = src_iculocale;
>
> The following patch works for me:
>
> diff --git a/src/backend/commands/dbcommands.c
> b/src/backend/commands/dbcommands.c
> index 
> b31a30550b025d48ba3cc250dc4c15f41f9a80be..17a2942341e528c01182fb6d4878580f2706bec9
> 100644
> --- a/src/backend/commands/dbcommands.c
> +++ b/src/backend/commands/dbcommands.c
> @@ -1048,6 +1048,8 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt)
>   
> (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
>errmsg("ICU locale must be 
> specified")));
>   }
> + else
> + dbiculocale = NULL;
>
>   if (dblocprovider == COLLPROVIDER_ICU)
>   check_icu_locale(dbiculocale);

I think it would be better to do that in the various variable initialization.
Maybe switch the dblocprovider and dbiculocale initialization, and only
initialize dbiculocale if dblocprovider is COLLPROVIDER_ICU.

> 2) CREATE DATABASE does not always require the icu locale unlike initdb and
> createdb:
>
> $ createdb mydb --locale en_US.UTF-8 --template template0 --locale-provider
> icu
> createdb: error: database creation failed: ERROR:  ICU locale must be
> specified
>
> $ psql -c "CREATE DATABASE mydb LOCALE \"en_US.UTF-8\" TEMPLATE template0
> LOCALE_PROVIDER icu" postgres
> CREATE DATABASE
>
> I'm wondering if this is not a fully-supported feature (because createdb
> creates an SQL command with LC_COLLATE and LC_CTYPE options instead of
> LOCALE option) or is it a bug in CREATE DATABASE?.. From
> src/backend/commands/dbcommands.c:
>
> if (dblocprovider == COLLPROVIDER_ICU && !dbiculocale)
> {
>   if (dlocale && dlocale->arg)
>   dbiculocale = defGetString(dlocale);
> }

This discrepancy between createdb and CREATE DATABASE looks like an oversight,
as createdb indeed interprets --locale as:

if (locale)
{
if (lc_ctype)
pg_fatal("only one of --locale and --lc-ctype can be 
specified");
if (lc_collate)
pg_fatal("only one of --locale and --lc-collate can be 
specified");
lc_ctype = locale;
lc_collate = locale;
}

AFAIK the fallback in the CREATE DATABASE case is expected as POSIX locale
names should be accepted by icu, so this should work for createdb too.




Re: ICU for global collation

2022-08-15 Thread Marina Polyakova

Hello everyone in this thread!

While reading and testing the patch that adds ICU for global collations 
[1] I noticed on master (1c5818b9c68e5c2ac8f19d372f24cce409de1a26) and 
REL_15_STABLE (63b64d8270691894a9a8f2d4e929e7780020edb8) that:


1) pg_upgrade from REL_14_STABLE 
(63b64d8270691894a9a8f2d4e929e7780020edb8) does not always work:


For REL_14_STABLE:

$ initdb -D data_old

For REL_15_STABLE or master:

$ initdb -D data_new --locale-provider icu --icu-locale ru-RU
$ pg_upgrade -d .../data_old -D data_new -b ... -B ...
...
Restoring database schemas in the new cluster
  template1
*failure*

Consult the last few lines of 
"data_new/pg_upgrade_output.d/20220815T142454.223/log/pg_upgrade_dump_1.log" 
for

the probable cause of the failure.
Failure, exiting

In 
data_new/pg_upgrade_output.d/20220815T142454.223/log/pg_upgrade_dump_1.log:


pg_restore: error: could not execute query: server closed the connection 
unexpectedly

This probably means the server terminated abnormally
before or while processing the request.
Command was: CREATE DATABASE "template1" WITH TEMPLATE = template0 OID = 
1 ENCODING = 'UTF8' LOCALE_PROVIDER = libc LOCALE = 'en_US.UTF-8';


In 
data_new/pg_upgrade_output.d/20220815T142454.223/log/pg_upgrade_server.log:


TRAP: FailedAssertion("(dblocprovider == COLLPROVIDER_ICU && 
dbiculocale) || (dblocprovider != COLLPROVIDER_ICU && !dbiculocale)", 
File: "dbcommands.c", Line: 1292, PID: 69247)
postgres: marina postgres [local] CREATE 
DATABASE(ExceptionalCondition+0xb9)[0xb4d8ec]
postgres: marina postgres [local] CREATE 
DATABASE(createdb+0x1abc)[0x68ca99]
postgres: marina postgres [local] CREATE 
DATABASE(standard_ProcessUtility+0x651)[0x9b1d82]
postgres: marina postgres [local] CREATE 
DATABASE(ProcessUtility+0x122)[0x9b172a]

postgres: marina postgres [local] CREATE DATABASE[0x9b01cf]
postgres: marina postgres [local] CREATE DATABASE[0x9b0433]
postgres: marina postgres [local] CREATE 
DATABASE(PortalRun+0x2fe)[0x9af95d]

postgres: marina postgres [local] CREATE DATABASE[0x9a953b]
postgres: marina postgres [local] CREATE 
DATABASE(PostgresMain+0x733)[0x9ada6b]

postgres: marina postgres [local] CREATE DATABASE[0x8ec632]
postgres: marina postgres [local] CREATE DATABASE[0x8ebfbb]
postgres: marina postgres [local] CREATE DATABASE[0x8e8653]
postgres: marina postgres [local] CREATE 
DATABASE(PostmasterMain+0x1226)[0x8e7f26]

postgres: marina postgres [local] CREATE DATABASE[0x7bbccb]
/lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf3)[0x7eff082f90b3]
postgres: marina postgres [local] CREATE DATABASE(_start+0x2e)[0x49c29e]
2022-08-15 14:24:56.124 MSK [69231] LOG:  server process (PID 69247) was 
terminated by signal 6: Aborted
2022-08-15 14:24:56.124 MSK [69231] DETAIL:  Failed process was running: 
CREATE DATABASE "template1" WITH TEMPLATE = template0 OID = 1 ENCODING = 
'UTF8' LOCALE_PROVIDER = libc LOCALE = 'en_US.UTF-8';


1.1) It looks like there's a bug in the function get_db_infos 
(src/bin/pg_upgrade/info.c), where the version of the old cluster is 
always checked:


if (GET_MAJOR_VERSION(old_cluster.major_version) < 1500)
snprintf(query + strlen(query), sizeof(query) - strlen(query),
 "'c' AS datlocprovider, NULL AS daticulocale, ");
else
snprintf(query + strlen(query), sizeof(query) - strlen(query),
 "datlocprovider, daticulocale, ");

With the simple patch

diff --git a/src/bin/pg_upgrade/info.c b/src/bin/pg_upgrade/info.c
index 
df374ce4b362b4c6c87fc1fd0e476e5d6d353d9e..53ea348e211d3ac38334292bc16cb814bc13bb87 
100644

--- a/src/bin/pg_upgrade/info.c
+++ b/src/bin/pg_upgrade/info.c
@@ -319,7 +319,7 @@ get_db_infos(ClusterInfo *cluster)

snprintf(query, sizeof(query),
 "SELECT d.oid, d.datname, d.encoding, d.datcollate, 
d.datctype, ");
-   if (GET_MAJOR_VERSION(old_cluster.major_version) < 1500)
+   if (GET_MAJOR_VERSION(cluster->major_version) < 1500)
snprintf(query + strlen(query), sizeof(query) - strlen(query),
 "'c' AS datlocprovider, NULL AS daticulocale, 
");
else

I got the expected error during the upgrade:

locale providers for database "template1" do not match:  old "libc", new 
"icu"

Failure, exiting

1.2) It looks like the mentioned asserion in dbcommands.c conflicts with 
the following lines earlier:


if (dbiculocale == NULL)
dbiculocale = src_iculocale;

The following patch works for me:

diff --git a/src/backend/commands/dbcommands.c 
b/src/backend/commands/dbcommands.c
index 
b31a30550b025d48ba3cc250dc4c15f41f9a80be..17a2942341e528c01182fb6d4878580f2706bec9 
100644

--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -1048,6 +1048,8 @@ createdb(ParseState *pstate, const CreatedbStmt 
*stmt)


(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 errmsg("ICU locale must be 

Re: ICU for global collation

2022-06-27 Thread Justin Pryzby
On Mon, Jun 27, 2022 at 09:10:29AM +0200, Peter Eisentraut wrote:
> On 27.06.22 08:42, Michael Paquier wrote:
> > On Mon, Jun 27, 2022 at 08:23:59AM +0200, Peter Eisentraut wrote:
> > > On 26.06.22 05:51, Julien Rouhaud wrote:
> > > > On Sat, Jun 25, 2022 at 10:19:30AM -0500, Justin Pryzby wrote:
> > > > > > +   if (GET_MAJOR_VERSION(old_cluster.major_version) <= 1500)
> > > 
> > > I think the fix here is to change <= to < ?
> > 
> > Yes.
> 
> Ok, committed.
> 
> (I see now that in the context of pg_upgrade, writing <= 1400 is equivalent,
> but I find that confusing, so I did < 1500.)

I suggested using <= 1400 for consistency with the other code, and per
bc1fbc960.  But YMMV.

-- 
Justin




Re: ICU for global collation

2022-06-27 Thread Peter Eisentraut

On 27.06.22 08:42, Michael Paquier wrote:

On Mon, Jun 27, 2022 at 08:23:59AM +0200, Peter Eisentraut wrote:

On 26.06.22 05:51, Julien Rouhaud wrote:

On Sat, Jun 25, 2022 at 10:19:30AM -0500, Justin Pryzby wrote:

+   if (GET_MAJOR_VERSION(old_cluster.major_version) <= 1500)


I think the fix here is to change <= to < ?


Yes.


Ok, committed.

(I see now that in the context of pg_upgrade, writing <= 1400 is 
equivalent, but I find that confusing, so I did < 1500.)





Re: ICU for global collation

2022-06-27 Thread Michael Paquier
On Mon, Jun 27, 2022 at 08:23:59AM +0200, Peter Eisentraut wrote:
> On 26.06.22 05:51, Julien Rouhaud wrote:
>> On Sat, Jun 25, 2022 at 10:19:30AM -0500, Justin Pryzby wrote:
 +  if (GET_MAJOR_VERSION(old_cluster.major_version) <= 1500)
> 
> I think the fix here is to change <= to < ?

Yes.
--
Michael


signature.asc
Description: PGP signature


Re: ICU for global collation

2022-06-27 Thread Peter Eisentraut

On 26.06.22 05:51, Julien Rouhaud wrote:

Hi,

On Sat, Jun 25, 2022 at 10:19:30AM -0500, Justin Pryzby wrote:

commit f2553d43060edb210b36c63187d52a632448e1d2 says >=1500 in a few places,
but in pg_upgrade says <=1500, which looks wrong for upgrades from v15.
I think it should say <= 1400.

On Wed, Feb 02, 2022 at 02:01:23PM +0100, Peter Eisentraut wrote:

diff --git a/src/bin/pg_upgrade/info.c b/src/bin/pg_upgrade/info.c
index 69ef23119f..2a9ca0e389 100644
--- a/src/bin/pg_upgrade/info.c
+++ b/src/bin/pg_upgrade/info.c
@@ -312,11 +312,20 @@ get_db_infos(ClusterInfo *cluster)
i_spclocation;
charquery[QUERY_ALLOC];
  
  	snprintf(query, sizeof(query),

-"SELECT d.oid, d.datname, d.encoding, d.datcollate, 
d.datctype, "
+"SELECT d.oid, d.datname, d.encoding, d.datcollate, 
d.datctype, ");
+   if (GET_MAJOR_VERSION(old_cluster.major_version) <= 1500)


I think the fix here is to change <= to < ?


+   snprintf(query + strlen(query), sizeof(query) - strlen(query),
+"'c' AS datcollprovider, NULL AS daticucoll, 
");
+   else
+   snprintf(query + strlen(query), sizeof(query) - strlen(query),
+"datcollprovider, daticucoll, ");
+   snprintf(query + strlen(query), sizeof(query) - strlen(query),
 "pg_catalog.pg_tablespace_location(t.oid) AS spclocation 
"
 "FROM pg_catalog.pg_database d "
 " LEFT OUTER JOIN pg_catalog.pg_tablespace t "


Indeed!








Re: ICU for global collation

2022-06-26 Thread Michael Paquier
On Sun, Jun 26, 2022 at 11:51:24AM +0800, Julien Rouhaud wrote:
> On Sat, Jun 25, 2022 at 10:19:30AM -0500, Justin Pryzby wrote:
>>> +   if (GET_MAJOR_VERSION(old_cluster.major_version) <= 1500)
>>> +   snprintf(query + strlen(query), sizeof(query) - strlen(query),
>>> +"'c' AS datcollprovider, NULL AS daticucoll, 
>>> ");
>>> +   else
>>> +   snprintf(query + strlen(query), sizeof(query) - strlen(query),
>>> +"datcollprovider, daticucoll, ");
>>> +   snprintf(query + strlen(query), sizeof(query) - strlen(query),
>>>  "pg_catalog.pg_tablespace_location(t.oid) AS 
>>> spclocation "
>>>  "FROM pg_catalog.pg_database d "
>>>  " LEFT OUTER JOIN pg_catalog.pg_tablespace t "
> 
> Indeed!

Oops.  Beta2 tagging is very close by, so I think that it would be
better to not take a risk on that now, and this is an issue only when
upgrading from v15 where datcollprovider is ICU for a database.
As things stand, someone using beta1 with this new feature, running
pg_upgrade to beta2 would lose any non-libc locale provider set.
--
Michael


signature.asc
Description: PGP signature


Re: ICU for global collation

2022-06-25 Thread Julien Rouhaud
Hi,

On Sat, Jun 25, 2022 at 10:19:30AM -0500, Justin Pryzby wrote:
> commit f2553d43060edb210b36c63187d52a632448e1d2 says >=1500 in a few places,
> but in pg_upgrade says <=1500, which looks wrong for upgrades from v15.
> I think it should say <= 1400.
> 
> On Wed, Feb 02, 2022 at 02:01:23PM +0100, Peter Eisentraut wrote:
> > diff --git a/src/bin/pg_upgrade/info.c b/src/bin/pg_upgrade/info.c
> > index 69ef23119f..2a9ca0e389 100644
> > --- a/src/bin/pg_upgrade/info.c
> > +++ b/src/bin/pg_upgrade/info.c
> > @@ -312,11 +312,20 @@ get_db_infos(ClusterInfo *cluster)
> > i_spclocation;
> > charquery[QUERY_ALLOC];
> >  
> > snprintf(query, sizeof(query),
> > -"SELECT d.oid, d.datname, d.encoding, d.datcollate, 
> > d.datctype, "
> > +"SELECT d.oid, d.datname, d.encoding, d.datcollate, 
> > d.datctype, ");
> > +   if (GET_MAJOR_VERSION(old_cluster.major_version) <= 1500)
> > +   snprintf(query + strlen(query), sizeof(query) - strlen(query),
> > +"'c' AS datcollprovider, NULL AS daticucoll, 
> > ");
> > +   else
> > +   snprintf(query + strlen(query), sizeof(query) - strlen(query),
> > +"datcollprovider, daticucoll, ");
> > +   snprintf(query + strlen(query), sizeof(query) - strlen(query),
> >  "pg_catalog.pg_tablespace_location(t.oid) AS 
> > spclocation "
> >  "FROM pg_catalog.pg_database d "
> >  " LEFT OUTER JOIN pg_catalog.pg_tablespace t "

Indeed!




Re: ICU for global collation

2022-06-25 Thread Justin Pryzby
commit f2553d43060edb210b36c63187d52a632448e1d2 says >=1500 in a few places,
but in pg_upgrade says <=1500, which looks wrong for upgrades from v15.
I think it should say <= 1400.

On Wed, Feb 02, 2022 at 02:01:23PM +0100, Peter Eisentraut wrote:
> --- a/src/bin/pg_dump/pg_dump.c
> +++ b/src/bin/pg_dump/pg_dump.c
> @@ -2792,6 +2794,10 @@ dumpDatabase(Archive *fout)
>   appendPQExpBuffer(dbQry, "datminmxid, ");
>   else
>   appendPQExpBuffer(dbQry, "0 AS datminmxid, ");
> + if (fout->remoteVersion >= 15)
> + appendPQExpBuffer(dbQry, "datcollprovider, ");
> + else
> + appendPQExpBuffer(dbQry, "'c' AS datcollprovider, ");
>   appendPQExpBuffer(dbQry,
> "(SELECT spcname FROM pg_tablespace t 
> WHERE t.oid = dattablespace) AS tablespace, "
> "shobj_description(oid, 
> 'pg_database') AS description "

> diff --git a/src/bin/pg_upgrade/info.c b/src/bin/pg_upgrade/info.c
> index 69ef23119f..2a9ca0e389 100644
> --- a/src/bin/pg_upgrade/info.c
> +++ b/src/bin/pg_upgrade/info.c
> @@ -312,11 +312,20 @@ get_db_infos(ClusterInfo *cluster)
>   i_spclocation;
>   charquery[QUERY_ALLOC];
>  
>   snprintf(query, sizeof(query),
> -  "SELECT d.oid, d.datname, d.encoding, d.datcollate, 
> d.datctype, "
> +  "SELECT d.oid, d.datname, d.encoding, d.datcollate, 
> d.datctype, ");
> + if (GET_MAJOR_VERSION(old_cluster.major_version) <= 1500)
> + snprintf(query + strlen(query), sizeof(query) - strlen(query),
> +  "'c' AS datcollprovider, NULL AS daticucoll, 
> ");
> + else
> + snprintf(query + strlen(query), sizeof(query) - strlen(query),
> +  "datcollprovider, daticucoll, ");
> + snprintf(query + strlen(query), sizeof(query) - strlen(query),
>"pg_catalog.pg_tablespace_location(t.oid) AS 
> spclocation "
>"FROM pg_catalog.pg_database d "
>" LEFT OUTER JOIN pg_catalog.pg_tablespace t "

> --- a/src/bin/psql/describe.c
> +++ b/src/bin/psql/describe.c
> @@ -896,6 +896,18 @@ listAllDbs(const char *pattern, bool verbose)
> gettext_noop("Encoding"),
> gettext_noop("Collate"),
> gettext_noop("Ctype"));
> + if (pset.sversion >= 15)
> + appendPQExpBuffer(,
> +   "   d.daticucoll as 
> \"%s\",\n"
> +   "   CASE 
> d.datcollprovider WHEN 'c' THEN 'libc' WHEN 'i' THEN 'icu' END AS \"%s\",\n",
> +   gettext_noop("ICU Collation"),
> +   gettext_noop("Coll. 
> Provider"));
> + else
> + appendPQExpBuffer(,
> +   "   d.datcollate as 
> \"%s\",\n"
> +   "   'libc' AS \"%s\",\n",
> +   gettext_noop("ICU Collation"),
> +   gettext_noop("Coll. 
> Provider"));
>   appendPQExpBufferStr(, "   ");
>   printACLColumn(, "d.datacl");
>   if (verbose)

> @@ -4617,6 +4629,15 @@ listCollations(const char *pattern, bool verbose, bool 
> showSystem)
> gettext_noop("Collate"),
> gettext_noop("Ctype"));
>  
> + if (pset.sversion >= 15)
> + appendPQExpBuffer(,
> +   ",\n   c.collicucoll AS 
> \"%s\"",
> +   gettext_noop("ICU 
> Collation"));
> + else
> + appendPQExpBuffer(,
> +   ",\n   c.collcollate AS 
> \"%s\"",
> +   gettext_noop("ICU 
> Collation"));
> +
>   if (pset.sversion >= 10)
>   appendPQExpBuffer(,
> ",\n   CASE 
> c.collprovider WHEN 'd' THEN 'default' WHEN 'c' THEN 'libc' WHEN 'i' THEN 
> 'icu' END AS \"%s\"",




Re: ICU for global collation

2022-03-18 Thread Andres Freund
Hi,

On 2022-03-17 14:14:52 +0100, Peter Eisentraut wrote:
> committed, thanks

Just noticed that this adds a new warning when building with -O3:

In file included from 
/home/andres/src/postgresql/src/include/catalog/pg_collation.h:22,
 from 
/home/andres/src/postgresql/src/backend/commands/dbcommands.c:39:
In function ‘collprovider_name’,
inlined from ‘createdb’ at 
/home/andres/src/postgresql/src/backend/commands/dbcommands.c:514:4:
../../../src/include/catalog/pg_collation_d.h:47:9: warning: ‘src_locprovider’ 
may be used uninitialized [-Wmaybe-uninitialized]
   47 | switch (c)
  | ^~
/home/andres/src/postgresql/src/backend/commands/dbcommands.c: In function 
‘createdb’:
/home/andres/src/postgresql/src/backend/commands/dbcommands.c:112:25: note: 
‘src_locprovider’ was declared here
  112 | charsrc_locprovider;
  | ^~~

I'd fixed that for nearby variables in 3f6b3be39ca9... Gonna just NULL
initialize it as well.

Greetings,

Andres Freund




Re: ICU for global collation

2022-03-18 Thread Julien Rouhaud
On Thu, Mar 17, 2022 at 02:14:52PM +0100, Peter Eisentraut wrote:
> On 17.03.22 13:01, Shinoda, Noriyoshi (PN Japan FSIP) wrote:
> > Thank you to all the developers.
> > I found that the description of the pg_database.daticulocale column was not 
> > written in the documentation.
> > The attached small patch adds a description of the daticulocale column to 
> > catalogs.sgml.
>
> committed, thanks

Thanks a lot both!  Glad to finally have that feature, as soon as we'll fix
the few reported problems.




Re: ICU for global collation

2022-03-17 Thread Andres Freund
Hi,

On 2022-03-17 14:14:52 +0100, Peter Eisentraut wrote:
> On 17.03.22 13:01, Shinoda, Noriyoshi (PN Japan FSIP) wrote:
> > Thank you to all the developers.
> > I found that the description of the pg_database.daticulocale column was not 
> > written in the documentation.
> > The attached small patch adds a description of the daticulocale column to 
> > catalogs.sgml.
> 
> committed, thanks

Wee! That's a long time weakness addressed...


Just saw a weird failure after rebasing my meson branch ontop of this. Tests
passed on debian, suse, centos 8 stream, fedora rawhide (failed due to an
independent reason), but not on centos 7.


all runs: https://cirrus-ci.com/build/5190538184884224
centos 7: https://cirrus-ci.com/task/4786632883699712?logs=tests_world#L204
centos 7 failure: 
https://api.cirrus-ci.com/v1/artifact/task/4786632883699712/log/build/testrun/icu/t/010_database/log/regress_log_010_database

not ok 1 - sort by database default locale
#   Failed test 'sort by database default locale'
#   at /tmp/cirrus-ci-build/src/test/icu/t/010_database.pl line 28.
#  got: 'a
# A
# b
# B'
# expected: 'A
# a
# B
# b'
ok 2 - sort by explicit collation standard
not ok 3 - sort by explicit collation upper first
#   Failed test 'sort by explicit collation upper first'
#   at /tmp/cirrus-ci-build/src/test/icu/t/010_database.pl line 42.
#  got: 'a
# A
# b
# B'
# expected: 'A
# a
# B
# b'
ok 4 - ICU locale must be specified for ICU provider: exit code not 0
ok 5 - ICU locale must be specified for ICU provider: error message
1..5

This is a run building with meson. But I've now triggered builds with autoconf
on centos 7 as well and that also failed. See
https://cirrus-ci.com/task/6194007767252992?logs=test_world#L378

So it looks like older ICU versions don't work?

Greetings,

Andres Freund

PS: I had not yet passed with_icu in the initdb tests for meson, that's why
there's two failures with autoconf but only one with meson.




Re: ICU for global collation

2022-03-17 Thread Peter Geoghegan
On Thu, Mar 17, 2022 at 6:15 AM Peter Eisentraut
 wrote:
> committed, thanks

Glad that this finally happened. Thanks to everybody involved!

-- 
Peter Geoghegan




Re: ICU for global collation

2022-03-17 Thread Peter Eisentraut

On 17.03.22 13:01, Shinoda, Noriyoshi (PN Japan FSIP) wrote:

Thank you to all the developers.
I found that the description of the pg_database.daticulocale column was not 
written in the documentation.
The attached small patch adds a description of the daticulocale column to 
catalogs.sgml.


committed, thanks




RE: ICU for global collation

2022-03-17 Thread Shinoda, Noriyoshi (PN Japan FSIP)
Hi, 
Thank you to all the developers.
I found that the description of the pg_database.daticulocale column was not 
written in the documentation. 
The attached small patch adds a description of the daticulocale column to 
catalogs.sgml.

Regards,
Noriyoshi Shinoda
-Original Message-
From: Peter Eisentraut  
Sent: Thursday, March 17, 2022 7:29 PM
To: Julien Rouhaud 
Cc: pgsql-hackers ; Daniel Verite 

Subject: Re: ICU for global collation

On 15.03.22 08:41, Julien Rouhaud wrote:
>>>> The locale object in ICU is an identifier that specifies a 
>>>> particular locale and has fields for language, country, and an 
>>>> optional code to specify further variants or subdivisions. These 
>>>> fields also can be represented as a string with the fields separated by an 
>>>> underscore.
>>
>> I think the Localization chapter needs to be reorganized a bit, but 
>> I'll leave that for a separate patch.
> 
> WFM.

I ended up writing a bit of content for that chapter.

>>> While on that topic, the doc should probably mention that default 
>>> ICU collations can only be deterministic.
>>
>> Well, there is no option to do otherwise, so I'm not sure where/how 
>> to mention that.  We usually don't document options that don't exist. 
>> ;-)
> 
> Sure, but I'm afraid that users may still be tempted to use ICU 
> locales like
> und-u-ks-level2 from the case_insensitive example in the doc and hope 
> that it will work accordingly.  Or maybe it's just me that still sees 
> ICU as dark magic and want to be extra cautious.

I have added this to the CREATE DATABASE ref page.

> Unrelated, but I just realized that we have PGC_INTERNAL gucs for 
> lc_ctype and lc_collate.  Should we add one for icu_locale too?

I'm not sure.  I think the existing ones are more for backward compatibility 
with the time before it was settable per database.

>>> in AlterCollation(), pg_collation_actual_version(), 
>>> AlterDatabaseRefreshColl() and pg_database_collation_actual_version():
>>>
>>> -   datum = SysCacheGetAttr(COLLOID, tup, Anum_pg_collation_collcollate, 
>>> );
>>> -   Assert(!isnull);
>>> -   newversion = get_collation_actual_version(collForm->collprovider, 
>>> TextDatumGetCString(datum));
>>> +   datum = SysCacheGetAttr(COLLOID, tup, collForm->collprovider == 
>>> COLLPROVIDER_ICU ? Anum_pg_collation_colliculocale : 
>>> Anum_pg_collation_collcollate, );
>>> +   if (!isnull)
>>> +   newversion = get_collation_actual_version(collForm->collprovider, 
>>> TextDatumGetCString(datum));
>>> +   else
>>> +   newversion = NULL;
>>>
>>> The columns are now nullable, but can you actually end up with a 
>>> null locale name in the expected field without manual DML on the 
>>> catalog, corruption or similar?  I think it should be a plain error 
>>> explaining the inconsistency rather then silently assuming there's 
>>> no version.  Note that at least
>>> pg_newlocale_from_collation() asserts that the specific libc/icu 
>>> field it's interested in isn't null.
>>
>> This is required because the default collations's fields are null now.
> 
> Yes I saw that, but that's a specific exception.  Detecting whether 
> it's the DEFAULT_COLLATION_OID or not and raise an error when a null 
> value isn't expected seems like it could be worthwhile.

I have fixed that as you suggest.

> So apart from the few details mentioned above I'm happy with this patch!

committed!




pg_database_iculocale_v1.diff
Description: pg_database_iculocale_v1.diff


Re: ICU for global collation

2022-03-17 Thread Peter Eisentraut

On 15.03.22 08:41, Julien Rouhaud wrote:

The locale object in ICU is an identifier that specifies a particular locale
and has fields for language, country, and an optional code to specify further
variants or subdivisions. These fields also can be represented as a string
with the fields separated by an underscore.


I think the Localization chapter needs to be reorganized a bit, but I'll
leave that for a separate patch.


WFM.


I ended up writing a bit of content for that chapter.


While on that topic, the doc should probably mention that default ICU
collations can only be deterministic.


Well, there is no option to do otherwise, so I'm not sure where/how to
mention that.  We usually don't document options that don't exist. ;-)


Sure, but I'm afraid that users may still be tempted to use ICU locales like
und-u-ks-level2 from the case_insensitive example in the doc and hope that it
will work accordingly.  Or maybe it's just me that still sees ICU as dark magic
and want to be extra cautious.


I have added this to the CREATE DATABASE ref page.


Unrelated, but I just realized that we have PGC_INTERNAL gucs for lc_ctype and
lc_collate.  Should we add one for icu_locale too?


I'm not sure.  I think the existing ones are more for backward 
compatibility with the time before it was settable per database.



in AlterCollation(), pg_collation_actual_version(), AlterDatabaseRefreshColl()
and pg_database_collation_actual_version():

-   datum = SysCacheGetAttr(COLLOID, tup, Anum_pg_collation_collcollate, 
);
-   Assert(!isnull);
-   newversion = get_collation_actual_version(collForm->collprovider, 
TextDatumGetCString(datum));
+   datum = SysCacheGetAttr(COLLOID, tup, collForm->collprovider == 
COLLPROVIDER_ICU ? Anum_pg_collation_colliculocale : Anum_pg_collation_collcollate, 
);
+   if (!isnull)
+   newversion = get_collation_actual_version(collForm->collprovider, 
TextDatumGetCString(datum));
+   else
+   newversion = NULL;

The columns are now nullable, but can you actually end up with a null locale
name in the expected field without manual DML on the catalog, corruption or
similar?  I think it should be a plain error explaining the inconsistency
rather then silently assuming there's no version.  Note that at least
pg_newlocale_from_collation() asserts that the specific libc/icu field it's
interested in isn't null.


This is required because the default collations's fields are null now.


Yes I saw that, but that's a specific exception.  Detecting whether it's the
DEFAULT_COLLATION_OID or not and raise an error when a null value isn't
expected seems like it could be worthwhile.


I have fixed that as you suggest.


So apart from the few details mentioned above I'm happy with this patch!


committed!




Re: ICU for global collation

2022-03-16 Thread Peter Eisentraut

On 15.03.22 18:28, Robert Haas wrote:

On Tue, Mar 15, 2022 at 12:58 PM Peter Eisentraut
 wrote:

On 14.03.22 19:57, Robert Haas wrote:

1. What will happen if I set the ICU collation to something that
doesn't match the libc collation? How bad are the consequences?


These are unrelated, so there are no consequences.


Can you please elaborate on this?


The code that is aware of ICU generally works like this:

if (locale_provider == ICU)
  result = call ICU code
else
  result = call libc code
return result

However, there is code out there, both within PostgreSQL itself and in 
extensions, that does not do that yet.  Ideally, we would eventually 
change all that over, but it's not happening now.  So we ought to 
preserve the ability to set the libc to keep that legacy code working 
for now.


This legacy code by definition doesn't know about ICU, so it doesn't 
care whether the ICU setting "matches" the libc setting or anything like 
that.  It will just do its thing depending on its own setting.


The only consequence of settings that don't match is that the different 
pieces of code behave semantically inconsistently (e.g., some routine 
thinks the data is Greek and other code thinks the data is French).  But 
that's up to the user to set correctly.  And the actual scenarios where 
you can actually do anything semantically relevant this way are very 
limited.


A second point is that the LC_CTYPE setting tells other parts of libc 
what the current encoding is.  This affects gettext for example.  So you 
need to set this to something sensible even if you don't use libc locale 
routines otherwise.



2. If I want to avoid a mismatch between the two, then I will need a
way to figure out which libc collation corresponds to a given ICU
collation. How do I do that?


You can specify the same name for both.


Hmm. If every name were valid in both systems, I don't think you'd be
proposing two fields.


Earlier versions of this patch and predecessor patches indeed had common 
fields.  But in fact the two systems accept different values if you want 
to delve into the advanced features.  But for basic usage something like 
"en_US" will work for both.





Re: ICU for global collation

2022-03-15 Thread Daniel Verite
Finnerty, Jim wrote:

> In ICU, the "locale" is just the first part of what we can pass to the
> "locale" parameter in CREATE COLLATION - the part before the optional '@'
> delimiter.  The ICU locale does not include the secondary or tertiary
> properties,

Why not? Please see
https://unicode-org.github.io/icu/userguide/locale

It says that "a locale consists of one or more pieces of ordered
information", the pieces being a language code, a script code, a
country code, a variant code, and keywords.


Best regards,
-- 
Daniel Vérité
https://postgresql.verite.pro/
Twitter: @DanielVerite




Re: ICU for global collation

2022-03-15 Thread Daniel Verite
Julien Rouhaud wrote:

> > > While on that topic, the doc should probably mention that default ICU
> > > collations can only be deterministic.
> >
> > Well, there is no option to do otherwise, so I'm not sure where/how to
> > mention that.  We usually don't document options that don't exist. ;-)
> 
> Sure, but I'm afraid that users may still be tempted to use ICU locales like
> und-u-ks-level2 from the case_insensitive example in the doc and hope that
> it will work accordingly.

+1.

The CREATE DATABASE doc says this currently:

icu_locale

Specifies the ICU locale ID if the ICU locale provider is used.


ISTM that we need to say explicitly that this locale will be used by
default to compare all collatable strings, except that it's overruled
by a bytewise comparison to break ties in case of equality.

The idea is to describe what the backend will do with the setting
rather than saying that we don't have a nondeterministic option.


Best regards,
-- 
Daniel Vérité
https://postgresql.verite.pro/
Twitter: @DanielVerite




Re: ICU for global collation

2022-03-15 Thread Finnerty, Jim
Can we get some more consistent terminology around the term "locale"?

In ICU, the "locale" is just the first part of what we can pass to the "locale" 
parameter in CREATE COLLATION - the part before the optional '@' delimiter.  
The ICU locale does not include the secondary or tertiary properties, so it is 
usually just the country and the language, e.g. en_US (or en-US), but it can 
also be something like es_TRADITIONAL for traditional Spanish.  

I think it would be an improvement in clarity if we consistently use the term 
'locale' to mean the same thing that ICU means by that term, and not to have 
the thing that we call the "locale" also include collation modifiers, or to 
imply that a locale is the same thing as a collation.

/Jim






Re: ICU for global collation

2022-03-15 Thread Robert Haas
On Tue, Mar 15, 2022 at 12:58 PM Peter Eisentraut
 wrote:
> On 14.03.22 19:57, Robert Haas wrote:
> > 1. What will happen if I set the ICU collation to something that
> > doesn't match the libc collation? How bad are the consequences?
>
> These are unrelated, so there are no consequences.

Can you please elaborate on this?

> > 2. If I want to avoid a mismatch between the two, then I will need a
> > way to figure out which libc collation corresponds to a given ICU
> > collation. How do I do that?
>
> You can specify the same name for both.

Hmm. If every name were valid in both systems, I don't think you'd be
proposing two fields.

-- 
Robert Haas
EDB: http://www.enterprisedb.com




Re: ICU for global collation

2022-03-15 Thread Peter Eisentraut



On 14.03.22 19:57, Robert Haas wrote:

1. What will happen if I set the ICU collation to something that
doesn't match the libc collation? How bad are the consequences?


These are unrelated, so there are no consequences.


2. If I want to avoid a mismatch between the two, then I will need a
way to figure out which libc collation corresponds to a given ICU
collation. How do I do that?


You can specify the same name for both.




Re: ICU for global collation

2022-03-15 Thread Julien Rouhaud
On Mon, Mar 14, 2022 at 01:50:50PM +0100, Peter Eisentraut wrote:
> On 05.03.22 09:38, Julien Rouhaud wrote:
> > I say it works because I did manually check, as far as I can see there isn't
> > any test that ensures it.
> >
> > I'm using this naive scenario:
> >
> > DROP DATABASE IF EXISTS dbicu;
> > CREATE DATABASE dbicu LOCALE_PROVIDER icu LOCALE 'en_US' ICU_LOCALE 
> > 'en-u-kf-upper' template 'template0';
> > \c dbicu
> > CREATE COLLATION upperfirst (provider = icu, locale = 'en-u-kf-upper');
> > CREATE TABLE icu(def text, en text COLLATE "en_US", upfirst text COLLATE 
> > upperfirst);
> > INSERT INTO icu VALUES ('a', 'a', 'a'), ('b','b','b'), ('A','A','A'), 
> > ('B','B','B');
> > SELECT def AS def FROM icu ORDER BY def;
> > SELECT def AS en FROM icu ORDER BY en;
> > SELECT def AS upfirst FROM icu ORDER BY upfirst;
> > SELECT def AS upfirst_explicit FROM icu ORDER BY en COLLATE upperfirst;
> > SELECT def AS en_x_explicit FROM icu ORDER BY def COLLATE "en-x-icu";
> >
> > Maybe there should be some test along those lines included in the patch?
>
> I added something like this to a new test suite under src/test/icu/.
> (src/test/locale/ was already used for something else.)

Great, thanks!

> > > The locale object in ICU is an identifier that specifies a particular 
> > > locale
> > > and has fields for language, country, and an optional code to specify 
> > > further
> > > variants or subdivisions. These fields also can be represented as a string
> > > with the fields separated by an underscore.
>
> I think the Localization chapter needs to be reorganized a bit, but I'll
> leave that for a separate patch.

WFM.

> > I spent some time looking at the ICU api trying to figure out if using a
> > posix locale name (e.g. en_US) was actually compatible with an ICU locale 
> > name.
> > It seems that ICU accept any of 'en-us', 'en-US', 'en_us' or 'en_US' as the
> > same locale, but I might be wrong.  I also didn't find a way to figure out 
> > how
> > to ask ICU if the locale identifier passed is complete garbage or not.  One
> > sure thing is that the system collation we import are of the form 'en-us', 
> > so
> > it seems weird to have this form in pg_collation and by default another 
> > form in
> > pg_database.
>
> Yeah it seems to be inconsistent about that.  The locale ID documentation
> appears to indicate that "en_US" is the canonical form, but when you ask it
> to list all the locales it knows about it returns "en-US".

Yeah I saw that too when checking is POSIX locale names were valid, and that's
not great.
>
> > In CREATE DATABASE manual:
> >
> > +Specifies the provider to use for the default collation in this
> > +database.  Possible values are:
> > +
> > icu,ICU
> > +libc.  libc is the default.  
> > The
> > +available choices depend on the operating system and build options.
> >
> > That's actually not true as pg_strcasecmp is used in createdb():
>
> I don't understand what the concern is here.

Ah sorry I missed the , I thought the possible values listed were
icu, ICU and libc.  Shouldn't the ICU  be before the icu 
rather than the libc, or at least before the comma?
>
> Yeah, I think it is not the job of the initdb man page to lecture people
> about the merits of their locale choice.  The same performance warning can
> also be found in the localization chapter; we don't need to repeat it
> everywhere a locale choice is mentioned.

Ah, I tried to look for another place where the same warning could be found but
missed it.  I'm fine with it then!

> > While on that topic, the doc should probably mention that default ICU
> > collations can only be deterministic.
>
> Well, there is no option to do otherwise, so I'm not sure where/how to
> mention that.  We usually don't document options that don't exist. ;-)

Sure, but I'm afraid that users may still be tempted to use ICU locales like
und-u-ks-level2 from the case_insensitive example in the doc and hope that it
will work accordingly.  Or maybe it's just me that still sees ICU as dark magic
and want to be extra cautious.

Unrelated, but I just realized that we have PGC_INTERNAL gucs for lc_ctype and
lc_collate.  Should we add one for icu_locale too?

> > Also unrelated, but how about raising a warning about possibly hiding
> > corruption if you use CREATE COLLATION ...  VERSION or CREATE DATABASE ...
> > COLLATION_VERSION if !IsBinaryUpgrade()?
>
> Hmm, there is a point to that.  But then we should just make that an error.
> Otherwise, we raise the warning but then there is no way to "fix" what was
> warned about.  Seems confusing.

I was afraid that an error was a bit too much, but if that's acceptable I agree
that it's better.

> > in AlterCollation(), pg_collation_actual_version(), 
> > AlterDatabaseRefreshColl()
> > and pg_database_collation_actual_version():
> >
> > -   datum = SysCacheGetAttr(COLLOID, tup, Anum_pg_collation_collcollate, 
> > );
> > -   Assert(!isnull);
> > -   newversion = 

Re: ICU for global collation

2022-03-14 Thread Robert Haas
On Mon, Mar 14, 2022 at 8:51 AM Peter Eisentraut
 wrote:
> [ new patches ]

I'm not very knowledgeable about this topic, but to me, it seems
confusing to think of having both a libc collation and an ICU
collation associated with a database. I have two main questions:

1. What will happen if I set the ICU collation to something that
doesn't match the libc collation? How bad are the consequences?

2. If I want to avoid a mismatch between the two, then I will need a
way to figure out which libc collation corresponds to a given ICU
collation. How do I do that?

I have a sneaking suspicion that I'm not going to like the answer to
question #2 very much, but maybe that's life. I feel like this whole
area is far too full of magical things. True practitioners know that
if you utter some mysterious incantation to the Lords of ICU, you can
get exactly the behavior you want. And ... everyone else has no idea
what to do.

-- 
Robert Haas
EDB: http://www.enterprisedb.com




Re: ICU for global collation

2022-03-14 Thread Peter Eisentraut

On 05.03.22 09:38, Julien Rouhaud wrote:

I say it works because I did manually check, as far as I can see there isn't
any test that ensures it.

I'm using this naive scenario:

DROP DATABASE IF EXISTS dbicu;
CREATE DATABASE dbicu LOCALE_PROVIDER icu LOCALE 'en_US' ICU_LOCALE 
'en-u-kf-upper' template 'template0';
\c dbicu
CREATE COLLATION upperfirst (provider = icu, locale = 'en-u-kf-upper');
CREATE TABLE icu(def text, en text COLLATE "en_US", upfirst text COLLATE 
upperfirst);
INSERT INTO icu VALUES ('a', 'a', 'a'), ('b','b','b'), ('A','A','A'), 
('B','B','B');
SELECT def AS def FROM icu ORDER BY def;
SELECT def AS en FROM icu ORDER BY en;
SELECT def AS upfirst FROM icu ORDER BY upfirst;
SELECT def AS upfirst_explicit FROM icu ORDER BY en COLLATE upperfirst;
SELECT def AS en_x_explicit FROM icu ORDER BY def COLLATE "en-x-icu";

Maybe there should be some test along those lines included in the patch?


I added something like this to a new test suite under src/test/icu/. 
(src/test/locale/ was already used for something else.)



Also, it's not even used consistently in the patch.  I can see e.g. "ICU
locale" or "ICU locale setting" being used:


I have fixed those.


Maybe we could point to the ICU documentation for a clear notion of what a
locale ID is, and/or use their own short definition [1]:


The locale object in ICU is an identifier that specifies a particular locale
and has fields for language, country, and an optional code to specify further
variants or subdivisions. These fields also can be represented as a string
with the fields separated by an underscore.


I think the Localization chapter needs to be reorganized a bit, but I'll 
leave that for a separate patch.



I spent some time looking at the ICU api trying to figure out if using a
posix locale name (e.g. en_US) was actually compatible with an ICU locale name.
It seems that ICU accept any of 'en-us', 'en-US', 'en_us' or 'en_US' as the
same locale, but I might be wrong.  I also didn't find a way to figure out how
to ask ICU if the locale identifier passed is complete garbage or not.  One
sure thing is that the system collation we import are of the form 'en-us', so
it seems weird to have this form in pg_collation and by default another form in
pg_database.


Yeah it seems to be inconsistent about that.  The locale ID 
documentation appears to indicate that "en_US" is the canonical form, 
but when you ask it to list all the locales it knows about it returns 
"en-US".



In CREATE DATABASE manual:

+Specifies the provider to use for the default collation in this
+database.  Possible values are:
+icu,ICU
+libc.  libc is the default.  The
+available choices depend on the operating system and build options.

That's actually not true as pg_strcasecmp is used in createdb():

+   if (pg_strcasecmp(locproviderstr, "icu") == 0)
+   dblocprovider = COLLPROVIDER_ICU;
+   else if (pg_strcasecmp(locproviderstr, "libc") == 0)
+   dblocprovider = COLLPROVIDER_LIBC;
+   else
+   ereport(ERROR,
+   (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+errmsg("unrecognized locale provider: %s",
+   locproviderstr)));


I don't understand what the concern is here.


Unless I'm missing something you entirely removed the warninng about the
performance penalty when using non C/POSIX locale?


Yeah, I think it is not the job of the initdb man page to lecture people 
about the merits of their locale choice.  The same performance warning 
can also be found in the localization chapter; we don't need to repeat 
it everywhere a locale choice is mentioned.



initdb has some specific processing to transform the default libc locale to
something more appropriate, but as far as I can see creatdb / CREATE DATABASE
aren't doing that.  It seems inconsistent, and IMHO another reason why
defaulting to the libc locale looks like a bad idea.


This has all been removed.  The separate ICU locale option should now be 
required everywhere (initdb, createdb, CREATE DATABASE).



While on that topic, the doc should probably mention that default ICU
collations can only be deterministic.


Well, there is no option to do otherwise, so I'm not sure where/how to 
mention that.  We usually don't document options that don't exist. ;-)



Also unrelated, but how about raising a warning about possibly hiding
corruption if you use CREATE COLLATION ...  VERSION or CREATE DATABASE ...
COLLATION_VERSION if !IsBinaryUpgrade()?


Hmm, there is a point to that.  But then we should just make that an 
error.  Otherwise, we raise the warning but then there is no way to 
"fix" what was warned about.  Seems confusing.



in AlterCollation(), pg_collation_actual_version(), AlterDatabaseRefreshColl()
and pg_database_collation_actual_version():

-   datum = SysCacheGetAttr(COLLOID, tup, Anum_pg_collation_collcollate, 
);
-   Assert(!isnull);
-   newversion = 

Re: ICU for global collation

2022-03-10 Thread Julien Rouhaud
On Thu, Mar 10, 2022 at 10:52:41AM +0100, Peter Eisentraut wrote:
> On 05.03.22 09:38, Julien Rouhaud wrote:
> > @@ -168,18 +175,6 @@ DefineCollation(ParseState *pstate, List *names, List 
> > *parameters, bool if_not_e
> >   errmsg("collation \"default\" cannot be copied")));
> >  }
> > 
> > -   if (localeEl)
> > -   {
> > -   collcollate = defGetString(localeEl);
> > -   collctype = defGetString(localeEl);
> > -   }
> > [...]
> > 
> > I tried to read the function and quickly got confused about whether possible
> > problematic conditions could be reached or not and had protection or not.  I
> > think that DefineCollation is becoming more and more unreadable, with a mix 
> > of
> > fromEl, !fromEl and either.  Maybe it would be a good time to refactor the 
> > code
> > a bit and have have something like
> > 
> > if (fromEl)
> > {
> >  // initialize all specific things
> > }
> > else
> > {
> >  // initialize all specific things
> > }
> > 
> > // handle defaults and sanity checks
> > 
> > What do you think?
> 
> How about the attached?

Thanks!  That's exactly what I had in mind.  I think it's easier to follow and
make sure of what it's doing exactly, so +1 for it.




Re: ICU for global collation

2022-03-10 Thread Peter Eisentraut

On 05.03.22 09:38, Julien Rouhaud wrote:

@@ -168,18 +175,6 @@ DefineCollation(ParseState *pstate, List *names, List 
*parameters, bool if_not_e
  errmsg("collation \"default\" cannot be copied")));
 }

-   if (localeEl)
-   {
-   collcollate = defGetString(localeEl);
-   collctype = defGetString(localeEl);
-   }
[...]

I tried to read the function and quickly got confused about whether possible
problematic conditions could be reached or not and had protection or not.  I
think that DefineCollation is becoming more and more unreadable, with a mix of
fromEl, !fromEl and either.  Maybe it would be a good time to refactor the code
a bit and have have something like

if (fromEl)
{
 // initialize all specific things
}
else
{
 // initialize all specific things
}

// handle defaults and sanity checks

What do you think?


How about the attached?
From c7b408ba646c5ee5f8c6ae84bec04ca4059ed08f Mon Sep 17 00:00:00 2001
From: Peter Eisentraut 
Date: Thu, 10 Mar 2022 10:49:49 +0100
Subject: [PATCH] DefineCollation() code cleanup

Reorganize the code in DefineCollation() so that the parts using the
FROM clause and the parts not doing so are more cleanly separated.  No
functionality change intended.

Reported-by: Julien Rouhaud 
---
 src/backend/commands/collationcmds.c | 109 ++-
 1 file changed, 57 insertions(+), 52 deletions(-)

diff --git a/src/backend/commands/collationcmds.c 
b/src/backend/commands/collationcmds.c
index 12fc2316f9..93df1d366c 100644
--- a/src/backend/commands/collationcmds.c
+++ b/src/backend/commands/collationcmds.c
@@ -63,12 +63,11 @@ DefineCollation(ParseState *pstate, List *names, List 
*parameters, bool if_not_e
DefElem*providerEl = NULL;
DefElem*deterministicEl = NULL;
DefElem*versionEl = NULL;
-   char   *collcollate = NULL;
-   char   *collctype = NULL;
-   char   *collproviderstr = NULL;
-   boolcollisdeterministic = true;
-   int collencoding = 0;
-   charcollprovider = 0;
+   char   *collcollate;
+   char   *collctype;
+   boolcollisdeterministic;
+   int collencoding;
+   charcollprovider;
char   *collversion = NULL;
Oid newoid;
ObjectAddress address;
@@ -167,65 +166,71 @@ DefineCollation(ParseState *pstate, List *names, List 
*parameters, bool if_not_e

(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
 errmsg("collation \"default\" cannot 
be copied")));
}
-
-   if (localeEl)
+   else
{
-   collcollate = defGetString(localeEl);
-   collctype = defGetString(localeEl);
-   }
+   char   *collproviderstr = NULL;
 
-   if (lccollateEl)
-   collcollate = defGetString(lccollateEl);
+   collcollate = NULL;
+   collctype = NULL;
 
-   if (lcctypeEl)
-   collctype = defGetString(lcctypeEl);
+   if (localeEl)
+   {
+   collcollate = defGetString(localeEl);
+   collctype = defGetString(localeEl);
+   }
 
-   if (providerEl)
-   collproviderstr = defGetString(providerEl);
+   if (lccollateEl)
+   collcollate = defGetString(lccollateEl);
 
-   if (deterministicEl)
-   collisdeterministic = defGetBoolean(deterministicEl);
+   if (lcctypeEl)
+   collctype = defGetString(lcctypeEl);
 
-   if (versionEl)
-   collversion = defGetString(versionEl);
+   if (providerEl)
+   collproviderstr = defGetString(providerEl);
 
-   if (collproviderstr)
-   {
-   if (pg_strcasecmp(collproviderstr, "icu") == 0)
-   collprovider = COLLPROVIDER_ICU;
-   else if (pg_strcasecmp(collproviderstr, "libc") == 0)
-   collprovider = COLLPROVIDER_LIBC;
+   if (deterministicEl)
+   collisdeterministic = defGetBoolean(deterministicEl);
+   else
+   collisdeterministic = true;
+
+   if (versionEl)
+   collversion = defGetString(versionEl);
+
+   if (collproviderstr)
+   {
+   if (pg_strcasecmp(collproviderstr, "icu") == 0)
+   collprovider = COLLPROVIDER_ICU;
+   else if (pg_strcasecmp(collproviderstr, "libc") == 0)
+   collprovider = COLLPROVIDER_LIBC;
+   else
+   ereport(ERROR,
+   
(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+   

Re: ICU for global collation

2022-03-05 Thread Julien Rouhaud
Hi,

On Wed, Feb 16, 2022 at 03:25:40PM +0100, Peter Eisentraut wrote:
>
> All that preliminary work has been completed, so here is a new patch.
>
> There isn't actually much left here now except all the new DDL and
> command-line options to set this up and documentation for those.  I have
> given all that another review and I hope it's more intuitive now, but I
> guess there will be other opinions.

Sorry it took me a bit of time to come back to this patch.

TL;DR it works as expected.  I just have a few comments, mostly on the doc.

I say it works because I did manually check, as far as I can see there isn't
any test that ensures it.

I'm using this naive scenario:

DROP DATABASE IF EXISTS dbicu;
CREATE DATABASE dbicu LOCALE_PROVIDER icu LOCALE 'en_US' ICU_LOCALE 
'en-u-kf-upper' template 'template0';
\c dbicu
CREATE COLLATION upperfirst (provider = icu, locale = 'en-u-kf-upper');
CREATE TABLE icu(def text, en text COLLATE "en_US", upfirst text COLLATE 
upperfirst);
INSERT INTO icu VALUES ('a', 'a', 'a'), ('b','b','b'), ('A','A','A'), 
('B','B','B');
SELECT def AS def FROM icu ORDER BY def;
SELECT def AS en FROM icu ORDER BY en;
SELECT def AS upfirst FROM icu ORDER BY upfirst;
SELECT def AS upfirst_explicit FROM icu ORDER BY en COLLATE upperfirst;
SELECT def AS en_x_explicit FROM icu ORDER BY def COLLATE "en-x-icu";

Maybe there should be some test along those lines included in the patch?

> I have changed the terminology a bit to match ICU better.  It's now called
> "ICU locale ID" and "locale provider" (instead of "collation"). It might
> actually cover things that are not strictly collations (such as the isalpha
> stuff in text search, maybe, in the future).

I'm not sure that's actually such an improvement as-is.  Simply saying "ICU
locale ID" is, at least for me, somewhat ambiguous as I don't see any place in
our docs where it's clearly defined.  I'm afraid that users might confuse it
with the OID of a pg_collation line, or even a collation name (like en-x-icu),
especially since there's no simple way to know if what you used means what you
thought it meant.

Also, it's not even used consistently in the patch.  I can see e.g. "ICU
locale" or "ICU locale setting" being used:

+ 
+  icu_locale
+  
+   
+Specifies the ICU locale if the ICU locale provider is used.  If
+this is not specified, the value from the LOCALE
+option is used.
+   
+  
+ 

+  --icu-locale=locale
+  
+   
+Specifies the ICU locale setting to be used in this database, if the
+ICU locale provider is selected.
+   

Maybe we could point to the ICU documentation for a clear notion of what a
locale ID is, and/or use their own short definition [1]:

> The locale object in ICU is an identifier that specifies a particular locale
> and has fields for language, country, and an optional code to specify further
> variants or subdivisions. These fields also can be represented as a string
> with the fields separated by an underscore.

It seems critical to be crystal clear about what should be specified there.

I spent some time looking at the ICU api trying to figure out if using a
posix locale name (e.g. en_US) was actually compatible with an ICU locale name.
It seems that ICU accept any of 'en-us', 'en-US', 'en_us' or 'en_US' as the
same locale, but I might be wrong.  I also didn't find a way to figure out how
to ask ICU if the locale identifier passed is complete garbage or not.  One
sure thing is that the system collation we import are of the form 'en-us', so
it seems weird to have this form in pg_collation and by default another form in
pg_database.

All in all I'm a bit worried of having the icu_locale optional.  Note that this
is inconsistent with createdb(), as there's at least one code path where the
icu_locale is not optional:

+   if (dblocprovider == COLLPROVIDER_ICU)
+   {
+   /*
+* This would happen if template0 uses the libc provider but the new
+* database uses icu.
+*/
+   if (!dbiculocale)
+   ereport(ERROR,
+   (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+errmsg("ICU locale must be specified")));
+   }

>
> One thing that is left that bothers me is that the invocations of
> get_collation_actual_version() have all gotten quite complicated.  I'm
> thinking about ways to refactor that, but I haven't got a great idea yet.

Indeed, and I don't have a great idea either.  Maybe
get_collation_actual_version_extended that would accept both strings?


In CREATE DATABASE manual:

+Specifies the provider to use for the default collation in this
+database.  Possible values are:
+icu,ICU
+libc.  libc is the default.  The
+available choices depend on the operating system and build options.

That's actually not true as pg_strcasecmp is used in createdb():

+   if (pg_strcasecmp(locproviderstr, "icu") == 0)
+   dblocprovider = 

Re: ICU for global collation

2022-02-16 Thread Peter Eisentraut

On 02.02.22 14:01, Peter Eisentraut wrote:
Here is the main patch rebased on the various changes that have been 
committed in the meantime.  There is still some work to be done on the 
user interfaces of initdb, createdb, etc.


I have split out the database-level collation version tracking into a 
separate patch [0].  I think we should get that one in first and then 
refresh this one.


All that preliminary work has been completed, so here is a new patch.

There isn't actually much left here now except all the new DDL and 
command-line options to set this up and documentation for those.  I have 
given all that another review and I hope it's more intuitive now, but I 
guess there will be other opinions.


I have changed the terminology a bit to match ICU better.  It's now 
called "ICU locale ID" and "locale provider" (instead of "collation"). 
It might actually cover things that are not strictly collations (such as 
the isalpha stuff in text search, maybe, in the future).


One thing that is left that bothers me is that the invocations of 
get_collation_actual_version() have all gotten quite complicated.  I'm 
thinking about ways to refactor that, but I haven't got a great idea yet.From 6d265e2cf78546bcc25d03031ea03f397f1c1c1b Mon Sep 17 00:00:00 2001
From: Peter Eisentraut 
Date: Wed, 16 Feb 2022 15:11:04 +0100
Subject: [PATCH v5] Add option to use ICU as global locale provider

This adds the option to use ICU as the default locale provider for
either the whole cluster or a database.  New options for initdb,
createdb, and CREATE DATABASE are used to select this.

Discussion: 
https://www.postgresql.org/message-id/flat/5e756dd6-0e91-d778-96fd-b1bcb06c161a%402ndquadrant.com

XXX catversion bump
---
 doc/src/sgml/catalogs.sgml|   9 +
 doc/src/sgml/ref/create_database.sgml |  27 +++
 doc/src/sgml/ref/createdb.sgml|  19 ++
 doc/src/sgml/ref/initdb.sgml  |  76 ++--
 src/backend/catalog/pg_collation.c|  18 +-
 src/backend/commands/collationcmds.c  |  96 +++
 src/backend/commands/dbcommands.c | 162 +++---
 src/backend/utils/adt/pg_locale.c | 144 ++--
 src/backend/utils/init/postinit.c |  21 ++-
 src/bin/initdb/Makefile   |   4 +-
 src/bin/initdb/initdb.c   | 110 ++--
 src/bin/initdb/t/001_initdb.pl|  25 +++
 src/bin/pg_dump/pg_dump.c |  31 +++-
 src/bin/pg_upgrade/check.c|  13 ++
 src/bin/pg_upgrade/info.c |  18 +-
 src/bin/pg_upgrade/pg_upgrade.h   |   2 +
 src/bin/psql/describe.c   |  23 ++-
 src/bin/psql/tab-complete.c   |   3 +-
 src/bin/scripts/Makefile  |   2 +
 src/bin/scripts/createdb.c|  20 +++
 src/bin/scripts/t/020_createdb.pl |  28 +++
 src/include/catalog/pg_collation.dat  |   3 +-
 src/include/catalog/pg_collation.h|  20 ++-
 src/include/catalog/pg_database.dat   |   4 +-
 src/include/catalog/pg_database.h |   6 +
 src/include/utils/pg_locale.h |   5 +
 .../regress/expected/collate.icu.utf8.out |  10 +-
 src/test/regress/sql/collate.icu.utf8.sql |   8 +-
 28 files changed, 741 insertions(+), 166 deletions(-)

diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml
index 5a1627a394..8fde32dfac 100644
--- a/doc/src/sgml/catalogs.sgml
+++ b/doc/src/sgml/catalogs.sgml
@@ -2384,6 +2384,15 @@ pg_collation 
Columns
   
  
 
+ 
+  
+   colliculocale text
+  
+  
+   ICU locale ID for this collation object
+  
+ 
+
  
   
collversion text
diff --git a/doc/src/sgml/ref/create_database.sgml 
b/doc/src/sgml/ref/create_database.sgml
index f70d0c75b4..544c1cb443 100644
--- a/doc/src/sgml/ref/create_database.sgml
+++ b/doc/src/sgml/ref/create_database.sgml
@@ -28,6 +28,8 @@
[ LOCALE [=] locale ]
[ LC_COLLATE [=] lc_collate ]
[ LC_CTYPE [=] lc_ctype ]
+   [ ICU_LOCALE [=] icu_locale ]
+   [ LOCALE_PROVIDER [=] locale_provider ]
[ COLLATION_VERSION = collation_version ]
[ TABLESPACE [=] tablespace_name ]
[ ALLOW_CONNECTIONS [=] allowconn ]
@@ -160,6 +162,31 @@ Parameters
   
  
 
+ 
+  icu_locale
+  
+   
+Specifies the ICU locale if the ICU locale provider is used.  If
+this is not specified, the value from the LOCALE
+option is used.
+   
+  
+ 
+
+ 
+  locale_provider
+
+  
+   
+Specifies the provider to use for the default collation in this
+database.  Possible values are:
+icu,ICU
+libc.  libc is the default.  The
+available choices depend on the operating system and build options.
+   
+  
+ 
+
  

Re: ICU for global collation

2022-02-02 Thread Peter Eisentraut


On 27.01.22 09:10, Peter Eisentraut wrote:

On 21.01.22 17:13, Julien Rouhaud wrote:

On Fri, Jan 21, 2022 at 03:24:02PM +0100, Peter Eisentraut wrote:

On 21.01.22 14:51, Julien Rouhaud wrote:
Is that change intended?  There isn't any usage of the 
collversionstr before

the possible error when actual_versionstr is missing.


I wanted to move it closer to the SysCacheGetAttr() where the "datum" 
value
is obtained.  It seemed weird to get the datum, then do other things, 
then

decode the datum.


Oh ok.  It won't make much difference performance-wise, so no objection.


I have committed this and will provide follow-up patches in the next few 
days.


Here is the main patch rebased on the various changes that have been 
committed in the meantime.  There is still some work to be done on the 
user interfaces of initdb, createdb, etc.


I have split out the database-level collation version tracking into a 
separate patch [0].  I think we should get that one in first and then 
refresh this one.


[0]: 
https://www.postgresql.org/message-id/flat/f0ff3190-29a3-5b39-a179-fa32eee57db6%40enterprisedb.comFrom 4028980f6be3662c0302575ed92de77e941e5a9e Mon Sep 17 00:00:00 2001
From: Peter Eisentraut 
Date: Wed, 2 Feb 2022 13:54:04 +0100
Subject: [PATCH v4] Add option to use ICU as global collation provider

This adds the option to use ICU as the default collation provider for
either the whole cluster or a database.  New options for initdb,
createdb, and CREATE DATABASE are used to select this.

Discussion: 
https://www.postgresql.org/message-id/flat/5e756dd6-0e91-d778-96fd-b1bcb06c161a%402ndquadrant.com
---
 doc/src/sgml/catalogs.sgml|   9 +
 doc/src/sgml/ref/create_database.sgml |  16 ++
 doc/src/sgml/ref/createdb.sgml|   9 +
 doc/src/sgml/ref/initdb.sgml  |  23 ++
 src/backend/catalog/pg_collation.c|  18 +-
 src/backend/commands/collationcmds.c  |  93 ---
 src/backend/commands/dbcommands.c |  72 +-
 src/backend/utils/adt/pg_locale.c | 242 +++---
 src/backend/utils/init/postinit.c |  26 ++
 src/bin/initdb/Makefile   |   2 +
 src/bin/initdb/initdb.c   |  64 -
 src/bin/initdb/t/001_initdb.pl|  18 +-
 src/bin/pg_dump/pg_dump.c |  19 ++
 src/bin/pg_upgrade/check.c|  10 +
 src/bin/pg_upgrade/info.c |  18 +-
 src/bin/pg_upgrade/pg_upgrade.h   |   2 +
 src/bin/psql/describe.c   |  23 +-
 src/bin/psql/tab-complete.c   |   2 +-
 src/bin/scripts/Makefile  |   2 +
 src/bin/scripts/createdb.c|   9 +
 src/bin/scripts/t/020_createdb.pl |  20 +-
 src/include/catalog/pg_collation.dat  |   3 +-
 src/include/catalog/pg_collation.h|   6 +-
 src/include/catalog/pg_database.dat   |   4 +-
 src/include/catalog/pg_database.h |   6 +
 src/include/utils/pg_locale.h |   6 +
 .../regress/expected/collate.icu.utf8.out |  10 +-
 src/test/regress/sql/collate.icu.utf8.sql |   8 +-
 28 files changed, 572 insertions(+), 168 deletions(-)

diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml
index 7d5b0b1656..5a5779b9a3 100644
--- a/doc/src/sgml/catalogs.sgml
+++ b/doc/src/sgml/catalogs.sgml
@@ -2384,6 +2384,15 @@ pg_collation 
Columns
   
  
 
+ 
+  
+   collicucoll text
+  
+  
+   ICU collation string
+  
+ 
+
  
   
collversion text
diff --git a/doc/src/sgml/ref/create_database.sgml 
b/doc/src/sgml/ref/create_database.sgml
index f22e28dc81..403010cddf 100644
--- a/doc/src/sgml/ref/create_database.sgml
+++ b/doc/src/sgml/ref/create_database.sgml
@@ -28,6 +28,7 @@
[ LOCALE [=] locale ]
[ LC_COLLATE [=] lc_collate ]
[ LC_CTYPE [=] lc_ctype ]
+   [ COLLATION_PROVIDER [=] collation_provider ]
[ TABLESPACE [=] tablespace_name ]
[ ALLOW_CONNECTIONS [=] allowconn ]
[ CONNECTION LIMIT [=] connlimit ]
@@ -158,6 +159,21 @@ Parameters

   
  
+
+ 
+  collation_provider
+
+  
+   
+Specifies the provider to use for the default collation in this
+database.  Possible values are:
+icu,ICU
+libc.  libc is the default.  The
+available choices depend on the operating system and build options.
+   
+  
+ 
+
  
   tablespace_name
   
diff --git a/doc/src/sgml/ref/createdb.sgml b/doc/src/sgml/ref/createdb.sgml
index 86473455c9..4b07363fcc 100644
--- a/doc/src/sgml/ref/createdb.sgml
+++ b/doc/src/sgml/ref/createdb.sgml
@@ -83,6 +83,15 @@ Options
   
  
 
+ 
+  
--collation-provider={libc|icu}
+  
+   
+Specifies the collation provider for the database's default

Re: ICU for global collation

2022-01-27 Thread Peter Eisentraut

On 21.01.22 17:13, Julien Rouhaud wrote:

On Fri, Jan 21, 2022 at 03:24:02PM +0100, Peter Eisentraut wrote:

On 21.01.22 14:51, Julien Rouhaud wrote:

Is that change intended?  There isn't any usage of the collversionstr before
the possible error when actual_versionstr is missing.


I wanted to move it closer to the SysCacheGetAttr() where the "datum" value
is obtained.  It seemed weird to get the datum, then do other things, then
decode the datum.


Oh ok.  It won't make much difference performance-wise, so no objection.


I have committed this and will provide follow-up patches in the next few 
days.





Re: ICU for global collation

2022-01-21 Thread Julien Rouhaud
On Fri, Jan 21, 2022 at 03:24:02PM +0100, Peter Eisentraut wrote:
> On 21.01.22 14:51, Julien Rouhaud wrote:
> > Is that change intended?  There isn't any usage of the collversionstr before
> > the possible error when actual_versionstr is missing.
> 
> I wanted to move it closer to the SysCacheGetAttr() where the "datum" value
> is obtained.  It seemed weird to get the datum, then do other things, then
> decode the datum.

Oh ok.  It won't make much difference performance-wise, so no objection.




Re: ICU for global collation

2022-01-21 Thread Peter Eisentraut

On 21.01.22 14:51, Julien Rouhaud wrote:

 From 1c46bf3138ad42074971aa3130142236de7e63f7 Mon Sep 17 00:00:00 2001
From: Peter Eisentraut 
Date: Fri, 21 Jan 2022 10:01:25 +0100
Subject: [PATCH] Change collate and ctype fields to type text


+   collversionstr = TextDatumGetCString(datum);
+
actual_versionstr = 
get_collation_actual_version(collform->collprovider, collcollate);
if (!actual_versionstr)
{
@@ -1606,7 +1616,6 @@ pg_newlocale_from_collation(Oid collid)
(errmsg("collation \"%s\" has no 
actual version, but a version was specified",

NameStr(collform->collname;
}
-   collversionstr = TextDatumGetCString(collversion);


Is that change intended?  There isn't any usage of the collversionstr before
the possible error when actual_versionstr is missing.


I wanted to move it closer to the SysCacheGetAttr() where the "datum" 
value is obtained.  It seemed weird to get the datum, then do other 
things, then decode the datum.





Re: ICU for global collation

2022-01-21 Thread Julien Rouhaud
Hi,

On Fri, Jan 21, 2022 at 10:44:24AM +0100, Peter Eisentraut wrote:
> 
> Here is a second preparation patch: Change collate and ctype fields to type
> text.
> 
> I think this is useful by itself because it allows longer locale names. ICU
> locale names with several options appended can be longer than 63 bytes.
> This case is probably broken right now.  Also, it saves space in the typical
> case, since most locale names are not anywhere near 63 bytes.

I totally agree.

> From 1c46bf3138ad42074971aa3130142236de7e63f7 Mon Sep 17 00:00:00 2001
> From: Peter Eisentraut 
> Date: Fri, 21 Jan 2022 10:01:25 +0100
> Subject: [PATCH] Change collate and ctype fields to type text

+   collversionstr = TextDatumGetCString(datum);
+
actual_versionstr = 
get_collation_actual_version(collform->collprovider, collcollate);
if (!actual_versionstr)
{
@@ -1606,7 +1616,6 @@ pg_newlocale_from_collation(Oid collid)
(errmsg("collation \"%s\" has 
no actual version, but a version was specified",

NameStr(collform->collname;
}
-   collversionstr = TextDatumGetCString(collversion);


Is that change intended?  There isn't any usage of the collversionstr before
the possible error when actual_versionstr is missing.

Apart from that the patch looks good to me, all tests pass and no issue with
building the doc either.




Re: ICU for global collation

2022-01-21 Thread Peter Eisentraut

On 18.01.22 13:54, Peter Eisentraut wrote:

On 18.01.22 05:02, Julien Rouhaud wrote:
If this is applied, then in my estimation all these hunks will 
completely
disappear from the global ICU patch.  So this would be a significant 
win.
Agreed, the patch will be quite smaller and also easier to review.  
Are you

planning to apply it on its own or add it to the default ICU patchset?


My plan is to apply this patch in the next few days.


This patch has been committed.

Here is a second preparation patch: Change collate and ctype fields to 
type text.


I think this is useful by itself because it allows longer locale names. 
ICU locale names with several options appended can be longer than 63 
bytes.  This case is probably broken right now.  Also, it saves space in 
the typical case, since most locale names are not anywhere near 63 bytes.From 1c46bf3138ad42074971aa3130142236de7e63f7 Mon Sep 17 00:00:00 2001
From: Peter Eisentraut 
Date: Fri, 21 Jan 2022 10:01:25 +0100
Subject: [PATCH] Change collate and ctype fields to type text

This changes the data type of the catalog fields datcollate, datctype,
collcollate, and collctype from name to text.  There wasn't ever a
really good reason for them to be of type name; presumably this was
just carried over from when they were fixed-size fields in pg_control,
first into the corresponding pg_database fields, and then to
pg_collation.  The values are not identifiers or object names, and we
don't ever look them up that way.

Changing to type text saves space in the typical case, since locale
names are typically less than 10 bytes long.  But it is also possible
that an ICU locale name with several customization options appended
could be longer than 63 bytes, so this also enables that case, which
was previously probably broken.

Discussion: 
https://www.postgresql.org/message-id/flat/5e756dd6-0e91-d778-96fd-b1bcb06c1...@2ndquadrant.com
---
 doc/src/sgml/catalogs.sgml   | 40 +--
 src/backend/catalog/pg_collation.c   | 10 ++-
 src/backend/commands/collationcmds.c | 41 +++-
 src/backend/commands/dbcommands.c| 21 ++
 src/backend/utils/adt/pg_locale.c| 29 +---
 src/backend/utils/init/postinit.c| 11 ++--
 src/include/catalog/pg_collation.h   |  4 +--
 src/include/catalog/pg_database.h| 12 
 src/include/utils/pg_locale.h|  2 ++
 9 files changed, 104 insertions(+), 66 deletions(-)

diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml
index 1e65c426b2..7d5b0b1656 100644
--- a/doc/src/sgml/catalogs.sgml
+++ b/doc/src/sgml/catalogs.sgml
@@ -2368,7 +2368,7 @@ pg_collation 
Columns
 
  
   
-   collcollate name
+   collcollate text
   
   
LC_COLLATE for this collation object
@@ -2377,7 +2377,7 @@ pg_collation 
Columns
 
  
   
-   collctype name
+   collctype text
   
   
LC_CTYPE for this collation object
@@ -2951,24 +2951,6 @@ pg_database 
Columns
   
  
 
- 
-  
-   datcollate name
-  
-  
-   LC_COLLATE for this database
-  
- 
-
- 
-  
-   datctype name
-  
-  
-   LC_CTYPE for this database
-  
- 
-
  
   
datistemplate bool
@@ -3043,6 +3025,24 @@ pg_database 
Columns
   
  
 
+ 
+  
+   datcollate text
+  
+  
+   LC_COLLATE for this database
+  
+ 
+
+ 
+  
+   datctype text
+  
+  
+   LC_CTYPE for this database
+  
+ 
+
  
   
datacl aclitem[]
diff --git a/src/backend/catalog/pg_collation.c 
b/src/backend/catalog/pg_collation.c
index 5be6600652..bfc02d3038 100644
--- a/src/backend/catalog/pg_collation.c
+++ b/src/backend/catalog/pg_collation.c
@@ -58,9 +58,7 @@ CollationCreate(const char *collname, Oid collnamespace,
HeapTuple   tup;
Datum   values[Natts_pg_collation];
boolnulls[Natts_pg_collation];
-   NameDataname_name,
-   name_collate,
-   name_ctype;
+   NameDataname_name;
Oid oid;
ObjectAddress myself,
referenced;
@@ -163,10 +161,8 @@ CollationCreate(const char *collname, Oid collnamespace,
values[Anum_pg_collation_collprovider - 1] = CharGetDatum(collprovider);
values[Anum_pg_collation_collisdeterministic - 1] = 
BoolGetDatum(collisdeterministic);
values[Anum_pg_collation_collencoding - 1] = 
Int32GetDatum(collencoding);
-   namestrcpy(_collate, collcollate);
-   values[Anum_pg_collation_collcollate - 1] = NameGetDatum(_collate);
-   namestrcpy(_ctype, collctype);
-   values[Anum_pg_collation_collctype - 1] = NameGetDatum(_ctype);
+   values[Anum_pg_collation_collcollate - 1] = 
CStringGetTextDatum(collcollate);
+   values[Anum_pg_collation_collctype - 

Re: ICU for global collation

2022-01-18 Thread Peter Eisentraut

On 18.01.22 05:02, Julien Rouhaud wrote:

If this is applied, then in my estimation all these hunks will completely
disappear from the global ICU patch.  So this would be a significant win.

Agreed, the patch will be quite smaller and also easier to review.  Are you
planning to apply it on its own or add it to the default ICU patchset?


My plan is to apply this patch in the next few days.




Re: ICU for global collation

2022-01-17 Thread Julien Rouhaud
Hi,

On Thu, Jan 13, 2022 at 09:39:42AM +0100, Peter Eisentraut wrote:
> On 11.01.22 12:08, Julien Rouhaud wrote:
> > > So, unless there are concerns, I'm going to see about making a patch to 
> > > call
> > > pg_newlocale_from_collation() even with the default collation. That would
> > > make the actual feature patch quite a bit smaller, since we won't have to
> > > patch every call site of pg_newlocale_from_collation().
> > +1 for me!
> 
> Here is that patch.

The patch is quite straightforward so I don't have much to say, it all looks
good and passes all regression tests.

> If this is applied, then in my estimation all these hunks will completely
> disappear from the global ICU patch.  So this would be a significant win.

Agreed, the patch will be quite smaller and also easier to review.  Are you
planning to apply it on its own or add it to the default ICU patchset?  Given
the possible backpatch conflicts it can bring I'm not sure it's worthy enough
to apply on its own.




Re: ICU for global collation

2022-01-17 Thread Julien Rouhaud
Hi,

On Mon, Jan 17, 2022 at 07:07:38PM +, Finnerty, Jim wrote:
> On 10.01.22 12:49, Daniel Verite wrote:
> 
> > I think some users would want their db-wide ICU collation to be
> > case/accent-insensitive. 
> ...
> > IIRC, that was the context for some questions where people were
> > enquiring about db-wide ICU collations.
> 
> +1.  There is the DEFAULT_COLLATION_OID, which is the cluster-level default
> collation, a.k.a. the "global collation", as distinct from the "db-wide"
> database-level default collation, which controls the default type of the
> collatable types within that database.

There's no cluster-level default collation, and DEFAULT_COLLATION_OID is always
database-level (and template1 having a specific default collation).  The
template0 database is there to be able to support different databases with
different default collations, among other things.

So this patchset would allow per-database default ICU collation, although not
non-deterministic ones.




Re: ICU for global collation

2022-01-17 Thread Finnerty, Jim
On 10.01.22 12:49, Daniel Verite wrote:

> I think some users would want their db-wide ICU collation to be
> case/accent-insensitive. 
...
> IIRC, that was the context for some questions where people were
> enquiring about db-wide ICU collations.

+1.  There is the DEFAULT_COLLATION_OID, which is the cluster-level default 
collation, a.k.a. the "global collation", as distinct from the "db-wide" 
database-level default collation, which controls the default type of the 
collatable types within that database.

> With the current patch, it's not possible, AFAICS, because the user
> can't tell that the collation is non-deterministic. Presumably this
> would require another option to CREATE DATABASE and another
> column to store that bit of information.

On 1/11/22, 6:24 AM, "Peter Eisentraut"  
wrote:
   
>  Adding this would be easy, but since pattern matching currently does not
>  support nondeterministic collations, if you make a global collation
>  nondeterministic, a lot of system views, psql, pg_dump queries etc.
>  would break, so it's not practical.  I view this is an orthogonal
>  project.  Once we can support this without breaking system views etc.,
>  then it's easy to enable with a new column in pg_database.

So this patch only enables the default cluster collation 
(DEFAULT_COLLATION_OID) to be a deterministic ICU collation, but doesn't extend 
the metadata in a way that would enable database collations to be ICU 
collations?

Waiting for the pattern matching problem to be solved in general before 
creating the metadata to support ICU collations everywhere will make it more 
difficult for members of the community to help solve the pattern matching 
problem.  

What additional metadata changes would be required to enable an ICU collation 
to be specified at either the cluster-level or the database-level, even if new 
checks need to be added to disallow a nondeterministic collation to be 
specified at the cluster level for now?




Re: ICU for global collation

2022-01-17 Thread Finnerty, Jim
Re: 
>> After this patch goes in, the big next thing would be to support
>> nondeterministic collations for LIKE, ILIKE and pattern matching operators in
>> general.  Is anyone interested in working on that?

> As far as I know you're the last person that seemed to be working on that 
> topic
> back in March :)

I have a solution for LIKE and ILIKE for case-insensitive, accent-sensitive 
ICU collations and the UTF8 server encoding, but didn't attempt to address the 
general case until ICU collations were supported at the database and cluster 
levels.  I may have another look at that after this patch goes in.




Re: ICU for global collation

2022-01-13 Thread Peter Eisentraut

On 11.01.22 12:08, Julien Rouhaud wrote:

So, unless there are concerns, I'm going to see about making a patch to call
pg_newlocale_from_collation() even with the default collation. That would
make the actual feature patch quite a bit smaller, since we won't have to
patch every call site of pg_newlocale_from_collation().

+1 for me!


Here is that patch.

If this is applied, then in my estimation all these hunks will 
completely disappear from the global ICU patch.  So this would be a 
significant win.From 2ee4b77221020e81ec83cf37d36e3955bf619d80 Mon Sep 17 00:00:00 2001
From: Peter Eisentraut 
Date: Thu, 13 Jan 2022 09:14:41 +0100
Subject: [PATCH] Call pg_newlocale_from_collation() also with default
 collation

Previously, callers of pg_newlocale_from_collation() did not call it
if the collation was DEFAULT_COLLATION_OID and instead proceeded with
a pg_locale_t of 0.  Instead, now we call it anyway and have it return
0 if the default collation was passed.  It already did this, so we
just have to adjust the callers.  This simplifies all the call sites
and also makes future enhancements easier.

After discussion and testing, the previous comment in pg_locale.c
about avoiding this for performance reasons may have been mistaken
since it was testing a very different patch version way back when.

Discussion: 
https://www.postgresql.org/message-id/ed3baa81-7fac-7788-cc12-41e3f7917...@enterprisedb.com
---
 src/backend/access/hash/hashfunc.c   |  4 +-
 src/backend/regex/regc_pg_locale.c   | 40 ++--
 src/backend/utils/adt/formatting.c   | 96 +---
 src/backend/utils/adt/like.c | 37 ++-
 src/backend/utils/adt/like_support.c | 27 
 src/backend/utils/adt/pg_locale.c|  3 -
 src/backend/utils/adt/varchar.c  | 26 +---
 src/backend/utils/adt/varlena.c  | 34 ++
 8 files changed, 135 insertions(+), 132 deletions(-)

diff --git a/src/backend/access/hash/hashfunc.c 
b/src/backend/access/hash/hashfunc.c
index 0521c69dd5..b57ed946c4 100644
--- a/src/backend/access/hash/hashfunc.c
+++ b/src/backend/access/hash/hashfunc.c
@@ -278,7 +278,7 @@ hashtext(PG_FUNCTION_ARGS)
 errmsg("could not determine which collation to 
use for string hashing"),
 errhint("Use the COLLATE clause to set the 
collation explicitly.")));
 
-   if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID)
+   if (!lc_collate_is_c(collid))
mylocale = pg_newlocale_from_collation(collid);
 
if (!mylocale || mylocale->deterministic)
@@ -334,7 +334,7 @@ hashtextextended(PG_FUNCTION_ARGS)
 errmsg("could not determine which collation to 
use for string hashing"),
 errhint("Use the COLLATE clause to set the 
collation explicitly.")));
 
-   if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID)
+   if (!lc_collate_is_c(collid))
mylocale = pg_newlocale_from_collation(collid);
 
if (!mylocale || mylocale->deterministic)
diff --git a/src/backend/regex/regc_pg_locale.c 
b/src/backend/regex/regc_pg_locale.c
index e0d93eab32..6e84f42cb2 100644
--- a/src/backend/regex/regc_pg_locale.c
+++ b/src/backend/regex/regc_pg_locale.c
@@ -231,6 +231,18 @@ static const unsigned char pg_char_properties[128] = {
 void
 pg_set_regex_collation(Oid collation)
 {
+   if (!OidIsValid(collation))
+   {
+   /*
+* This typically means that the parser could not resolve a
+* conflict of implicit collations, so report it that way.
+*/
+   ereport(ERROR,
+   (errcode(ERRCODE_INDETERMINATE_COLLATION),
+errmsg("could not determine which collation to 
use for regular expression"),
+errhint("Use the COLLATE clause to set the 
collation explicitly.")));
+   }
+
if (lc_ctype_is_c(collation))
{
/* C/POSIX collations use this path regardless of database 
encoding */
@@ -240,28 +252,12 @@ pg_set_regex_collation(Oid collation)
}
else
{
-   if (collation == DEFAULT_COLLATION_OID)
-   pg_regex_locale = 0;
-   else if (OidIsValid(collation))
-   {
-   /*
-* NB: pg_newlocale_from_collation will fail if not 
HAVE_LOCALE_T;
-* the case of pg_regex_locale != 0 but not 
HAVE_LOCALE_T does not
-* have to be considered below.
-*/
-   pg_regex_locale = 
pg_newlocale_from_collation(collation);
-   }
-   else
-   {
-   /*
-* This typically means that the parser could not 
resolve a
-* conflict of implicit collations, so 

Re: ICU for global collation

2022-01-11 Thread Julien Rouhaud
On Tue, Jan 11, 2022 at 12:36:46PM +0100, Daniel Verite wrote:
> 
> If CREATE DATABASE referred to a collation in the template db,
> either that collation already exists, or the user would have to add it
> to the template db with CREATE COLLATION.
> initdb already populates the template databases with a full set of
> ICU collations through pg_import_system_collations().
> I don't quite see what change you're seeing that would be needed in
> initdb.

Yes, there are already the system collation imported.  But you still need to
make sure that that collation does exist in the template database, and it's
still impossible to connect to that database to check when processing the
CREATE DATABASE.  Also, if the wanted collation wasn't imported by
pg_import_system_collations() and isn't the server's default collation, then
the user would have to allow connection on the template0 database and create
the wanted collation there.  It doesn't seem like something that should be
recommended for a reasonably standard use case.




Re: ICU for global collation

2022-01-11 Thread Daniel Verite
Julien Rouhaud wrote:

> > I guess there's still the possibility of requiring that the ICU db-wide
> > collation of the new database does exist in the template database,
> > and then the CREATE DATABASE would refer to that collation instead of
> > an independent locale string.
> 
> That could work.  However if having the collation in the template database a
> strict requirement the something should also be done for initdb, and it will
> probably be a bigger problem.

If CREATE DATABASE referred to a collation in the template db,
either that collation already exists, or the user would have to add it
to the template db with CREATE COLLATION.
initdb already populates the template databases with a full set of
ICU collations through pg_import_system_collations().
I don't quite see what change you're seeing that would be needed in
initdb.


Best regards,
-- 
Daniel Vérité
PostgreSQL-powered mailer: https://www.manitou-mail.org
Twitter: @DanielVerite




Re: ICU for global collation

2022-01-11 Thread Peter Eisentraut

On 10.01.22 12:49, Daniel Verite wrote:

With the current patch, it's not possible, AFAICS, because the user
can't tell that the collation is non-deterministic. Presumably this
would require another option to CREATE DATABASE and another
column to store that bit of information.


Adding this would be easy, but since pattern matching currently does not 
support nondeterministic collations, if you make a global collation 
nondeterministic, a lot of system views, psql, pg_dump queries etc. 
would break, so it's not practical.  I view this is an orthogonal 
project.  Once we can support this without breaking system views etc., 
then it's easy to enable with a new column in pg_database.



The "daticucol" column also suggests that we don't expect to add
other collation providers in the future. Maybe a pair of columns like
(datcollprovider, datcolllocale) would be more future-proof,
or a (datcollprovider, datcolllocale, datcollisdeterministic)
triplet if non-deterministic collations are allowed.


I don't expect many new collation providers.  So I don't think an 
EAV-like storage would be helpful.  The other problem is that we don't 
know what we need.  For example, the libc provider needs both a collate 
and a ctype value, so that wouldn't fit into that scheme nicely.



Also, pg_collation has "collversion" to detect a mismatch between
the ICU runtime and existing indexes. I don't see that field
for the db-wide ICU collation, so maybe we currently miss the capability
to detect the mismatch for the db-wide collation?


Yeah, I think I need to add a datcollversion field and the associated 
checks.





Re: ICU for global collation

2022-01-11 Thread Peter Eisentraut



On 07.01.22 10:03, Julien Rouhaud wrote:

I changed the datcollate, datctype, and the new daticucoll fields to type
text (from name).  That way, the daticucoll field can be set to null if it's
not applicable.  Also, the limit of 63 characters can actually be a problem
if you want to use some combination of the options that ICU locales offer.
And for less extreme uses, having variable-length fields will save some
storage, since typical locale names are much shorter.


I understand the need to have daticucoll as text, however it's not clear to me
why this has to be changed for datcollate and datctype?  IIUC those will only
ever contain libc-based collation and are still mandatory?


Right.  I just did this for consistency.  It would be strange otherwise 
to have some fields as name and some as text.  Arguably, using "name" 
here was wrong to begin with, since they are not really object names. 
Maybe there used to be a reason to avoid variable-length fields in 
pg_database, but there isn't one now AFAICT.



- pg_upgrade

It checks (in check_locale_and_encoding()) the compatibility for each database,
and it looks like the daticucoll field should also be verified.  Other than
that I don't think there is anything else needed for the pg_upgrade part as
everything else should be handled by pg_dump (I didn't look at the changes yet
given the below problems).


Ok, I have added this and will include it in my next patch submission.


- CREATE DATABASE



- initdb


Ok, some work is needed to make these interfaces behave sensibly in 
various combinations.  I will look into that.





Re: ICU for global collation

2022-01-11 Thread Julien Rouhaud
On Tue, Jan 11, 2022 at 10:10:25AM +0100, Peter Eisentraut wrote:
> 
> On 10.01.22 07:00, Julien Rouhaud wrote:
> > 
> > So I tried to run Noah's benchmark to see if I could reproduce the slowdown.
> > Unfortunately the results I'm getting don't really make sense as removing 
> > the
> > optimisation brings a 15% speedup, and with a few more runs I can see that I
> > have about 25% noise, so there isn't much I can do to help.
> 
> Heh, I had that same experience, it actually got faster without the
> optimization, but then got lost in the noise on further testing.

Ah, so it's not just my machine :)

> Looking back at those discussions, I don't think those old test results are
> relevant anymore.  In the patch that was being tested there,
> pg_newlocale_from_collation(), did not contain
> 
> if (collid == DEFAULT_COLLATION_OID)
> return (pg_locale_t) 0;
> 
> so the default collation actually went through most or all of the function
> and did a lot of work.  That would understandably be quite slow.  But just
> calling a function and returning immediately should not be a problem.
> Otherwise, the call to check_collation_set() in varstr_cmp() and elsewhere
> would be just as bad.

I didn't noticed that.  That definitely explain why the performance concern
isn't valid anymore.

> So, unless there are concerns, I'm going to see about making a patch to call
> pg_newlocale_from_collation() even with the default collation. That would
> make the actual feature patch quite a bit smaller, since we won't have to
> patch every call site of pg_newlocale_from_collation().

+1 for me!




Re: ICU for global collation

2022-01-11 Thread Peter Eisentraut



On 10.01.22 07:00, Julien Rouhaud wrote:

And then I changed in varstr_cmp():

 if (collid != DEFAULT_COLLATION_OID)
 mylocale = pg_newlocale_from_collation(collid);

to just

 mylocale = pg_newlocale_from_collation(collid);

I find that the \timing results are indistinguishable.  (I used locale
"en_US.UTF-8" and made sure that that code path is actually hit.)

Does anyone have other insights?


Looking at the git history, you added this comment in 414c5a2ea65.

After a bit a digging in the lists, I found that you introduced it to fix a
reported 13% slowdown in varstr_cmp():
https://www.postgresql.org/message-id/20110129075253.GA18784%40tornado.leadboat.com
https://www.postgresql.org/message-id/1296748408.6442.1.camel%40vanquo.pezone.net


So I tried to run Noah's benchmark to see if I could reproduce the slowdown.
Unfortunately the results I'm getting don't really make sense as removing the
optimisation brings a 15% speedup, and with a few more runs I can see that I
have about 25% noise, so there isn't much I can do to help.


Heh, I had that same experience, it actually got faster without the 
optimization, but then got lost in the noise on further testing.


Looking back at those discussions, I don't think those old test results 
are relevant anymore.  In the patch that was being tested there, 
pg_newlocale_from_collation(), did not contain


if (collid == DEFAULT_COLLATION_OID)
return (pg_locale_t) 0;

so the default collation actually went through most or all of the 
function and did a lot of work.  That would understandably be quite 
slow.  But just calling a function and returning immediately should not 
be a problem.  Otherwise, the call to check_collation_set() in 
varstr_cmp() and elsewhere would be just as bad.


So, unless there are concerns, I'm going to see about making a patch to 
call pg_newlocale_from_collation() even with the default collation. 
That would make the actual feature patch quite a bit smaller, since we 
won't have to patch every call site of pg_newlocale_from_collation().





Re: ICU for global collation

2022-01-10 Thread Julien Rouhaud
On Mon, Jan 10, 2022 at 03:45:47PM +0100, Daniel Verite wrote:
> 
> By that I understand that CREATE DATABASE is limited to copying a template
> database and then not write anything into it beyond that, as it's
> not even connected to it.

Yes.

> I guess there's still the possibility of requiring that the ICU db-wide
> collation of the new database does exist in the template database,
> and then the CREATE DATABASE would refer to that collation instead of
> an independent locale string.

That could work.  However if having the collation in the template database a
strict requirement the something should also be done for initdb, and it will
probably be a bigger problem.




Re: ICU for global collation

2022-01-10 Thread Daniel Verite
Julien Rouhaud wrote:

> > The lack of these fields overall suggest the idea that when CREATE
> > DATABASE is called with a global ICU collation, what if it somehow
> > inserted the collation into pg_collation in the new database?
> > Then pg_database would just store the collation oid and no other
> > collation-related field would need to be added into it, now
> > or in the future.
> 
> I don't think it would be doable given the single-database-per-backend
> restriction.

By that I understand that CREATE DATABASE is limited to copying a template
database and then not write anything into it beyond that, as it's
not even connected to it.
I guess there's still the possibility of requiring that the ICU db-wide
collation of the new database does exist in the template database,
and then the CREATE DATABASE would refer to that collation instead of
an independent locale string.


Best regards,
-- 
Daniel Vérité
PostgreSQL-powered mailer: https://www.manitou-mail.org
Twitter: @DanielVerite




Re: ICU for global collation

2022-01-10 Thread Julien Rouhaud
On Mon, Jan 10, 2022 at 12:49:07PM +0100, Daniel Verite wrote:
> 
> The "daticucol" column also suggests that we don't expect to add
> other collation providers in the future. Maybe a pair of columns like
> (datcollprovider, datcolllocale) would be more future-proof,
> or a (datcollprovider, datcolllocale, datcollisdeterministic)
> triplet if non-deterministic collations are allowed.

I'm not sure about the non-deterministic default collation given the current
restrictions with it, but the extra column seems like a good idea.  It would
require a bit more thinking, as we would need a second collation column in
pg_database for any default provider that's not libc.

> Also, pg_collation has "collversion" to detect a mismatch between
> the ICU runtime and existing indexes. I don't see that field
> for the db-wide ICU collation, so maybe we currently miss the capability
> to detect the mismatch for the db-wide collation?

I don't think that storing a version there will really help.  There's no
guarantee that any object has been created with the version of the collation
that was installed when the database was created.  And we would still need
to store a version with each underlying object anyway, as rebuilding all broken
dependencies can last for a long time, including a server restart.

> The lack of these fields overall suggest the idea that when CREATE
> DATABASE is called with a global ICU collation, what if it somehow
> inserted the collation into pg_collation in the new database?
> Then pg_database would just store the collation oid and no other
> collation-related field would need to be added into it, now
> or in the future.

I don't think it would be doable given the single-database-per-backend
restriction.




Re: ICU for global collation

2022-01-10 Thread Daniel Verite
Peter Eisentraut wrote:

> Unlike in the previous patch, where the ICU 
> collation name was written in datcollate, there is now a third column 
> (daticucoll), so we can store all three values.

I think some users would want their db-wide ICU collation to be
case/accent-insensitive. Postgres users are trained to expect
case-sensitive comparisons, but some apps initially made for
e.g. MySQL or MS-SQL that use such collations natively would be easier
to port to Postgres.
IIRC, that was the context for some questions where people were
enquiring about db-wide ICU collations.

With the current patch, it's not possible, AFAICS, because the user
can't tell that the collation is non-deterministic. Presumably this
would require another option to CREATE DATABASE and another
column to store that bit of information.

The "daticucol" column also suggests that we don't expect to add
other collation providers in the future. Maybe a pair of columns like
(datcollprovider, datcolllocale) would be more future-proof,
or a (datcollprovider, datcolllocale, datcollisdeterministic)
triplet if non-deterministic collations are allowed.

Also, pg_collation has "collversion" to detect a mismatch between
the ICU runtime and existing indexes. I don't see that field
for the db-wide ICU collation, so maybe we currently miss the capability
to detect the mismatch for the db-wide collation?

The lack of these fields overall suggest the idea that when CREATE
DATABASE is called with a global ICU collation, what if it somehow
inserted the collation into pg_collation in the new database?
Then pg_database would just store the collation oid and no other
collation-related field would need to be added into it, now
or in the future.


Best regards,
-- 
Daniel Vérité
PostgreSQL-powered mailer: https://www.manitou-mail.org
Twitter: @DanielVerite




Re: ICU for global collation

2022-01-09 Thread Julien Rouhaud
On Mon, Jan 10, 2022 at 11:25:08AM +0800, Julien Rouhaud wrote:
> On Fri, Jan 07, 2022 at 03:25:28PM +0100, Peter Eisentraut wrote:
> > 
> > I tested this a bit.  I used the following setup:
> > 
> > create table t1 (a text);
> > insert into t1 select md5(generate_series(1, 1000)::text);
> > select count(*) from t1 where a > '';
> > 
> > And then I changed in varstr_cmp():
> > 
> > if (collid != DEFAULT_COLLATION_OID)
> > mylocale = pg_newlocale_from_collation(collid);
> > 
> > to just
> > 
> > mylocale = pg_newlocale_from_collation(collid);
> > 
> > I find that the \timing results are indistinguishable.  (I used locale
> > "en_US.UTF-8" and made sure that that code path is actually hit.)
> > 
> > Does anyone have other insights?
> 
> Looking at the git history, you added this comment in 414c5a2ea65.
> 
> After a bit a digging in the lists, I found that you introduced it to fix a
> reported 13% slowdown in varstr_cmp():
> https://www.postgresql.org/message-id/20110129075253.GA18784%40tornado.leadboat.com
> https://www.postgresql.org/message-id/1296748408.6442.1.camel%40vanquo.pezone.net

So I tried to run Noah's benchmark to see if I could reproduce the slowdown.
Unfortunately the results I'm getting don't really make sense as removing the
optimisation brings a 15% speedup, and with a few more runs I can see that I
have about 25% noise, so there isn't much I can do to help.




Re: ICU for global collation

2022-01-09 Thread Julien Rouhaud
On Fri, Jan 07, 2022 at 03:25:28PM +0100, Peter Eisentraut wrote:
> 
> I tested this a bit.  I used the following setup:
> 
> create table t1 (a text);
> insert into t1 select md5(generate_series(1, 1000)::text);
> select count(*) from t1 where a > '';
> 
> And then I changed in varstr_cmp():
> 
> if (collid != DEFAULT_COLLATION_OID)
> mylocale = pg_newlocale_from_collation(collid);
> 
> to just
> 
> mylocale = pg_newlocale_from_collation(collid);
> 
> I find that the \timing results are indistinguishable.  (I used locale
> "en_US.UTF-8" and made sure that that code path is actually hit.)
> 
> Does anyone have other insights?

Looking at the git history, you added this comment in 414c5a2ea65.

After a bit a digging in the lists, I found that you introduced it to fix a
reported 13% slowdown in varstr_cmp():
https://www.postgresql.org/message-id/20110129075253.GA18784%40tornado.leadboat.com
https://www.postgresql.org/message-id/1296748408.6442.1.camel%40vanquo.pezone.net




Re: ICU for global collation

2022-01-07 Thread Daniel Verite
Julien Rouhaud wrote:

> If you want a database with an ICU default collation the lc_collate
> and lc_ctype should inherit what's in the template database and not
> what was provided in the LOCALE I think.  You could still probably
> overload them in some scenario, but without a list of what isn't
> ICU-aware I can't really be sure of how often one might have to do
> it.

I guess we'd need that when creating a database with a different
encoding than the template databases, at least.

About what's not ICU-aware, I believe the most significant part in
core is the Full Text Search parser.
It doesn't care about sorting strings, but it relies on the functions
from POSIX , which depend on LC_CTYPE
(it looks however that this could be improved by following
what has been done in backend/regex/regc_pg_locale.c, which has
comparable needs and calls ICU functions when applicable).

Also, any extension is potentially concerned. Surely many
extensions call functions from ctype.h assuming that
the current value of LC_CTYPE works with the data they handle.


Best regards,
-- 
Daniel Vérité
PostgreSQL-powered mailer: https://www.manitou-mail.org
Twitter: @DanielVerite




Re: ICU for global collation

2022-01-07 Thread Peter Eisentraut

On 04.01.22 17:03, Peter Eisentraut wrote:
There are really a lot of places with this new code.  Maybe it could 
be some
new function/macro to wrap that for the normal case (e.g. not 
formatting.c)?


Right, we could just put this into pg_newlocale_from_collation(), but 
the comment there says


  * In fact, they shouldn't call this function at all when they are dealing
  * with the default locale.  That can save quite a bit in hotspots.

I don't know how to assess that.


I tested this a bit.  I used the following setup:

create table t1 (a text);
insert into t1 select md5(generate_series(1, 1000)::text);
select count(*) from t1 where a > '';

And then I changed in varstr_cmp():

if (collid != DEFAULT_COLLATION_OID)
mylocale = pg_newlocale_from_collation(collid);

to just

mylocale = pg_newlocale_from_collation(collid);

I find that the \timing results are indistinguishable.  (I used locale 
"en_US.UTF-8" and made sure that that code path is actually hit.)


Does anyone have other insights?




Re: ICU for global collation

2022-01-07 Thread Julien Rouhaud
Hi,

I looked a bit more in this patch and I have some additional remarks.

On Thu, Dec 30, 2021 at 01:07:21PM +0100, Peter Eisentraut wrote:
> 
> So this is a different approach: If you choose ICU as the default locale for
> a database, you still need to specify lc_ctype and lc_collate settings, as
> before.  Unlike in the previous patch, where the ICU collation name was
> written in datcollate, there is now a third column (daticucoll), so we can
> store all three values.  This fixes the described problem.  Other than that,
> once you get all the initial settings right, it basically just works: The
> places that have ICU support now will use a database-wide ICU collation if
> appropriate, the places that don't have ICU support continue to use the
> global libc locale settings.

So just to confirm a database can now have 2 different *default* collations: a
libc-based one for everything that doesn't work with ICU and a ICU-based (if
specified) for everything else, and the ICU-based is optional, so if not
provided everything works as before, using the libc based default collation.

As I mentioned I think this approach is sensible.  However, should we document
what are the things that are not ICU-aware?

> I changed the datcollate, datctype, and the new daticucoll fields to type
> text (from name).  That way, the daticucoll field can be set to null if it's
> not applicable.  Also, the limit of 63 characters can actually be a problem
> if you want to use some combination of the options that ICU locales offer.
> And for less extreme uses, having variable-length fields will save some
> storage, since typical locale names are much shorter.

I understand the need to have daticucoll as text, however it's not clear to me
why this has to be changed for datcollate and datctype?  IIUC those will only
ever contain libc-based collation and are still mandatory?
> 
> For the same reasons and to keep things consistent, I also changed the
> analogous pg_collation fields like that.

The respective fields in pg_collation are now nullable, so the changes there
sounds ok.

Digging a bit more in the patch here are some things that looks problematic.

- pg_upgrade

It checks (in check_locale_and_encoding()) the compatibility for each database,
and it looks like the daticucoll field should also be verified.  Other than
that I don't think there is anything else needed for the pg_upgrade part as
everything else should be handled by pg_dump (I didn't look at the changes yet
given the below problems).

- CREATE DATABASE

There's a new COLLATION_PROVIDER option, but the overall behavior seems quite
unintuitive.  As far as I can see the idea is to use LOCALE for the ICU default
collation, but as it's providing a default for the other values it's quite
annoying.  For instance:

=# CREATE DATABASE db COLLATION_PROVIDER icu LOCALE 'fr-x-icu' LC_COLLATE 
'en_GB.UTF-8';;
ERROR:  42809: invalid locale name: "fr-x-icu"
LOCATION:  createdb, dbcommands.c:397

Looking at the code it's actually complaining about LC_CTYPE.  If you want a
database with an ICU default collation the lc_collate and lc_ctype should
inherit what's in the template database and not what was provided in  the
LOCALE I think.  You could still probably overload them in some scenario, but
without a list of what isn't ICU-aware I can't really be sure of how often one
might have to do it.

Now, if I specify everything as needed it looks like it's missing some checks
on the ICU default collation when not using template0:

=# CREATE DATABASE db COLLATION_PROVIDER icu LOCALE 'en-x-icu' LC_COLLATE 
'en_GB.UTF-8' LC_CTYPE 'en_GB.UTF-8';;
CREATE DATABASE

=# SELECT datname, datcollate, datctype, daticucoll FROM pg_database ;
  datname  | datcollate  |  datctype   | daticucoll
---+-+-+
 postgres  | en_GB.UTF-8 | en_GB.UTF-8 | fr-x-icu
 db| en_GB.UTF-8 | en_GB.UTF-8 | en-x-icu
 template1 | en_GB.UTF-8 | en_GB.UTF-8 | fr-x-icu
 template0 | en_GB.UTF-8 | en_GB.UTF-8 | fr-x-icu
(4 rows)

Unless I'm missing something the same concerns about collation incompatibility
with objects in the source database should also apply for the ICU collation?

While at it, I'm not exactly sure of what the COLLATION_PROVIDER is supposed to
mean, as the same commands but with a libc provider is accepted and has
the exact same result:

=# CREATE DATABASE db2 COLLATION_PROVIDER libc LOCALE 'en-x-icu' LC_COLLATE 
'en_GB.UTF-8' LC_CTYPE 'en_GB.UTF-8';;
CREATE DATABASE

=# SELECT datname, datcollate, datctype, daticucoll FROM pg_database ;
  datname  | datcollate  |  datctype   | daticucoll
---+-+-+
 postgres  | en_GB.UTF-8 | en_GB.UTF-8 | fr-x-icu
 db| en_GB.UTF-8 | en_GB.UTF-8 | en-x-icu
 template1 | en_GB.UTF-8 | en_GB.UTF-8 | fr-x-icu
 template0 | en_GB.UTF-8 | en_GB.UTF-8 | fr-x-icu
 db2   | en_GB.UTF-8 | en_GB.UTF-8 | en-x-icu
(5 rows)

Shouldn't db2 have a NULL daticucoll, and if so also complain about

Re: ICU for global collation

2022-01-06 Thread Julien Rouhaud
On Thu, Jan 06, 2022 at 01:55:55PM +, Finnerty, Jim wrote:
> 
> I didn't notice anything version-specific about the patch.  Would any
> modifications be needed to backport it to pg13 and pg14?

This is a new feature so it can't be backported.  The changes aren't big and
mostly touches places that didn't change in a long time so I don't think that
it would take much effort if you wanted to backport it on your own forks.

> After this patch goes in, the big next thing would be to support
> nondeterministic collations for LIKE, ILIKE and pattern matching operators in
> general.  Is anyone interested in working on that?

As far as I know you're the last person that seemed to be working on that topic
back in March :)




Re: ICU for global collation

2022-01-06 Thread Finnerty, Jim

I didn't notice anything version-specific about the patch.  Would any 
modifications be needed to backport it to pg13 and pg14?

After this patch goes in, the big next thing would be to support 
nondeterministic collations for LIKE, ILIKE and pattern matching operators in 
general.  Is anyone interested in working on that?

On 1/5/22, 10:36 PM, "Julien Rouhaud"  wrote:

CAUTION: This email originated from outside of the organization. Do not 
click links or open attachments unless you can confirm the sender and know the 
content is safe.



On Tue, Jan 04, 2022 at 05:03:10PM +0100, Peter Eisentraut wrote:
> On 04.01.22 03:21, Julien Rouhaud wrote:
>
> > > - if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID)
> > > - mylocale = pg_newlocale_from_collation(collid);
> > > + if (!lc_collate_is_c(collid))
> > > + {
> > > + if (collid != DEFAULT_COLLATION_OID)
> > > + mylocale = pg_newlocale_from_collation(collid);
> > > + else if (default_locale.provider == COLLPROVIDER_ICU)
> > > + mylocale = _locale;
> > > + }
> >
> > There are really a lot of places with this new code.  Maybe it could be 
some
> > new function/macro to wrap that for the normal case (e.g. not 
formatting.c)?
>
> Right, we could just put this into pg_newlocale_from_collation(), but the
> comment there says
>
>  * In fact, they shouldn't call this function at all when they are dealing
>  * with the default locale.  That can save quite a bit in hotspots.
>
> I don't know how to assess that.
>
> We could pack this into a macro or inline function if we are concerned 
about
> this.

Yes that was my idea, just have a new function (inline function or a macro
then since pg_newlocale_from_collation() clearly warns about performance
concerns) that have the whole
is-not-c-collation-and-is-default-collation-or-icu-collation logic and calls
pg_newlocale_from_collation() only when needed.





Re: ICU for global collation

2022-01-05 Thread Julien Rouhaud
On Tue, Jan 04, 2022 at 05:03:10PM +0100, Peter Eisentraut wrote:
> On 04.01.22 03:21, Julien Rouhaud wrote:
> 
> > > - if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID)
> > > - mylocale = pg_newlocale_from_collation(collid);
> > > + if (!lc_collate_is_c(collid))
> > > + {
> > > + if (collid != DEFAULT_COLLATION_OID)
> > > + mylocale = pg_newlocale_from_collation(collid);
> > > + else if (default_locale.provider == COLLPROVIDER_ICU)
> > > + mylocale = _locale;
> > > + }
> > 
> > There are really a lot of places with this new code.  Maybe it could be some
> > new function/macro to wrap that for the normal case (e.g. not formatting.c)?
> 
> Right, we could just put this into pg_newlocale_from_collation(), but the
> comment there says
> 
>  * In fact, they shouldn't call this function at all when they are dealing
>  * with the default locale.  That can save quite a bit in hotspots.
> 
> I don't know how to assess that.
> 
> We could pack this into a macro or inline function if we are concerned about
> this.

Yes that was my idea, just have a new function (inline function or a macro
then since pg_newlocale_from_collation() clearly warns about performance
concerns) that have the whole
is-not-c-collation-and-is-default-collation-or-icu-collation logic and calls
pg_newlocale_from_collation() only when needed.




Re: ICU for global collation

2022-01-04 Thread Peter Eisentraut

On 04.01.22 03:21, Julien Rouhaud wrote:

@@ -2774,6 +2776,7 @@ dumpDatabase(Archive *fout)
appendPQExpBuffer(dbQry, "SELECT tableoid, oid, datname, "
  "(%s datdba) AS dba, "
  "pg_encoding_to_char(encoding) AS 
encoding, "
+ "datcollprovider, "


This needs to be in a new pg 15+ branch, not in the pg 9.3+.


ok


-   if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID)
-   mylocale = pg_newlocale_from_collation(collid);
+   if (!lc_collate_is_c(collid))
+   {
+   if (collid != DEFAULT_COLLATION_OID)
+   mylocale = pg_newlocale_from_collation(collid);
+   else if (default_locale.provider == COLLPROVIDER_ICU)
+   mylocale = _locale;
+   }


There are really a lot of places with this new code.  Maybe it could be some
new function/macro to wrap that for the normal case (e.g. not formatting.c)?


Right, we could just put this into pg_newlocale_from_collation(), but 
the comment there says


 * In fact, they shouldn't call this function at all when they are dealing
 * with the default locale.  That can save quite a bit in hotspots.

I don't know how to assess that.

We could pack this into a macro or inline function if we are concerned 
about this.





Re: ICU for global collation

2022-01-03 Thread Julien Rouhaud
Hi,

On Thu, Dec 30, 2021 at 01:07:21PM +0100, Peter Eisentraut wrote:
> 
> So this is a different approach: If you choose ICU as the default locale for
> a database, you still need to specify lc_ctype and lc_collate settings, as
> before.  Unlike in the previous patch, where the ICU collation name was
> written in datcollate, there is now a third column (daticucoll), so we can
> store all three values.  This fixes the described problem.  Other than that,
> once you get all the initial settings right, it basically just works: The
> places that have ICU support now will use a database-wide ICU collation if
> appropriate, the places that don't have ICU support continue to use the
> global libc locale settings.

That looks sensible to me.

> @@ -2774,6 +2776,7 @@ dumpDatabase(Archive *fout)
>   appendPQExpBuffer(dbQry, "SELECT tableoid, oid, datname, "
> "(%s datdba) AS dba, "
> 
> "pg_encoding_to_char(encoding) AS encoding, "
> +   "datcollprovider, "

This needs to be in a new pg 15+ branch, not in the pg 9.3+.

> - if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID)
> - mylocale = pg_newlocale_from_collation(collid);
> + if (!lc_collate_is_c(collid))
> + {
> + if (collid != DEFAULT_COLLATION_OID)
> + mylocale = pg_newlocale_from_collation(collid);
> + else if (default_locale.provider == COLLPROVIDER_ICU)
> + mylocale = _locale;
> + }

There are really a lot of places with this new code.  Maybe it could be some
new function/macro to wrap that for the normal case (e.g. not formatting.c)?




Re: ICU for global collation

2021-12-30 Thread Peter Eisentraut


There were a few inquiries about this topic recently, so I dug up the 
old thread and patch.  What we got stuck on last time was that we can't 
just swap out all locale support in a database for ICU.  We still need 
to set the usual locale environment, otherwise some things that are not 
ICU aware will break or degrade.  I had initially anticipated fixing 
that by converting everything that uses libc locales to ICU.  But that 
turned out to be tedious and ultimately not very useful as far as the 
user-facing result is concerned, so I gave up.


So this is a different approach: If you choose ICU as the default locale 
for a database, you still need to specify lc_ctype and lc_collate 
settings, as before.  Unlike in the previous patch, where the ICU 
collation name was written in datcollate, there is now a third column 
(daticucoll), so we can store all three values.  This fixes the 
described problem.  Other than that, once you get all the initial 
settings right, it basically just works: The places that have ICU 
support now will use a database-wide ICU collation if appropriate, the 
places that don't have ICU support continue to use the global libc 
locale settings.


I changed the datcollate, datctype, and the new daticucoll fields to 
type text (from name).  That way, the daticucoll field can be set to 
null if it's not applicable.  Also, the limit of 63 characters can 
actually be a problem if you want to use some combination of the options 
that ICU locales offer.  And for less extreme uses, having 
variable-length fields will save some storage, since typical locale 
names are much shorter.


For the same reasons and to keep things consistent, I also changed the 
analogous pg_collation fields like that.  This also removes some weird 
code that has to check that colcollate and colctype have to be the same 
for ICU, so it's overall cleaner.From 4eb9fbac238c1abf481fa43431ecc22e782a5290 Mon Sep 17 00:00:00 2001
From: Peter Eisentraut 
Date: Thu, 30 Dec 2021 12:47:24 +0100
Subject: [PATCH v3] Add option to use ICU as global collation provider

This adds the option to use ICU as the default collation provider for
either the whole cluster or a database.  New options for initdb,
createdb, and CREATE DATABASE are used to select this.

Discussion: 
https://www.postgresql.org/message-id/flat/5e756dd6-0e91-d778-96fd-b1bcb06c161a%402ndquadrant.com
---
 doc/src/sgml/catalogs.sgml|  13 +-
 doc/src/sgml/ref/create_database.sgml |  16 ++
 doc/src/sgml/ref/createdb.sgml|   9 +
 doc/src/sgml/ref/initdb.sgml  |  23 ++
 src/backend/access/hash/hashfunc.c|  18 +-
 src/backend/catalog/pg_collation.c|  24 +-
 src/backend/commands/collationcmds.c  | 120 +++---
 src/backend/commands/dbcommands.c |  93 ++--
 src/backend/regex/regc_pg_locale.c|   7 +-
 src/backend/utils/adt/formatting.c|   6 +
 src/backend/utils/adt/like.c  |  20 +-
 src/backend/utils/adt/like_support.c  |   2 +
 src/backend/utils/adt/pg_locale.c | 223 +++---
 src/backend/utils/adt/varchar.c   |  22 +-
 src/backend/utils/adt/varlena.c   |  26 +-
 src/backend/utils/init/postinit.c |  37 ++-
 src/bin/initdb/Makefile   |   2 +
 src/bin/initdb/initdb.c   |  62 -
 src/bin/initdb/t/001_initdb.pl|  18 +-
 src/bin/pg_dump/pg_dump.c |  16 ++
 src/bin/psql/describe.c   |  23 +-
 src/bin/psql/tab-complete.c   |   2 +-
 src/bin/scripts/Makefile  |   2 +
 src/bin/scripts/createdb.c|   9 +
 src/bin/scripts/t/020_createdb.pl |  20 +-
 src/include/catalog/pg_collation.dat  |   3 +-
 src/include/catalog/pg_collation.h|   6 +-
 src/include/catalog/pg_database.dat   |   2 +-
 src/include/catalog/pg_database.h |  16 +-
 src/include/utils/pg_locale.h |   6 +
 .../regress/expected/collate.icu.utf8.out |  10 +-
 src/test/regress/sql/collate.icu.utf8.sql |   8 +-
 32 files changed, 665 insertions(+), 199 deletions(-)

diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml
index 03e2537b07..89e7279030 100644
--- a/doc/src/sgml/catalogs.sgml
+++ b/doc/src/sgml/catalogs.sgml
@@ -2368,7 +2368,7 @@ pg_collation 
Columns
 
  
   
-   collcollate name
+   collcollate text
   
   
LC_COLLATE for this collation object
@@ -2377,13 +2377,22 @@ pg_collation 
Columns
 
  
   
-   collctype name
+   collctype text
   
   
LC_CTYPE for this collation object
   
  
 
+ 
+  
+   collicucoll text
+  
+  
+   ICU collation string
+  
+ 
+
  
   
collversion text
diff --git a/doc/src/sgml/ref/create_database.sgml 
b/doc/src

  1   2   >