Re: Improving the comments in pqsignal()

2023-11-23 Thread Heikki Linnakangas

On 24/11/2023 00:33, Thomas Munro wrote:

> Hi,
>
> While following along with Tristan and Heikki's thread about signals
> in psql, it occurred to me that the documentation atop pqsignal() is
> not very good:
>
>   * we don't explain what problem it originally solved
>   * we don't explain why it's still needed today
>   * we don't explain what else it does for us today
>   * we describe the backend implementation for Windows incorrectly (mea culpa)
>   * we vaguely mention one issue with Windows frontend code, but I
> think the point made is misleading, and we don't convey the scale of
> the differences
>
> Here is my attempt to improve it.


Thanks!


> This is program 10.12 from Advanced Programming in the UNIX
> Environment, with minor changes.

In the copy I found online (3rd edition), it's "Figure 10.18", not
"program 10.12".


Other than that, looks good.

--
Heikki Linnakangas
Neon (https://neon.tech)





Re: [PATCH] fix race condition in libpq (related to ssl connections)

2023-11-23 Thread Michael Paquier
On Wed, Nov 22, 2023 at 10:43:32AM +0900, Michael Paquier wrote:
> I am not really convinced that we require a second mutex here, as
> there is always a concern with inter-locking changes.  I may be
> missing something, of course, but BIO_s_socket() is just a pointer to
> a set of callbacks hidden in bss_sock.c with BIO_meth_new() and
> BIO_get_new_index() assigning some centralized data to handle the
> methods in a controlled way in OpenSSL.

I was looking at the origin of this one, and it is an issue coming
from 8bb14cdd33de, which removed the ssl_config_mutex taken in
pgtls_open_client() when we called my_SSL_set_fd().  The commit
accidentally missed that path with the static BIO method where the
mutex mattered.

> We only care about
> initializing once for the sake of libpq's threads, so wouldn't it be
> better to move my_BIO_s_socket() in pgtls_init() where the
> initialization of the BIO methods would be protected by
> ssl_config_mutex?  That's basically what Willi means in his first
> message, no?

I've looked at this idea and ended up unhappy with the error
handling we currently assume in my_SSL_set_fd() in the
event of an error in the BIO method setup, which would most likely be
an OOM, so let's use ssl_config_mutex in my_BIO_s_socket().  Another
thing is that I have minimized the manipulation of my_bio_methods in
the setup routine.

I've also been testing the risk of collision, and it took me quite a
few tries with a hundred threads to reproduce the failure even without
any forced sleep, so that seems really hard to reach.

Please find attached a patch (still need to do more checks with older
versions of OpenSSL).  Any thoughts or comments?
--
Michael
From dc1bc40271311868ec4840be2c246efa705dc7a5 Mon Sep 17 00:00:00 2001
From: Michael Paquier 
Date: Fri, 24 Nov 2023 16:47:03 +0900
Subject: [PATCH v2] Fix race condition in initialization of my_bio_methods
 static variable

---
 src/interfaces/libpq/fe-secure-openssl.c | 61 
 1 file changed, 40 insertions(+), 21 deletions(-)

diff --git a/src/interfaces/libpq/fe-secure-openssl.c b/src/interfaces/libpq/fe-secure-openssl.c
index f1192d28f2..30a83fbd01 100644
--- a/src/interfaces/libpq/fe-secure-openssl.c
+++ b/src/interfaces/libpq/fe-secure-openssl.c
@@ -1885,6 +1885,13 @@ my_sock_write(BIO *h, const char *buf, int size)
 static BIO_METHOD *
 my_BIO_s_socket(void)
 {
+	BIO_METHOD *res;
+
+	if (pthread_mutex_lock(&ssl_config_mutex))
+		return NULL;
+
+	res = my_bio_methods;
+
 	if (!my_bio_methods)
 	{
 		BIO_METHOD *biom = (BIO_METHOD *) BIO_s_socket();
@@ -1893,39 +1900,51 @@ my_BIO_s_socket(void)
 
 		my_bio_index = BIO_get_new_index();
 		if (my_bio_index == -1)
-			return NULL;
+			goto err;
 		my_bio_index |= (BIO_TYPE_DESCRIPTOR | BIO_TYPE_SOURCE_SINK);
-		my_bio_methods = BIO_meth_new(my_bio_index, "libpq socket");
-		if (!my_bio_methods)
-			return NULL;
+		res = BIO_meth_new(my_bio_index, "libpq socket");
+		if (!res)
+			goto err;
 
 		/*
 		 * As of this writing, these functions never fail. But check anyway,
 		 * like OpenSSL's own examples do.
 		 */
-		if (!BIO_meth_set_write(my_bio_methods, my_sock_write) ||
-			!BIO_meth_set_read(my_bio_methods, my_sock_read) ||
-			!BIO_meth_set_gets(my_bio_methods, BIO_meth_get_gets(biom)) ||
-			!BIO_meth_set_puts(my_bio_methods, BIO_meth_get_puts(biom)) ||
-			!BIO_meth_set_ctrl(my_bio_methods, BIO_meth_get_ctrl(biom)) ||
-			!BIO_meth_set_create(my_bio_methods, BIO_meth_get_create(biom)) ||
-			!BIO_meth_set_destroy(my_bio_methods, BIO_meth_get_destroy(biom)) ||
-			!BIO_meth_set_callback_ctrl(my_bio_methods, BIO_meth_get_callback_ctrl(biom)))
+		if (!BIO_meth_set_write(res, my_sock_write) ||
+			!BIO_meth_set_read(res, my_sock_read) ||
+			!BIO_meth_set_gets(res, BIO_meth_get_gets(biom)) ||
+			!BIO_meth_set_puts(res, BIO_meth_get_puts(biom)) ||
+			!BIO_meth_set_ctrl(res, BIO_meth_get_ctrl(biom)) ||
+			!BIO_meth_set_create(res, BIO_meth_get_create(biom)) ||
+			!BIO_meth_set_destroy(res, BIO_meth_get_destroy(biom)) ||
+			!BIO_meth_set_callback_ctrl(res, BIO_meth_get_callback_ctrl(biom)))
 		{
-			BIO_meth_free(my_bio_methods);
-			my_bio_methods = NULL;
-			return NULL;
+			goto err;
 		}
 #else
-		my_bio_methods = malloc(sizeof(BIO_METHOD));
-		if (!my_bio_methods)
-			return NULL;
-		memcpy(my_bio_methods, biom, sizeof(BIO_METHOD));
-		my_bio_methods->bread = my_sock_read;
-		my_bio_methods->bwrite = my_sock_write;
+		res = malloc(sizeof(BIO_METHOD));
+		if (!res)
+			goto err;
+		memcpy(res, biom, sizeof(BIO_METHOD));
+		res->bread = my_sock_read;
+		res->bwrite = my_sock_write;
 #endif
 	}
+
+	my_bio_methods = res;
+	pthread_mutex_unlock(&ssl_config_mutex);
 	return my_bio_methods;
+
+err:
+#ifdef HAVE_BIO_METH_NEW
+	if (res)
+		BIO_meth_free(res);
+#else
+	if (res)
+		free(res);
+#endif
+	pthread_mutex_unlock(&ssl_config_mutex);
+	return NULL;
 }
 
 /* This should exactly match OpenSSL's SSL_set_fd except for using my BIO */
-- 
2.

Re: How to accurately determine when a relation should use local buffers?

2023-11-23 Thread Давыдов Виталий

Hi Aleksander,

> I sort of suspect that you are working on a very specific extension
> and/or feature for PG fork. Any chance you could give us more details
> about the case?

I'm trying to adapt a multimaster solution to some changes in pg16. We
replicate temp table DDL for several reasons. Furthermore, such tables
should be accessible from processes other than the replication receiver
process on a replica, and they should still be temporary. I understand
that DML replication for temporary tables would cause severe performance
degradation, but that is not our case.

There are some changes in the ReadBuffer logic compared to pg15. To decide
which buffers to use, ReadBuffer used the SmgrIsTemp function in pg15, so
the decision was based on the backend id of the relation. In pg16 the
decision is based on the relpersistence attribute, which caused some
problems on my side. In my opinion, we should choose local buffers based
on the backend ids of relations, not on their persistence. An additional
check on relpersistence before the backend id check may improve performance
in some cases, I think, and the internal design may become more flexible
as a result.
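
To make the difference concrete, here is a toy model (standalone C, not
actual PostgreSQL code; struct and function names invented) of the two
decision rules:

#include <stdbool.h>
#include <stdio.h>

#define INVALID_BACKEND_ID (-1)

typedef struct ToyRelation
{
    int  backend_id;        /* owning backend, or INVALID_BACKEND_ID */
    char relpersistence;    /* 'p' = permanent, 't' = temporary */
} ToyRelation;

/* pg15-style rule: local buffers iff the relation belongs to a backend */
static bool
use_local_buffers_by_backend(const ToyRelation *rel)
{
    return rel->backend_id != INVALID_BACKEND_ID;
}

/* pg16-style rule: local buffers iff the relation is temporary */
static bool
use_local_buffers_by_persistence(const ToyRelation *rel)
{
    return rel->relpersistence == 't';
}

int
main(void)
{
    /* a replicated temp table seen from a backend that does not own it:
     * the two rules disagree, which is the problem described above */
    ToyRelation rel = {INVALID_BACKEND_ID, 't'};

    printf("by backend id: %d, by persistence: %d\n",
           use_local_buffers_by_backend(&rel),
           use_local_buffers_by_persistence(&rel));
    return 0;
}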

With best regards,
Vitaly Davydov
 


Re: Don't use bms_membership in places where it's not needed

2023-11-23 Thread Richard Guo
On Fri, Nov 24, 2023 at 12:06 PM David Rowley  wrote:

> In the attached, I've adjusted the code to use the latter of the two
> above methods in 3 places.  In examine_variable() this reduces the
> complexity of the logic quite a bit and saves calling bms_is_member()
> in addition to bms_singleton_member().


+1 to the idea.

I think you have a typo in distribute_restrictinfo_to_rels.  We should
remove the call of bms_singleton_member and use relid instead.

--- a/src/backend/optimizer/plan/initsplan.c
+++ b/src/backend/optimizer/plan/initsplan.c
@@ -2644,7 +2644,7 @@ distribute_restrictinfo_to_rels(PlannerInfo *root,
 * There is only one relation participating in the clause, so it
 * is a restriction clause for that relation.
 */
-   rel = find_base_rel(root, bms_singleton_member(relids));
+   rel = find_base_rel(root, relid);

Thanks
Richard


Re: [PATCH] fix race condition in libpq (related to ssl connections)

2023-11-23 Thread Michael Paquier
On Wed, Nov 22, 2023 at 02:58:15PM +1300, Thomas Munro wrote:
> On Wed, Nov 22, 2023 at 2:44 PM Michael Paquier  wrote:
>> Because --disable-thread-safe has been removed recently in
>> 68a4b58eca03.  The routine could be marked as deprecated on top of
>> saying that it always returns 1 for 17~.
> 
> See also commit ce0b0fa3 "Doc: Adjust libpq docs about thread safety."

Sure, I've noticed that in the docs, but declaring it as deprecated is
a different topic, and we don't actually use this term in the docs for
this routine, no?
--
Michael




Re: Properly pathify the union planner

2023-11-23 Thread Ashutosh Bapat
On Fri, Nov 24, 2023 at 3:59 AM David Rowley  wrote:
>
> Another problem I hit was add_path() pfreeing a Path that I needed.
> This was happening due to how I'm building the final paths in the
> subquery when setop_pathkeys are set.  Because I want to always
> include the cheapest_input_path to allow that path to be used in
> hash-based UNIONs, I also want to provide sorted paths so that
> MergeAppend has something to work with.  I found cases where I'd
> add_path() the cheapest_input_path to the final rel then also attempt
> to sort that path.  Unfortunately, add_path() found the unsorted path
> and the sorted path fuzzily the same cost and opted to keep the sorted
> one due to it having better pathkeys. add_path() then pfree'd the
> cheapest_input_path which meant the Sort's subpath was gone which
> obviously caused issues in createplan.c.
>
> For now, as a temporary fix, I've just #ifdef'd out the code in
> add_path() that's pfreeing the old path.  I have drafted a patch that
> refcounts Paths, but I'm unsure if that's the correct solution as I'm
> only maintaining the refcounts in add_path() and add_partial_path(). I
> think a true correct solution would bump the refcount when a path is
> used as some other path's subpath.  That would mean having to
> recursively pfree paths up until we find one with a refcount>0. Seems
> a bit expensive for add_path() to do.

Please find my proposal to refcount paths at [1]. I did that to reduce
the memory consumed by partitionwise joins. I remember another thread
where freeing a path that was referenced by an upper sort path created a
minor debugging problem [2]. I paused my work on that proposal since
there didn't seem to be enough justification. But it looks like the
requirement is coming up repeatedly, so I am willing to resume the work
if it's needed. The email lists the next TODOs. As for making add_path()
more expensive, I didn't find any noticeable impact on planning time.
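
For readers who don't want to dig through the links, a minimal sketch
(hypothetical, not code from either patch) of the release scheme David
describes - free a path only when its refcount drops to zero, recursing
into subpaths - could look like this:

#include <stdlib.h>

typedef struct Path Path;
struct Path
{
    int   refcount;     /* number of owners referencing this path */
    Path *subpath;      /* simplified: at most one child path */
};

static void
path_free(Path *path)
{
    /* stop as soon as someone else still references this path */
    if (path == NULL || --path->refcount > 0)
        return;
    path_free(path->subpath);   /* drop our reference to the child */
    free(path);
}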


[1] 
https://www.postgresql.org/message-id/CAExHW5tUcVsBkq9qT%3DL5vYz4e-cwQNw%3DKAGJrtSyzOp3F%3DXacA%40mail.gmail.com
[2] 
https://www.postgresql.org/message-id/CAM2%2B6%3DUC1mcVtM0Y_LEMBEGHTM58HEkqHPn7vau_V_YfuZjEGg%40mail.gmail.com

--
Best Wishes,
Ashutosh Bapat




Re: POC, WIP: OR-clause support for indexes

2023-11-23 Thread Alena Rybakina

On 23.11.2023 12:23, Andrei Lepikhov wrote:

> I think the usage of nodeToString for the generation of clause hash is
> too expensive and buggy.
> Also, in the code, you didn't resolve hash collisions. So, I've
> rewritten the patch a bit (see the attachment).
> One more thing: I propose to enable transformation by default, at least
> for quick detection of possible issues.
> This code changes tests in many places. But, as I see it, it mostly
> demonstrates the positive effect of the transformation.


On 24.11.2023 06:30, Andrei Lepikhov wrote:

>> On 23/11/2023 16:23, Andrei Lepikhov wrote:
>> This code changes tests in many places. But, as I see it, it mostly
>> demonstrates the positive effect of the transformation.
>
> I found out that the postgres_fdw tests were impacted by the feature.
> Fixed it, because the patch is in the commitfest and must pass the buildfarm.
> Taking advantage of this, I suppressed the expression evaluation
> procedure to make regression test changes more clear.


Thank you for your work. You are right, the patch with the current 
changes looks better and works more correctly.


To be honest, I didn't think we could use JumbleExpr in this way.
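
For anyone joining the thread, the transformation rewrites, for example
(table and values invented for illustration):

-- before the transformation
SELECT * FROM t WHERE x = 1 OR x = 2 OR x = 3;

-- after the transformation, as seen in the updated regression output
SELECT * FROM t WHERE x = ANY ('{1,2,3}'::integer[]);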

--
Regards,
Alena Rybakina
Postgres Professional





Re: PL/pgSQL: Incomplete item Allow handling of %TYPE arrays, e.g. tab.col%TYPE[]

2023-11-23 Thread Pavel Stehule
On Fri, 24 Nov 2023 at 2:12, Quan Zongliang  wrote:

>
>
> On 2023/11/24 03:39, Pavel Stehule wrote:
>
> >
> > I modified the documentation a little bit - we don't need to additionally
> > propose the SQL array syntax, I think.
> > I rewrote regress tests - we don't need to test unsupported
> > functionality (related to RECORD).
> >
> > - all tests passed
> >
> I wrote two examples of errors:
>user_id users.user_id%ROWTYPE[];
>user_id users.user_id%ROWTYPE ARRAY[4][3];
>

There were more issues in this part - the name "user_id" is a bad name for
a composite variable, so I renamed it. I also wrote a test covering the use
of a type without array support.

Now I think this simple patch is ready for committers.

Regards

Pavel



> Fixed.
>
> > Regards
> >
> > Pavel
> >
> >
> >  >
> >  > Regards
> >  >
> >  > Pavel
> >
From 8fc6d02a8cea6cc897e4290ad7724b494e330ef8 Mon Sep 17 00:00:00 2001
From: "ok...@github.com" 
Date: Thu, 23 Nov 2023 18:39:27 +0100
Subject: [PATCH] support of syntax %type[] and %rowtype[]

---
 doc/src/sgml/plpgsql.sgml | 40 ++
 src/pl/plpgsql/src/pl_comp.c  | 23 
 src/pl/plpgsql/src/pl_gram.y  | 60 +++-
 src/pl/plpgsql/src/plpgsql.h  |  1 +
 src/test/regress/expected/plpgsql.out | 80 +++
 src/test/regress/sql/plpgsql.sql  | 73 
 6 files changed, 264 insertions(+), 13 deletions(-)

diff --git a/doc/src/sgml/plpgsql.sgml b/doc/src/sgml/plpgsql.sgml
index 5977534a62..aa848c034e 100644
--- a/doc/src/sgml/plpgsql.sgml
+++ b/doc/src/sgml/plpgsql.sgml
@@ -766,6 +766,40 @@ SELECT merge_fields(t.*) FROM table1 t WHERE ... ;

   
 
+  
+   Arrays of Copying Types and Row Types
+
+
+name variable%TYPE[];
+name table_name%ROWTYPE[];
+
+
+   
+Arrays of Copying Types and Row Types are defined by appending square brackets
+([]) to the %TYPE or %ROWTYPE.
+Their definition is similar to PostgreSQL's arrays described in .
+For example:
+
+user_id users.user_id%TYPE[];
+users_row users%ROWTYPE[];
+
+The syntax allows the exact size of arrays to be specified. However, the current
+implementation ignores any supplied array size limits, i.e., the behavior is the
+same as for arrays of unspecified length.
+   
+
+   
+An alternative syntax, which conforms to the SQL standard by using the keyword
+ARRAY, can be used for one-dimensional or multi-dimensional
+arrays too:
+
+user_id users.user_id%TYPE ARRAY;
+users_row users%ROWTYPE ARRAY[4][3];
+
+As before, however, PostgreSQL does not enforce the size restriction in any case.
+   
+  
+
   
Record Types
 
@@ -794,6 +828,12 @@ SELECT merge_fields(t.*) FROM table1 t WHERE ... ;
 calling query is parsed, whereas a record variable can change its row
 structure on-the-fly.

+
+   
+The RECORD type cannot be used for the declaration of a variable
+of an array type. "Copying Types" as shown in 
+are not supported either.
+   
   
 
   
diff --git a/src/pl/plpgsql/src/pl_comp.c b/src/pl/plpgsql/src/pl_comp.c
index a341cde2c1..a9cb15df6d 100644
--- a/src/pl/plpgsql/src/pl_comp.c
+++ b/src/pl/plpgsql/src/pl_comp.c
@@ -2095,6 +2095,29 @@ plpgsql_build_datatype(Oid typeOid, int32 typmod,
 	return typ;
 }
 
+/*
+ * Returns an array for type specified as argument.
+ */
+PLpgSQL_type *
+plpgsql_datatype_arrayof(PLpgSQL_type *dtype)
+{
+	Oid			array_typeid;
+
+	if (dtype->typisarray)
+		return dtype;
+
+	array_typeid = get_array_type(dtype->typoid);
+
+	if (!OidIsValid(array_typeid))
+		ereport(ERROR,
+(errcode(ERRCODE_UNDEFINED_OBJECT),
+ errmsg("could not find array type for data type \"%s\"",
+		format_type_be(dtype->typoid))));
+
+	return plpgsql_build_datatype(array_typeid, dtype->atttypmod,
+  dtype->collation, NULL);
+}
+
 /*
  * Utility subroutine to make a PLpgSQL_type struct given a pg_type entry
  * and additional details (see comments for plpgsql_build_datatype).
diff --git a/src/pl/plpgsql/src/pl_gram.y b/src/pl/plpgsql/src/pl_gram.y
index 6a09bfdd67..aa9103cf0e 100644
--- a/src/pl/plpgsql/src/pl_gram.y
+++ b/src/pl/plpgsql/src/pl_gram.y
@@ -2789,7 +2789,7 @@ read_datatype(int tok)
 	StringInfoData ds;
 	char	   *type_name;
 	int			startlocation;
-	PLpgSQL_type *result;
+	PLpgSQL_type *result = NULL;
 	int			parenlevel = 0;
 
 	/* Should only be called while parsing DECLARE sections */
@@ -2817,15 +2817,11 @@ read_datatype(int tok)
 			   K_TYPE, "type"))
 			{
 result = plpgsql_parse_wordtype(dtname);
-if (result)
-	return result;
 			}
 			else if (tok_is_keyword(tok, &yylval,
 	K_ROWTYPE, "rowtype"))
 			{
 result = plpgsql_parse_wordrowtype(dtname);
-if (result)
-	return result;
 			}
 		}
 	}
@@ -2841,15 +2837,11 @@ read_datatype(int tok)
 			   K_TYPE, "type"))
 			{
 result = plpgsql_parse_wordtype(dtname);
-if (result)
-	return result;
 			}
 			else if (tok_is_keyword(tok, &yylval,
 	K_

Re: SLRU optimization - configurable buffer pool and partitioning the SLRU lock

2023-11-23 Thread Dilip Kumar
On Thu, Nov 23, 2023 at 11:34 AM Dilip Kumar  wrote:
>
> Note: With this testing, we have found a bug in the bank-wise
> approach: basically, we are clearing procglobal->clogGroupFirst even
> before acquiring the bank lock, which means that in most cases there
> will be a single process in each group, as a group leader

I realized that the bug fix I have done is not proper, so I will send
the updated patch set with the proper fix soon.

-- 
Regards,
Dilip Kumar
EnterpriseDB: http://www.enterprisedb.com




Re: trying again to get incremental backup

2023-11-23 Thread Thomas Munro
On Wed, Nov 22, 2023 at 3:14 AM Jakub Wartak
 wrote:
> CFbot failed on two hosts this time, I haven't looked at the details
> yet (https://cirrus-ci.com/task/6425149646307328 -> end of EOL? ->
> LOG:  WAL summarizer process (PID 71511) was terminated by signal 6:
> Aborted?)

Robert pinged me to see if I had any ideas.

The reason it fails on Windows is because there is a special code path
for WIN32 in the patch's src/bin/pg_combinebackup/copy_file.c, but it
is incomplete: it returns early without feeding the data into the
checksum, so all the checksums have the same initial and bogus value.
I commented that part out so it took the normal path like Unix, and it
passed.

The reason it fails on Linux 32 bit with -fsanitize is that this
has uncovered a bug in xlogreader.c, which overflows a 32 bit pointer
when doing a size test that could easily be changed to a non-overflowing
formulation.  AFAICS it is not a live bug because it comes to the
right conclusion without dereferencing the pointer due to other checks,
but the sanitizer is not wrong to complain about it and I will post a
patch to fix that in a new thread.  With the draft patch I am testing,
the sanitizer is happy and this passes too.
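
The class of bug looks schematically like this (an illustrative sketch,
not the actual xlogreader.c code):

#include <stdbool.h>
#include <stddef.h>

/* Does a record of 'size' bytes starting at 'ptr' fit before 'end'? */
static bool
record_fits(const char *ptr, size_t size, const char *end)
{
    /* BAD: "ptr + size" can wrap around in a 32-bit address space,
     * making the comparison succeed spuriously:
     *
     *     return ptr + size <= end;
     */

    /* GOOD: compare against the remaining space instead */
    return size <= (size_t) (end - ptr);
}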




Re: undetected deadlock in ALTER SUBSCRIPTION ... REFRESH PUBLICATION

2023-11-23 Thread Shlok Kyal
Hi,

I tried to reproduce the issue and was able to reproduce it with the
scripts shared by Tomas.
I tested it from PG17 down to PG11. The issue is reproducible in
each version.

Next I would try to test with the patch in the thread shared by Amit.

Thanks,
Shlok Kumar Kyal




Don't use bms_membership in places where it's not needed

2023-11-23 Thread David Rowley
While working on the patch in [1], I noticed that ever since
00b41463c, it's now suboptimal to do the following:

switch (bms_membership(relids))
{
case BMS_EMPTY_SET:
   /* handle empty set */
   break;
case BMS_SINGLETON:
/* call bms_singleton_member() and handle singleton set */
break;
case BMS_MULTIPLE:
   /* handle multi-member set */
   break;
}

The following is cheaper as we don't need to call bms_membership() and
bms_singleton_member() for singleton sets. It also saves function call
overhead for empty sets.

if (relids == NULL)
   /* handle empty set */
else
{
int relid;

if (bms_get_singleton_member(relids, &relid))
/* handle singleton set */
   else
   /* handle multi-member set */
}

In the attached, I've adjusted the code to use the latter of the two
above methods in 3 places.  In examine_variable() this reduces the
complexity of the logic quite a bit and saves calling bms_is_member()
in addition to bms_singleton_member().

I'm trying to reduce the footprint of what's being worked on in [1]
and I highlighted this as something that'll help with that.

Any objections to me pushing the attached?

David

[1] 
https://postgr.es/m/caaphdvqhcnkji9crqzg-reqdxtfrwnt5rhzntdqhnrbzaau...@mail.gmail.com
diff --git a/src/backend/optimizer/plan/initsplan.c b/src/backend/optimizer/plan/initsplan.c
index b31d892121..83aeca36d6 100644
--- a/src/backend/optimizer/plan/initsplan.c
+++ b/src/backend/optimizer/plan/initsplan.c
@@ -2634,10 +2634,12 @@ distribute_restrictinfo_to_rels(PlannerInfo *root,
Relids  relids = restrictinfo->required_relids;
RelOptInfo *rel;
 
-   switch (bms_membership(relids))
+   if (!bms_is_empty(relids))
{
-   case BMS_SINGLETON:
+   int relid;
 
+   if (bms_get_singleton_member(relids, &relid))
+   {
/*
 * There is only one relation participating in the clause, so it
 * is a restriction clause for that relation.
@@ -2650,9 +2652,9 @@ distribute_restrictinfo_to_rels(PlannerInfo *root,
/* Update security level info */
rel->baserestrict_min_security = Min(rel->baserestrict_min_security,
                                     restrictinfo->security_level);
-   break;
-   case BMS_MULTIPLE:
-
+   }
+   else
+   {
/*
 * The clause is a join clause, since there is more than one rel
 * in its relid set.
@@ -2675,15 +2677,15 @@ distribute_restrictinfo_to_rels(PlannerInfo *root,
 * Add clause to the join lists of all the relevant relations.
 */
add_join_clause_to_rels(root, restrictinfo, relids);
-   break;
-   default:
-
-   /*
-* clause references no rels, and therefore we have no 
place to
-* attach it.  Shouldn't get here if callers are 
working properly.
-*/
-   elog(ERROR, "cannot cope with variable-free clause");
-   break;
+   }
+   }
+   else
+   {
+   /*
+* clause references no rels, and therefore we have no place to 
attach
+* it.  Shouldn't get here if callers are working properly.
+*/
+   elog(ERROR, "cannot cope with variable-free clause");
}
 }
 
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 35c9e3c86f..e11d022827 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -5028,22 +5028,27 @@ examine_variable(PlannerInfo *root, Node *node, int 
varRelid,
 
onerel = NULL;
 
-   switch (bms_membership(varnos))
+   if (bms_is_empty(varnos))
{
-   case BMS_EMPTY_SET:
-   /* No Vars at all ... must be pseudo-constant clause */
-   break;
-   case BMS_SINGLETON:
-   if (varRelid == 0 || bms_is_member(varRelid, varnos))
+   /* No Vars at all ... must be pseudo-constant clause */
+   }
+   else
+   {
+   int relid;
+
+   if (bms_get_singleton_member(varnos, &relid))
+   {
+   if (varRelid == 0 || varRelid == relid)
{
-   onerel = find_base_rel(root,
-  
(varRelid ? varRelid : bms_singleton_member(varnos)));
+   onerel = find_base_rel(root, relid);
   

Re: POC PATCH: copy from ... exceptions to: (was Re: VLDB Features)

2023-11-23 Thread Andrei Lepikhov

On 14/11/2023 17:10, Damir Belyalov wrote:

  Here is a very straw-man-level sketch of what I think might work.
  The option to COPY FROM looks something like

       ERRORS TO other_table_name (item [, item [, ...]])


I tried to implement the patch using a table and came across a number of 
questions.


Where should we store the errors for this feature: in a system catalog
table, in a file, or in a new regular table?


In these cases, security and user rights management issues arise.
It is better for other users not to see error lines from another user.
It is also not clear how access rights to this table are inherited and
granted.


Previous reviews have given helpful ideas about storing errors in a
new table.
It should be trivial code - use the current table name + an 'err' suffix,
as we already do in the case of conflicting auto-generated index names.
The 'errors table' must inherit the access policies of the table we are
copying into.
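
Under the straw-man syntax quoted above, usage might look like this (the
target table name and column list are placeholders, since the set of
items has not been settled):

COPY my_table FROM '/tmp/data.csv' (FORMAT csv)
    ERRORS TO my_table_err (lineno, line, message);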


--
regards,
Andrei Lepikhov
Postgres Professional





RE: Synchronizing slots from primary to standby

2023-11-23 Thread Zhijie Hou (Fujitsu)
On Thursday, November 23, 2023 11:45 PM Drouvot, Bertrand 
 wrote:

Hi,

> 
> On 11/23/23 6:13 AM, Amit Kapila wrote:
> > On Tue, Nov 21, 2023 at 4:35 PM Drouvot, Bertrand
> >  wrote:
> >>
> >> On 11/21/23 10:32 AM, shveta malik wrote:
> >>> On Tue, Nov 21, 2023 at 2:02 PM shveta malik 
> wrote:
> 
> >>
> >>> v37 fails to apply to HEAD due to a recent commit e83aa9f92fdd,
> >>> rebased the patches.  PFA v37_2 patches.
> >>
> >> Thanks!
> >>
> >> Regarding the promotion flow: If the primary is available and
> >> reachable I don't think we currently try to ensure that slots are in
> >> sync. I think we'd miss the activity since the last sync and the promotion
> request or am I missing something?
> >>
> >> If the primary is available and reachable shouldn't we launch a last
> >> round of synchronization (skipping all the slots that are not in 'r' 
> >> state)?
> >>
> >
> > We may miss the last round but there is no guarantee that we can
> > ensure to sync of everything if the primary is available. Because
> > after our last sync, there could probably be some more activity.
> 
> I don't think so thanks to the fact that we ensure that logical walsenders on 
> the
> primary wait for the physical standby.
> 
> Indeed that should prevent any decoding activity on the primary while the
> promotion is in progress on the standby (at least as soon as the walreceiver 
> is
> shutdown).
> 
> So that I think that a promotion flow like:
> 
> - walreceiver shutdown
> - last round of sync
> - sync-worker shutdown
> 
> Should ensure that slots are in sync (as logical slots on the primary should 
> not
> be able to advance as soon as the walreceiver is shutdown during the
> promotion).
> 

I think it could not ensure that the slots are in sync, because there is no
guarantee that the logical slot has caught up to the physical standby at
promotion, and both the logical publisher and subscriber could still be
active during promotion. IOW, the logical slot's LSN can still be advanced
after the walreceiver shutdown if it was far behind the physical slot's LSN.


Best Regards,
Hou zj


Re: POC, WIP: OR-clause support for indexes

2023-11-23 Thread Andrei Lepikhov

On 23/11/2023 16:23, Andrei Lepikhov wrote:

> This code changes tests in many places. But, as I see it, it mostly
> demonstrates the positive effect of the transformation.


I found out that the postgres_fdw tests were impacted by the feature. 
Fixed it, because the patch is in the commitfest and must pass the buildfarm.
Taking advantage of this, I suppressed the expression evaluation 
procedure to make regression test changes more clear.


--
regards,
Andrei Lepikhov
Postgres Professional
From e29b35d3445d9852f3ecb9cdaa4f231c72334973 Mon Sep 17 00:00:00 2001
From: "Andrey V. Lepikhov" 
Date: Thu, 23 Nov 2023 16:00:13 +0700
Subject: [PATCH] Transform OR clause to ANY expressions.

Replace (X=N1) OR (X=N2) ... with X = ANY(N1, N2) at the preliminary stage of
optimization, when we are still working with the expression tree.
Sometimes this can lead to a suboptimal plan, but we think it is better to
have an array of elements instead of a long list of OR clauses. There is room
for further optimization in decomposing that array into more optimal parts.
---
 .../postgres_fdw/expected/postgres_fdw.out|   8 +-
 src/backend/nodes/queryjumblefuncs.c  |  30 ++
 src/backend/parser/parse_expr.c   | 284 +-
 src/backend/utils/misc/guc_tables.c   |  10 +
 src/backend/utils/misc/postgresql.conf.sample |   1 +
 src/include/nodes/queryjumble.h   |   1 +
 src/include/parser/parse_expr.h   |   1 +
 src/test/regress/expected/create_index.out| 141 +++--
 src/test/regress/expected/guc.out |   3 +-
 src/test/regress/expected/inherit.out |   2 +-
 src/test/regress/expected/join.out|  62 +++-
 src/test/regress/expected/partition_prune.out | 215 +++--
 src/test/regress/expected/rules.out   |   4 +-
 src/test/regress/expected/stats_ext.out   |  12 +-
 src/test/regress/expected/sysviews.out|   3 +-
 src/test/regress/expected/tidscan.out |  23 +-
 src/test/regress/sql/create_index.sql |  32 ++
 src/test/regress/sql/join.sql |  10 +
 src/test/regress/sql/partition_prune.sql  |  22 ++
 src/test/regress/sql/tidscan.sql  |   6 +
 src/tools/pgindent/typedefs.list  |   2 +
 21 files changed, 810 insertions(+), 62 deletions(-)

diff --git a/contrib/postgres_fdw/expected/postgres_fdw.out b/contrib/postgres_fdw/expected/postgres_fdw.out
index 64bcc66b8d..a8442f08aa 100644
--- a/contrib/postgres_fdw/expected/postgres_fdw.out
+++ b/contrib/postgres_fdw/expected/postgres_fdw.out
@@ -8537,18 +8537,18 @@ insert into utrtest values (2, 'qux');
 -- Check case where the foreign partition is a subplan target rel
 explain (verbose, costs off)
 update utrtest set a = 1 where a = 1 or a = 2 returning *;
-                                         QUERY PLAN
-----------------------------------------------------------------------------------------------
+                                      QUERY PLAN
+---------------------------------------------------------------------------------------
 Update on public.utrtest
   Output: utrtest_1.a, utrtest_1.b
   Foreign Update on public.remp utrtest_1
   Update on public.locp utrtest_2
   ->  Append
         ->  Foreign Update on public.remp utrtest_1
-              Remote SQL: UPDATE public.loct SET a = 1 WHERE (((a = 1) OR (a = 2))) RETURNING a, b
+              Remote SQL: UPDATE public.loct SET a = 1 WHERE ((a = ANY ('{1,2}'::integer[]))) RETURNING a, b
         ->  Seq Scan on public.locp utrtest_2
               Output: 1, utrtest_2.tableoid, utrtest_2.ctid, NULL::record
-              Filter: ((utrtest_2.a = 1) OR (utrtest_2.a = 2))
+              Filter: (utrtest_2.a = ANY ('{1,2}'::integer[]))
 (10 rows)
 
 -- The new values are concatenated with ' triggered !'
diff --git a/src/backend/nodes/queryjumblefuncs.c b/src/backend/nodes/queryjumblefuncs.c
index 281907a4d8..99207a8670 100644
--- a/src/backend/nodes/queryjumblefuncs.c
+++ b/src/backend/nodes/queryjumblefuncs.c
@@ -135,6 +135,36 @@ JumbleQuery(Query *query)
return jstate;
 }
 
+JumbleState *
+JumbleExpr(Expr *expr, uint64 *queryId)
+{
+   JumbleState *jstate = NULL;
+
+   Assert(queryId != NULL);
+
+   jstate = (JumbleState *) palloc(sizeof(JumbleState));
+
+   /* Set up workspace for query jumbling */
+   jstate->jumble = (unsigned char *) palloc(JUMBLE_SIZE);
+   jstate->jumble_len = 0;
+   jstate->clocations_buf_size = 32;
+   jstate->clocations = (LocationLen *)
+   palloc(jstate->clocations_buf_size * sizeof(LocationLen));
+   jstate->clocations_count = 0;
+   jstate->highest_extern_param_id = 0;
+
+   /* Compute query ID */
+   _jumbleNode(jstate, (Node *) expr);
+   *queryId = DatumGetUInt64(hash_any_extended(jstate->jumble,
+

Re: GUC names in messages

2023-11-23 Thread Michael Paquier
On Thu, Nov 23, 2023 at 06:27:04PM +1100, Peter Smith wrote:
> There may be some changes I've missed, but hopefully, this is a nudge
> in the right direction.

Thanks for spending some time on that.


+In messages containing configuration variable names, do not include quotes
+when the names are visibly not English natural words, such as when they
+have underscores or are all-uppercase or have mixed case. Otherwise, quotes
+must be added.  Do include quotes in a message where an arbitrary variable
+name is to be expanded.
+   

That seems to describe clearly the consensus reached on the thread
(quotes for GUCs that are single terms, no quotes for names that are
obviously parameters).

In terms of messages that have predictable names, 0002 moves the
needle in the right direction.  There seem to be more:
src/backend/postmaster/bgworker.c:  errhint("Consider increasing the configuration parameter \"max_worker_processes\".")));
contrib/pg_prewarm/autoprewarm.c:  errhint("Consider increasing configuration parameter \"max_worker_processes\".")));

Things like parse_and_validate_value() and set_config_option_ext()
include log strings about GUCs, and these use quotes.  Could these areas
be made smarter with a routine that checks whether quotes should be
applied automatically when we have a "simple" GUC name, i.e. one made of
only lower-case characters?  This could be done with an islower() check
on the string name, for instance.
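
Something along these lines, perhaps (a hypothetical helper; the name is
invented):

#include <ctype.h>
#include <stdbool.h>

/*
 * Per the documented convention, a GUC name needs quotes in a message
 * only when it could read as a plain English word, i.e. it consists
 * solely of lower-case letters.
 */
static bool
guc_name_needs_quotes(const char *name)
{
    for (const char *p = name; *p; p++)
    {
        /* underscores, digits, or upper case: visibly not a word */
        if (!islower((unsigned char) *p))
            return false;
    }
    return true;
}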
--
Michael




Re: [HACKERS] psql casts aspersions on server reliability

2023-11-23 Thread Laurenz Albe
On Thu, 2023-11-23 at 11:12 -0500, Bruce Momjian wrote:
> On Wed, Nov 22, 2023 at 10:25:14PM -0500, Bruce Momjian wrote:
> > Yes, you are correct.  Here is a patch that implements the FATAL test,
> > though I am not sure I have the logic correct or backwards, and I don't
> > know how to test this.  Thanks.
> 
> I developed the attached patch which seems to work better.  In testing
> kill -3 on a backend or calling elog(FATAL) in the server for a
> session, libpq's 'res' is NULL, meaning we don't have any status to
> check for PGRES_FATAL_ERROR.  It is very possible that libpq just isn't
> stuctured to have the PGRES_FATAL_ERROR at the point where we issue this
> message, and this is not worth improving.
> 
>   test=> select pg_sleep(100);
> -->   FATAL:  FATAL called
>   
>   server closed the connection unexpectedly
> -->   This probably means the server terminated null
>   before or while processing the request.
>   The connection to the server was lost. Attempting reset: Succeeded.

I don't think "terminated null" is a meaningful message.

>  	libpq_append_conn_error(conn, "server closed the connection unexpectedly\n"
> -                          "\tThis probably means the server terminated abnormally\n"
> -                          "\tbefore or while processing the request.");
> +                          "\tThis probably means the server terminated%s\n"
> +                          "\tbefore or while processing the request.",
> +                          (conn->result == NULL) ? " null" :
> +                          (conn->result->resultStatus == PGRES_FATAL_ERROR) ?
> +                          "" : " abnormally");

Apart from the weird "null", will that work well for translation?
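
To be translation-friendly, each variant would have to be a complete
message rather than a spliced-in %s fragment, something like (only a
sketch):

if (conn->result == NULL)
    libpq_append_conn_error(conn, "server closed the connection unexpectedly\n"
                            "\tThis probably means the server terminated\n"
                            "\tbefore or while processing the request.");
else if (conn->result->resultStatus == PGRES_FATAL_ERROR)
    libpq_append_conn_error(conn, "server closed the connection\n"
                            "\tbefore or while processing the request.");
else
    libpq_append_conn_error(conn, "server closed the connection unexpectedly\n"
                            "\tThis probably means the server terminated abnormally\n"
                            "\tbefore or while processing the request.");

That way every branch is a whole translatable string.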

> --- a/src/interfaces/libpq/fe-protocol3.c
> +++ b/src/interfaces/libpq/fe-protocol3.c
> @@ -2158,6 +2158,7 @@ pqFunctionCall3(PGconn *conn, Oid fnid,
>   if (pqGetErrorNotice3(conn, true))
>   continue;
>   status = PGRES_FATAL_ERROR;
> + fprintf(stderr, "Got 'E'\n");
>   break;
>   case 'A':   /* notify message */
>   /* handle notify and go back to processing 
> return values */

That looks like a leftover debugging message.

Yours,
Laurenz Albe




Re: PATCH: Add REINDEX tag to event triggers

2023-11-23 Thread Michael Paquier
On Fri, Nov 24, 2023 at 08:00:00AM +0800, jian he wrote:
> I am not sure this is the right approach, but now the reindex event
> collection happens after we call index_open.
> The same static function (reindex_event_trigger_collect) lives in both
> src/backend/commands/indexcmds.c and src/backend/catalog/index.c for
> now.

Knowing that there are two calls of reindex_relation() and four
callers of reindex_index(), passing the stmt looks acceptable to me to
be able to get more context from the reindex statement where a given
index has been rebuilt.

Anyway, as a whole, I am still confused by the shape of the patch..
This relies on pg_event_trigger_ddl_commands(), that reports a list of
DDL commands, but the use case you are proposing is to report a list
of the *indexes* rebuilt.  So, in its current shape, we'd report the
equivalent of N DDL commands matching the N indexes rebuilt in a
single command.  Shouldn't we use a new event type and a dedicated SQL
function to report the list of indexes newly rebuilt to get a full
list of the work that has happened?

> given that ProcessUtilitySlow supports the event triggers facility.
> so probably no need for another REINDEXOPT_ option?

Right, perhaps we don't need that if we just supply the stmt.  If
there is no requirement for it, let's avoid that, then.

- ReindexMultipleTables(stmt->name, stmt->kind, &params);
+ ReindexMultipleTables(stmt, stmt->name, stmt->kind, &params);

If we begin including the stmt, there is no need for these extra
arguments in the extra routines of indexcmds.c.

As far as I can see, this patch is doing too much as presented.  Could
you split the patch into more pieces, please?  Based on v4 you have
sent, there are refactoring and basic piece parts like:
- Patch to make event triggers ddl_command_start and ddl_command_end
react to ReindexStmt, so that the command is reported in
pg_event_trigger_ddl_commands().
- Patch for GetCommandLogLevel() to switch ReindexStmt from
LOGSTMT_ALL to LOGSTMT_DDL.  True that this could be switched.
- Patch to refactor the routines of indexcmds.c and index.c to use the
ReindexStmt as argument, as a preparation of the next patch...
- Patch to add a new event type with a new SQL function to return a
list of the indexes rebuilt, with the ReindexStmt involved when the
index OID was trapped by the collection.

1) and 2) are a minimal feature in themselves, which may be enough to
satisfy your original case, and where you'd know when a REINDEX has
been run in event triggers.  3) and 4) are what you are trying to
achieve, to get the list of the indexes rebuilt knowing the context of
a given command.
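
For illustration, with 1) and 2) in place an event trigger could already
react to a REINDEX like this (a sketch under the patch's proposed
behavior; the trigger and function names are invented):

CREATE FUNCTION log_reindex() RETURNS event_trigger
LANGUAGE plpgsql AS $$
DECLARE
    r record;
BEGIN
    FOR r IN SELECT * FROM pg_event_trigger_ddl_commands() LOOP
        RAISE NOTICE 'reindex reported: % (%)', r.object_identity, r.command_tag;
    END LOOP;
END;
$$;

CREATE EVENT TRIGGER on_reindex ON ddl_command_end
    WHEN TAG IN ('REINDEX') EXECUTE FUNCTION log_reindex();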
--
Michael




Re: Adding facility for injection points (or probe points?) for more advanced tests

2023-11-23 Thread Michael Paquier
On Wed, Nov 22, 2023 at 09:23:21PM +0530, Ashutosh Bapat wrote:
> On Tue, Nov 21, 2023 at 6:56 AM Michael Paquier  wrote:
>> I've never seen in recent years a need for a given test to use more
>> than 4~5 points.  But I guess that you've seen more than that wanted
>> in a prod environment with a fork of Postgres?
> 
> A given case may not require more than 4 -5 points but users may
> create scripts with many frequently required injection points and
> install handlers for them.

Sure, if a callback is generic enough it could be shared across
multiple points.

> The injection_run function is called from the place where the
> injection point is declared but that place does not know what
> injection function is going to be run. So a user can not pass
> arguments to injection declaration.

It is possible to make that predictable, but it means that a callback
is most likely to be used by one single point.  This makes extensions
in charge of holding the callbacks more complicated, at the benefit of
keeping a minimal footprint in the backend code.

> What injection to run is decided
> by the injection_attach and thus one can pass arguments to it but then
> injection_attach stores the information in the shared memory from
> where it's picked up by injection_run. So even though you don't want
> to store the arguments in the shared memory, you are creating a design
> which takes us towards that direction eventually - otherwise users
> will end up writing many injection functions - one for each possible
> combination of count, sleep, conditional variable etc. But let's see
> whether that happens to be the case in practice. We will need to
> evolve this feature based on usage.

A one-one mapping between callback and point is not always necessary.
If you wish to use a combination of N points with a sleep callback and
different sleep times, one can just register a second shmem area in
the extension holding the callbacks that links the point names with
the sleep time to use.

> shmem hash size won't depend upon the number of functions we write in
> the core. Yes it will add to the core code and may add maintenance
> burden. So I understand your inclination to keep the core minimal.

Yeah, without a clear benefit, my point is just to throw the
responsibility to extension developers for now, which would mean the
addition of tests that depend on test_injection_points/, or just
install this extension optionally in other code path that need it.
Maybe 0004 should be in src/test/recovery/ and do that, actually..
I'll most likely agree with extending all the backend stuff in a more
meaningful way, but I am not sure which method should be enforced.

> If the session which attaches to an injection point is same as the
> session where the injection point is triggered (most of the ERROR and
> NOTICE injections will see this pattern), we don't need shared memory.
> There's a performance penalty to it since injection_run will look up
> shared memory. For WAIT we may or may not need shared memory. But
> let's see what other think and what usage patterns we see.

The first POC of the patch that you can find at the top of this thread
did that, actually, but this is too limited.  IMO, linking things to a
central table is just *much* more useful.

I've implemented a v5 that switches the cache to use a second hash
table in TopMemoryContext instead of an array.  This
makes the cache handling slightly cleaner, so your suggestion was
right.  0003 and 0004 are the same as previously, passing or failing
under the same conditions.  I'm wondering if folks have other comments
about 0001 and 0002?  It sounds to me like the consensus is that this
stuff is useful and that there are no strong objections, so feel free
to comment.

I don't want to propose 0003 in the tree, just an improved version of
0004 for the test coverage (still need to improve that).
--
Michael
From 702f5fa5a9ccede4c2c4e6c056f567b77332702d Mon Sep 17 00:00:00 2001
From: Michael Paquier 
Date: Fri, 24 Nov 2023 10:35:26 +0900
Subject: [PATCH v5 1/4] Add backend facility for injection points

This adds a set of routines allowing developers to attach, detach and
run custom code based on arbitrary code paths set with a centralized
macro called INJECTION_POINT().  Injection points are registered in a
shared hash table.  Processes also use a local cache to avoid loading
callbacks more than necessary, cleaning up their cache if a callback is
found to have been removed.
---
 src/include/pg_config.h.in|   3 +
 src/include/utils/injection_point.h   |  36 ++
 src/backend/storage/ipc/ipci.c|   3 +
 src/backend/storage/lmgr/lwlocknames.txt  |   1 +
 .../utils/activity/wait_event_names.txt   |   1 +
 src/backend/utils/misc/Makefile   |   1 +
 src/backend/utils/misc/injection_point.c  | 317 ++
 src/backend/utils/misc/meson.build|   1 +
 doc/src/sgml/installation.sgml|  30 ++
 doc/sr

Re: pg_upgrade and logical replication

2023-11-23 Thread Peter Smith
I have only trivial review comments for patch v18-0001

==
src/bin/pg_upgrade/check.c

1. check_new_cluster_subscription_configuration

+ /*
+ * A slot not created yet refers to the 'i' (initialize) state, while
+ * 'r' (ready) state refer to a slot created previously but already
+ * dropped. These states are supported states for upgrade. The other
+ * states listed below are not ok:
+ *
+ * a) SUBREL_STATE_DATASYNC: A relation upgraded while in this state
+ * would retain a replication slot, which could not be dropped by the
+ * sync worker spawned after the upgrade because the subscription ID
+ * tracked by the publisher does not match anymore.
+ *
+ * b) SUBREL_STATE_SYNCDONE: A relation upgraded while in this state
+ * would retain the replication origin in certain cases.
+ *
+ * c) SUBREL_STATE_FINISHEDCOPY: A tablesync worker spawned to work on
+ * a relation upgraded while in this state would expect an origin ID
+ * with the OID of the subscription used before the upgrade, causing
+ * it to fail.
+ *
+ * d) SUBREL_STATE_SYNCWAIT, SUBREL_STATE_CATCHUP and
+ * SUBREL_STATE_UNKNOWN: These states are not stored in the catalog,
+ * so we need not allow these states.
+ */

1a.
/while 'r' (ready) state refer to a slot/while 'r' (ready) state
refers to a slot/

1b.
/These states are supported states for upgrade./These states are
supported for pg_upgrade./

1c
/The other states listed below are not ok./The other states listed
below are not supported./

==
src/bin/pg_upgrade/t/004_subscription.pl

2.
+# --
+# Check that pg_upgrade refuses to run in:
+# a) if there's a subscription with tables in a state different than
+#'r' (ready) or 'i' (init) state and/or
+# b) if the subscription has no replication origin.
+# --

/if there's a subscription with tables in a state different than 'r'
(ready) or 'i' (init) state and/if there's a subscription with tables
in a state other than 'r' (ready) or 'i' (init) and/

==
Kind Regards,
Peter Smith.
Fujitsu Australia




Re: PL/pgSQL: Incomplete item Allow handling of %TYPE arrays, e.g. tab.col%TYPE[]

2023-11-23 Thread Quan Zongliang



On 2023/11/24 03:39, Pavel Stehule wrote:



I modified the documentation a little bit - we don't need to additionally
propose the SQL array syntax, I think.
I rewrote regress tests - we don't need to test unsupported 
functionality (related to RECORD).


- all tests passed


I wrote two examples of errors:
  user_id users.user_id%ROWTYPE[];
  user_id users.user_id%ROWTYPE ARRAY[4][3];

Fixed.


Regards

Pavel


 >
 > Regards
 >
 > Pavel
diff --git a/doc/src/sgml/plpgsql.sgml b/doc/src/sgml/plpgsql.sgml
index 5977534a62..0eedf4a55d 100644
--- a/doc/src/sgml/plpgsql.sgml
+++ b/doc/src/sgml/plpgsql.sgml
@@ -766,6 +766,40 @@ SELECT merge_fields(t.*) FROM table1 t WHERE ... ;

   
 
+  
+   Arrays of Copying Types and Row Types
+
+
+name variable%TYPE[];
+name table_name%ROWTYPE[];
+
+
+   
+Arrays of Copying Types and Row Types are defined by appending square
+brackets ([]) to the %TYPE or %ROWTYPE.
+Their definition is similar to PostgreSQL's arrays described in .
+For example:
+
+user_id users.user_id%TYPE[];
+user_id users%ROWTYPE[];
+
+The syntax allows the exact size of arrays to be specified. However, the
+current implementation ignores any supplied array size limits, i.e., the
+behavior is the same as for arrays of unspecified length.
+   
+
+   
+An alternative syntax, which conforms to the SQL standard by using the
+keyword ARRAY, can be used for one-dimensional or multi-dimensional
+arrays too:
+
+user_id users.user_id%TYPE ARRAY;
+user_id users%ROWTYPE ARRAY[4][3];
+
+As before, however, PostgreSQL does not enforce the size restriction in 
any case.
+   
+  
+
   
Record Types
 
@@ -794,6 +828,12 @@ SELECT merge_fields(t.*) FROM table1 t WHERE ... ;
 calling query is parsed, whereas a record variable can change its row
 structure on-the-fly.

+
+   
+The RECORD type cannot be used for the declaration of a variable
+of an array type. "Copying Types" as shown in 
+are not supported either.
+   
   
 
   
diff --git a/src/pl/plpgsql/src/pl_comp.c b/src/pl/plpgsql/src/pl_comp.c
index a341cde2c1..a9cb15df6d 100644
--- a/src/pl/plpgsql/src/pl_comp.c
+++ b/src/pl/plpgsql/src/pl_comp.c
@@ -2095,6 +2095,29 @@ plpgsql_build_datatype(Oid typeOid, int32 typmod,
return typ;
 }
 
+/*
+ * Returns an array for type specified as argument.
+ */
+PLpgSQL_type *
+plpgsql_datatype_arrayof(PLpgSQL_type *dtype)
+{
+   Oid array_typeid;
+
+   if (dtype->typisarray)
+   return dtype;
+
+   array_typeid = get_array_type(dtype->typoid);
+
+   if (!OidIsValid(array_typeid))
+   ereport(ERROR,
+   (errcode(ERRCODE_UNDEFINED_OBJECT),
+				 errmsg("could not find array type for data type \"%s\"",
+						format_type_be(dtype->typoid))));
+
+	return plpgsql_build_datatype(array_typeid, dtype->atttypmod,
+								  dtype->collation, NULL);
+}
+
 /*
  * Utility subroutine to make a PLpgSQL_type struct given a pg_type entry
  * and additional details (see comments for plpgsql_build_datatype).
diff --git a/src/pl/plpgsql/src/pl_gram.y b/src/pl/plpgsql/src/pl_gram.y
index 6a09bfdd67..aa9103cf0e 100644
--- a/src/pl/plpgsql/src/pl_gram.y
+++ b/src/pl/plpgsql/src/pl_gram.y
@@ -2789,7 +2789,7 @@ read_datatype(int tok)
StringInfoData ds;
char   *type_name;
int startlocation;
-   PLpgSQL_type *result;
+   PLpgSQL_type *result = NULL;
int parenlevel = 0;
 
/* Should only be called while parsing DECLARE sections */
@@ -2817,15 +2817,11 @@ read_datatype(int tok)
   K_TYPE, "type"))
{
result = plpgsql_parse_wordtype(dtname);
-   if (result)
-   return result;
}
else if (tok_is_keyword(tok, &yylval,

K_ROWTYPE, "rowtype"))
{
result = plpgsql_parse_wordrowtype(dtname);
-   if (result)
-   return result;
}
}
}
@@ -2841,15 +2837,11 @@ read_datatype(int tok)
   K_TYPE, "type"))
{
result = plpgsql_parse_wordtype(dtname);
-   if (result)
-   return result;
}
else if (tok_is_keyword(tok, &yylval,

K_ROWTYPE, "rowtype"))
 

Re: PATCH: Add REINDEX tag to event triggers

2023-11-23 Thread jian he
On Mon, Nov 20, 2023 at 3:34 PM Michael Paquier  wrote:
>
> On Sat, Oct 28, 2023 at 12:15:22PM +0800, jian he wrote:
> > reindex_event_trigger_collect_relation called in
> > ReindexMultipleInternal, ReindexTable (line 2979).
> > Both are "under concurrent is false" branches.
> >
> > So it should be fine.
>
> Sorry for the delay in replying.
>
> Indeed, you're right.  reindex_event_trigger_collect_relation() gets
> only called in two paths for the non-concurrent cases just after
> reindex_relation(), which is not a safe location to call it because
> this may be used in CLUSTER or TRUNCATE, and the intention of the
> patch is only to count for the objects in REINDEX.
>
> Anyway, I think that the current implementation is incorrect.  The
> object is collected after the reindex is done, which is right.
> However, an object may be reindexed but not be reported if it was
> dropped between the reindex and its end when collecting all the objects
> of a single relation.  Perhaps it does not matter because the object
> is gone, but that still looks incorrect to me because we finish by not
> reporting everything we should.  I think that we should put the
> collection deeper into the stack, where we know that we are holding
> the locks on the objects we are collecting.  Another side effect is
> that the patch seems to be missing toast table indexes, which are also
> reindexed for a REINDEX TABLE, for instance.  That's not right.
>
> Actually, could there be an argument for pushing the collection down
> to reindex_relation() instead, to account for all the commands that call it?
> Even better, reindex_index() would be a better candidate because it is
> itself called within reindex_relation() for each individual indexes?
> If a code path should not be covered for event triggers, we could use
> a new REINDEXOPT_ to control that, optionally.  In short, it looks
> like we should try to reduce the number of paths calling
> reindex_event_trigger_collect(), while collect_relation() ought to be
> removed.
>
> Should REINDEX TABLE CONCURRENTLY's coverage be extended so as two new
> indexes are reported instead of just one?
>
> REINDEX SCHEMA is a command that perhaps should be tested to collect
> all the indexes rebuilt through it?
>
> A side-effect of this patch is that it impacts ddl_command_start and
> ddl_command_end with the addition of REINDEX.  Mixing that with the
> addition of a new event is strange.  I think that the addition of
> REINDEX to the existing events should be done first, as a separate
> patch.  Then a second patch should deal with the collection and the
> support of the new event.
>
> I have also dug into the archives to note that control commands have
> been proposed in the past to be added as part of event triggers, and
> it happens that I've mentioned in a comment that that this was a
> concept perhaps contrary to what event triggers should do, as these
> are intended for DDLs:
> https://www.postgresql.org/message-id/cab7npqtqz2-ycnzoq5kvbujyhq4kdsd4q55mc-fbzm8gh0b...@mail.gmail.com
>
> Philosophically, I'm open on all that and having more commands
> depending on the cases that are satisfied, FWIW, but let's organize
> the changes.
> --
> Michael


Hi.
The top-level function is ExecReindex; it calls {ReindexIndex,
ReindexTable, ReindexMultipleTables},
which in turn call {ReindexRelationConcurrently, reindex_index}.

Imitating the CREATE INDEX logic, we need to pass `const ReindexStmt *stmt`
to all the following functions:
ReindexIndex
ReindexTable
ReindexMultipleTables
ReindexPartitions
ReindexMultipleInternal
ReindexRelationConcurrently
reindex_relation
reindex_index

because EventTriggerCollectSimpleCommand needs the `(ReindexStmt *)
parsetree`, and we call EventTriggerCollectSimpleCommand in
reindex_index or ReindexRelationConcurrently.

The new test will cover the following case.
reindex (concurrently) schema;
reindex (concurrently) partitioned index;
reindex (concurrently) partitioned table;

I am not sure this is the right approach, but now the reindex event
collection happens after we call index_open.
The same static function (reindex_event_trigger_collect) lives in both
src/backend/commands/indexcmds.c and src/backend/catalog/index.c for
now.
Given that ProcessUtilitySlow supports the event trigger facility,
there is probably no need for another REINDEXOPT_ option?

I also added a test for disabled event triggers.
From 149b4e9860f92daa810dfed4704f4c796a585b1a Mon Sep 17 00:00:00 2001
From: pgaddict 
Date: Thu, 23 Nov 2023 21:56:09 +0800
Subject: [PATCH v4 1/1] Add REINDEX tag to event triggers

This turns on the event trigger on command tag REINDEX.
This will now return a record for each index that was modified as
part of start/end ddl commands.

For concurrent reindex, this will return the OID for the new index. This
includes concurrently reindexing tables and databases. It will only
report the new index records and not the ones destroyed by the
concurrent reindex process.

We put the new index OID collection deeper into the 

Re: Table AM Interface Enhancements

2023-11-23 Thread Matthias van de Meent
Hi,

On Thu, 23 Nov 2023 at 13:43, Alexander Korotkov  wrote:
>
> Hello PostgreSQL Hackers,
>
> I am pleased to submit a series of patches related to the Table Access
> Method (AM) interface, which I initially announced during my talk at
> PGCon 2023 [1]. These patches are primarily designed to support the
> OrioleDB engine, but I believe they could be beneficial for other
> table AM implementations as well.
>
> The focus of these patches is to introduce more flexibility and
> capabilities into the Table AM interface. This is particularly
> relevant for advanced use cases like index-organized tables,
> alternative MVCC implementations, etc.
>
> Here's a brief overview of the patches included in this set:

Note: no significant review of the patches, just a first response on
the cover letters and oddities I noticed:

Overall, this patchset adds significant API area to TableAmRoutine,
without adding the relevant documentation on how it's expected to be
used. With the overall size of the patchset also being very
significant, I don't think this patch is reviewable as is; the goal
isn't clear enough, the APIs aren't well explained, and the
interactions with the index API are left up in the air.

> 0001-Allow-locking-updated-tuples-in-tuple_update-and--v1.patch
>
> Optimizes the process of locking concurrently updated tuples during
> update and delete operations. Helpful for table AMs where refinding
> existing tuples is expensive.

Is this essentially an optimized implementation of the "DELETE FROM
... RETURNING *" per-tuple primitive?

> 0003-Allow-table-AM-to-store-complex-data-structures-i-v1.patch
>
> Allows table AM to store complex data structure in rd_amcache rather
> than a single chunk of memory.

I don't think we should allow arbitrarily large and arbitrarily many
chunks of data in the relcache or table caches. Why isn't one chunk
enough?

> 0004-Add-table-AM-tuple_is_current-method-v1.patch
>
> This allows us to abstract how/whether table AM uses transaction identifiers.

I'm not a fan of the indirection here. Also, assuming that table slots
don't outlive transactions, wouldn't this be a more appropriate fit
with the table tuple slot API?

> 0005-Generalize-relation-analyze-in-table-AM-interface-v1.patch
>
> Provides a more flexible API for sampling tuples, beneficial for
> non-standard table types like index-organized tables.
>
> 0006-Generalize-table-AM-API-for-INSERT-.-ON-CONFLICT-v1.patch
>
> Provides a new table AM API method to encapsulate the whole INSERT ...
> ON CONFLICT ... algorithm rather than just implementation of
> speculative tokens.

Does this not still require speculative inserts, with speculative
tokens, for secondary indexes? Why make AMs implement that all over
again?

> 0007-Allow-table-AM-tuple_insert-method-to-return-the--v1.patch
>
> This allows table AM to return a native tuple slot, which is aware of
> table AM-specific system attributes.

This seems reasonable.

> 0008-Let-table-AM-insertion-methods-control-index-inse-v1.patch
>
> Allows table AM to skip index insertions in the executor and handle
> those insertions itself.

Who handles index tuple removal then? I don't see a patch that
describes index AM changes for this...

> 0009-Custom-reloptions-for-table-AM-v1.patch
>
> Enables table AMs to define and override reloptions for tables and indexes.
>
> 0010-Notify-table-AM-about-index-creation-v1.patch
>
> Allows table AMs to prepare or update specific meta-information during
> index creation.

I don't think the described use case of this API is OK - a table AM
cannot know about the internals of index AMs, and is definitely not
allowed to overwrite the information of that index.
If I ask for an index that uses the "btree" index, then that needs to
be the AM actually used, or an error needs to be raised if it is
somehow incompatible with the table AM used. It can't be that we
silently update information and create an index that is explicitly not
what the user asked to create.

I also don't see updates in documentation, which I think is quite a
shame as I have trouble understanding some parts.

> 0012-Introduce-RowID-bytea-tuple-identifier-v1.patch
>
> This patch introduces 'RowID', a new bytea tuple identifier, to
> overcome the limitations of the current 32-bit block number and 16-bit
> offset-based tuple identifier. This is particularly useful for
> index-organized tables and other advanced use cases.

We don't have any index methods that can handle anything but
block+offset TIDs, and I don't see any changes to the IndexAM APIs to
support these RowID tuples, so what's the plan here? I don't see any
of that in the commit message, nor in the rest of this patchset.

> Each commit message contains a detailed explanation of the changes and
> their rationale. I believe these enhancements will significantly
> improve the flexibility and capabilities of the PostgreSQL Table AM
> interface.

I've noticed there is not a lot of rationale for several of the
changes as to why Postgr

Improving the comments in pqsignal()

2023-11-23 Thread Thomas Munro
Hi,

While following along with Tristan and Heikki's thread about signals
in psql, it occurred to me that the documentation atop pqsignal() is
not very good:

 * we don't explain what problem it originally solved
 * we don't explain why it's still needed today
 * we don't explain what else it does for us today
 * we describe the backend implementation for Windows incorrectly (mea culpa)
 * we vaguely mention one issue with Windows frontend code, but I
think the point made is misleading, and we don't convey the scale of
the differences

Here is my attempt to improve it.
From 49b979d9a8333169aed4f1bc549f66e0322f01b0 Mon Sep 17 00:00:00 2001
From: Thomas Munro 
Date: Thu, 23 Nov 2023 17:30:15 +1300
Subject: [PATCH] Improve comments about pqsignal().

Explain where pqsignal() came from, what problem it originally solved
without assuming the reader is familiar with historical Unixen, why we
still need it, what it does for us now, and the key differences in
frontend code on Windows.
---
 src/port/pqsignal.c | 31 +++
 1 file changed, 23 insertions(+), 8 deletions(-)

diff --git a/src/port/pqsignal.c b/src/port/pqsignal.c
index 83d876db6c..817a0a5030 100644
--- a/src/port/pqsignal.c
+++ b/src/port/pqsignal.c
@@ -11,16 +11,31 @@
  * IDENTIFICATION
  *	  src/port/pqsignal.c
  *
- *	We now assume that all Unix-oid systems have POSIX sigaction(2)
- *	with support for restartable signals (SA_RESTART).  We used to also
- *	support BSD-style signal(2), but there really shouldn't be anything
- *	out there anymore that doesn't have the POSIX API.
+ *	This is program 10.12 from Advanced Programming in the UNIX Environment,
+ *	with minor changes.  It was originally a replacement needed for old SVR4
+ *	systems whose signal() behaved as if sa_flags = SA_RESETHAND | SA_NODEFER,
+ *	also known as "unreliable" signals due to races when the handler was reset.
+ *
+ *	By now, all known modern Unix systems have a "reliable" signal() call.
+ *	We still don't want to use it though, because it remains
+ *	implementation-defined by both C99 and POSIX whether the handler is reset
+ *	or signals are blocked when the handler runs, and default restart behavior
+ *	is underdocumented.  Therefore we take POSIX's advice and call sigaction()
+ *	so we can provide explicit sa_flags, but wrap it in this more convenient
+ *	traditional interface style.  It also provides a place to set any extra
+ *	flags we want everywhere, such as SA_NOCLDSTOP.
  *
  *	Windows, of course, is resolutely in a class by itself.  In the backend,
- *	we don't use this file at all; src/backend/port/win32/signal.c provides
- *	pqsignal() for the backend environment.  Frontend programs can use
- *	this version of pqsignal() if they wish, but beware that this does
- *	not provide restartable signals on Windows.
+ *	this relies on pqsigaction() in src/backend/port/win32/signal.c, which
+ *	provides limited emulation of reliable signals.  Frontend programs can use
+ *	this version of pqsignal() to forward to the native Windows signal() call
+ *	if they wish, but beware that Windows signals behave quite differently.
+ *	Only the 6 signals required by C are supported.  SIGINT handlers run in
+ *	another thread instead of interrupting an existing thread, and the others
+ *	don't interrupt system calls either, so SA_RESTART is moot.  All except
+ *	SIGFPE have SA_RESETHAND semantics, meaning the handler is reset to SIG_DFL
+ *	each time it runs.  The set of things you are allowed to do in a handler is
+ *	also much more restricted than on Unix, according to the documentation.
  *
  * 
  */
-- 
2.42.1



Re: Properly pathify the union planner

2023-11-23 Thread David Rowley
On Thu, 2 Nov 2023 at 12:42, David Rowley  wrote:
> One part that still needs work is the EquivalenceClass building.
> Because we only build the final targetlist for the Append after
> planning all the append child queries, I ended up having to populate
> the EquivalenceClasses backwards, i.e. children first. add_eq_member()
> determines if you're passing a child member by checking if parent !=
> NULL.  Since I don't have a parent EquivalenceMember to pass,
> em_is_child gets set wrongly, and that causes problems because
> ec_has_const can get set to true when it shouldn't. This is a problem
> as it can make a PathKey redundant when it isn't.  I wonder if I'll
> need to change the signature of add_eq_member() and add an "is_child"
> bool to force the EM to be a child em... Needs more thought...

I've spent more time working on this and I ended up solving the above
problem by delaying the subquery path creation on the union children
until after we've built the top-level targetlist.  This allows the
parent eclasses to be correctly added before adding members for the
union children. (See build_setop_child_paths() in the patch)

There's been quite a bit of progress in other areas too.  Incremental
sorts now work:

# create table t1(a int primary key, b int not null);
# create table t2(a int primary key, b int not null);
# insert into t1 select x,x from generate_Series(1,1000000)x;
# insert into t2 select x,x from generate_Series(1,1000000)x;
# vacuum analyze t1,t2;


# explain (costs off) select * from t1 union select * from t2;
                   QUERY PLAN
------------------------------------------------
 Unique
   ->  Merge Append
         Sort Key: t1.a, t1.b
         ->  Incremental Sort
               Sort Key: t1.a, t1.b
               Presorted Key: t1.a
               ->  Index Scan using t1_pkey on t1
         ->  Incremental Sort
               Sort Key: t2.a, t2.b
               Presorted Key: t2.a
               ->  Index Scan using t2_pkey on t2
(11 rows)

However, I've not yet made the MergeAppend UNIONs work when the
datatypes don't match on either side of the UNION.  For now, the
reason this does not work is due to convert_subquery_pathkeys() being
unable to find the pathkey targets in the targetlist.  The actual
targets can't be found due to the typecast.  I wondered if this could
be fixed by adding an additional projection path to the subquery when
the output columns don't match the setop->colTypes, but I'm a bit put
off by the comment in transformSetOperationTree:

> * For all non-UNKNOWN-type cases, we verify coercibility but we
> * don't modify the child's expression, for fear of changing the
> * child query's semantics.

I assume that's worried about the semantics of things like WHERE
clauses, so maybe the projection path in the subquery would be ok. I
need to spend more time on that.
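
To make that concrete, here is a sketch (reusing t1/t2 from above) of the
kind of UNION whose sides differ in type, so the subquery targetlist
carries a typecast that convert_subquery_pathkeys() cannot see through:

-- a sketch of the currently non-working shape: the int columns get
-- coerced to bigint, so the pathkey targets hide behind typecasts
explain (costs off)
select a, b::bigint from t1
union
select a, b from t2;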

Another problem I hit was add_path() pfreeing a Path that I needed.
This was happening due to how I'm building the final paths in the
subquery when setop_pathkeys are set.  Because I want to always
include the cheapest_input_path to allow that path to be used in
hash-based UNIONs, I also want to provide sorted paths so that
MergeAppend has something to work with.  I found cases where I'd
add_path() the cheapest_input_path to the final rel then also attempt
to sort that path.  Unfortunately, add_path() found the unsorted path
and the sorted path fuzzily the same cost and opted to keep the sorted
one due to it having better pathkeys. add_path() then pfree'd the
cheapest_input_path which meant the Sort's subpath was gone which
obviously caused issues in createplan.c.

For now, as a temporary fix, I've just #ifdef'd out the code in
add_path() that's pfreeing the old path.  I have drafted a patch that
refcounts Paths, but I'm unsure if that's the correct solution as I'm
only maintaining the refcounts in add_path() and add_partial_path(). I
think a true correct solution would bump the refcount when a path is
used as some other path's subpath.  That would mean having to
recursively pfree paths up until we find one with a refcount>0. Seems
a bit expensive for add_path() to do.

I've attached the updated patch.  This one is probably ready for
someone to test out. There will be more work to do, however.

David
diff --git a/contrib/postgres_fdw/expected/postgres_fdw.out b/contrib/postgres_fdw/expected/postgres_fdw.out
index 22cae37a1e..51e6dd028c 100644
--- a/contrib/postgres_fdw/expected/postgres_fdw.out
+++ b/contrib/postgres_fdw/expected/postgres_fdw.out
@@ -11194,6 +11194,11 @@ DROP INDEX base_tbl1_idx;
 DROP INDEX base_tbl2_idx;
 DROP INDEX async_p3_idx;
 -- UNION queries
+SET enable_sort TO off;
+SET enable_incremental_sort TO off;
+-- Adjust fdw_startup_cost so that we get an unordered path in the Append.
+-- XXX or is it better to just accept the plan change?
+ALTER SERVER loopback2 OPTIONS (ADD fdw_startup_cost '0.00');
 EXPLAIN (VERBOSE, COSTS OFF)
 INSERT INTO result_tbl
 (SELECT a, b, 'AAA' || c FROM a

Re: Questions regarding Index AMs and natural ordering

2023-11-23 Thread Peter Geoghegan
On Thu, Nov 23, 2023 at 11:15 AM Tom Lane  wrote:
> Agreed on that, but I don't have that hard a time imagining cases
> where it might be useful for btree not to guarantee ordered output.
> IIRC, it currently has to do extra pushups to ensure that behavior
> in ScalarArrayOp cases.  We've not bothered to expand the planner
> infrastructure to distinguish "could, but doesn't" paths for btree
> scans, but that's more about it not being a priority than because it
> wouldn't make sense.

I'm glad that you brought that up, because it's an important issue for
my ScalarArrayOp patch (which Matthias referenced). My patch simply
removes this restriction from the planner -- now ScalarArrayOp clauses
aren't treated as a special case when generating path keys. This works
in all cases because the patch generalizes nbtree's approach to native
ScalarArrayOp execution, allowing index scans to work as one
continuous index scan in all cases.

As you might recall, the test case that exercises the issue is:

SELECT thousand, tenthous FROM tenk1
WHERE thousand < 2 AND tenthous IN (1001,3000)
ORDER BY thousand;

It doesn't actually make much sense to execute this as two primitive
index scans, though. The most efficient approach is to perform a
single index descent, while still being able to use a true index qual
for "tenthous" (since using a filter qual is so much slower due to the
overhead of accessing the heap just to filter out non-matching
tuples). That's what the patch does.

This would be true even without the "ORDER BY" -- accessing the leaf
page once is faster than accessing it twice (same goes for the root).
So I see no principled reason why we'd ever really want to start a
primitive index scan that wasn't "anchored to an equality constraint".
This is reliably faster, while also preserving index sort order,
almost as a side-effect.

-- 
Peter Geoghegan




Re: PL/pgSQL: Incomplete item Allow handling of %TYPE arrays, e.g. tab.col%TYPE[]

2023-11-23 Thread Pavel Stehule
On Thu, 23 Nov 2023 at 13:28, Quan Zongliang  wrote:

>
>
> On 2023/11/20 17:33, Pavel Stehule wrote:
>
> >
> >
> > I did some deeper check:
> >
> > - I don't like the parser modification too much (I am sending an
> > alternative implementation of my own) - the SQL parser allows richer
> > syntax, and full functionality is only a few lines more
> Agree.
>
> >
> > - original patch doesn't solve %ROWTYPE
> >
> > (2023-11-20 10:04:36) postgres=# select * from foo;
> > ┌────┬────┐
> > │ a  │ b  │
> > ╞════╪════╡
> > │ 10 │ 20 │
> > │ 30 │ 40 │
> > └────┴────┘
> > (2 rows)
> >
> > (2023-11-20 10:08:29) postgres=# do $$
> > declare v foo%rowtype[];
> > begin
> >v := array(select row(a,b) from foo);
> >raise notice '%', v;
> > end;
> > $$;
> > NOTICE:  {"(10,20)","(30,40)"}
> > DO
> >
> two little fixes
> 1. spelling mistake
>ARRAY [ icons ] --> ARRAY [ iconst ]
> 2. code bug
>if (!OidIsValid(dtype->typoid)) --> if (!OidIsValid(array_typeid))
>
>
> > - original patch doesn't solve type RECORD
> > the error message should be more intuitive, although the arrays of
> > record type can be supported, but it probably needs bigger research.
> >
> > (2023-11-20 10:10:34) postgres=# do $$
> > declare r record; v r%type[];
> > begin
> >v := array(select row(a,b) from foo);
> >raise notice '%', v;
> > end;
> > $$;
> > ERROR:  syntax error at or near "%"
> > LINE 2: declare r record; v r%type[];
> >   ^
> > CONTEXT:  invalid type name "r%type[]"
> >
> Currently only scalar variables are supported.
> This error is consistent with the r%type error. And record arrays are
> not currently supported.
> Support for r%type should be considered first. For now, let r%type[]
> report the same error as record[].
> I prefer to implement it with a new patch.
>

ok


>
> > - missing documentation
> My English is not good. I wrote it down, please correct it. Add a note
> in the "Record Types" documentation that arrays and "Copying Types" are
> not supported yet.
>
> >
> > - I don't like using the word "partitioned" in the regress test name
> > "partitioned_table". It is confusing
> fixed
>

I modified the documentation a little bit - I don't think we need to
additionally describe the SQL ARRAY syntax.
I rewrote regress tests - we don't need to test unsupported functionality
(related to RECORD).

- all tests passed
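
For reference, a minimal sketch of what the patch enables, reusing the foo
table from upthread:

do $$
declare v foo.a%type[];
begin
  v := array(select a from foo);
  raise notice '%', v;  -- {10,30}
end;
$$;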

Regards

Pavel


>
> >
> > Regards
> >
> > Pavel
From cb0af209ab82baa19ff916d4acccf30d3aec97b1 Mon Sep 17 00:00:00 2001
From: "ok...@github.com" 
Date: Thu, 23 Nov 2023 18:39:27 +0100
Subject: [PATCH] support of syntax %type[] and %rowtype[]

---
 doc/src/sgml/plpgsql.sgml | 40 +++
 src/pl/plpgsql/src/pl_comp.c  | 23 +
 src/pl/plpgsql/src/pl_gram.y  | 60 ++-
 src/pl/plpgsql/src/plpgsql.h  |  1 +
 src/test/regress/expected/plpgsql.out | 70 +++
 src/test/regress/sql/plpgsql.sql  | 64 
 6 files changed, 245 insertions(+), 13 deletions(-)

diff --git a/doc/src/sgml/plpgsql.sgml b/doc/src/sgml/plpgsql.sgml
index 5977534a62..d10cb31fe5 100644
--- a/doc/src/sgml/plpgsql.sgml
+++ b/doc/src/sgml/plpgsql.sgml
@@ -766,6 +766,40 @@ SELECT merge_fields(t.*) FROM table1 t WHERE ... ;

   
 
+  
+   Arrays of Copying Types and Row Types
+
+
+name variable%TYPE[];
+name table_name%ROWTYPE[];
+
+
+   
+Arrays of copying types and row types are defined by appending square brackets
+([]) to %TYPE or %ROWTYPE.
+Their definition is similar to PostgreSQL's arrays described in .
+For example:
+
+user_id users.user_id%TYPE[];
+user_row users%ROWTYPE[];
+
+The syntax allows the exact size of arrays to be specified. However, the current
+implementation ignores any supplied array size limits, i.e., the behavior is the
+same as for arrays of unspecified length.
+   
+
+   
+An alternative syntax, which conforms to the SQL standard by using the keyword
+ARRAY, can be used for one-dimensional or multi-dimensional
+arrays too:
+
+user_id users.user_id%TYPE ARRAY;
+user_rows users%ROWTYPE ARRAY[4][3];
+
+As before, however, PostgreSQL does not enforce the size restriction in any case.
+   
+  
+
   
Record Types
 
@@ -794,6 +828,12 @@ SELECT merge_fields(t.*) FROM table1 t WHERE ... ;
 calling query is parsed, whereas a record variable can change its row
 structure on-the-fly.

+
+   
+The RECORD cannot be used for declaration of variable
+of an array type. "Copying Types" as shown in 
+are not supported too.
+   
   
 
   
diff --git a/src/pl/plpgsql/src/pl_comp.c b/src/pl/plpgsql/src/pl_comp.c
index a341cde2c1..a9cb15df6d 100644
--- a/src/pl/plpgsql/src/pl_comp.c
+++ b/src/pl/plpgsql/src/pl_comp.c
@@ -2095,6 +2095,29 @@ plpgsql_build_datatype(Oid typeOid, int32 typmod,
 	return typ;
 }
 
+/*
+ * Returns an array for type specified as argument.
+ */
+PLpgSQL_type *
+plpgsql_datatype_arrayof(PLpgSQL_type *dtyp

Re: Questions regarding Index AMs and natural ordering

2023-11-23 Thread Tom Lane
Peter Geoghegan  writes:
> On Thu, Nov 23, 2023 at 9:16 AM Matthias van de Meent
>  wrote:
>> Is returning index scan results in (reverse) natural order not
>> optional but required with amcanorder? If it is required, why is the
>> am indicator called 'canorder' instead of 'willorder', 'doesorder' or
>> 'isordered'?

> I don't know. I have a hard time imagining an index AM that is
> amcanorder=true that isn't either nbtree, or something very similar
> (so similar that it seems unlikely that anybody would actually go to
> the trouble of implementing it from scratch).

Agreed on that, but I don't have that hard a time imagining cases
where it might be useful for btree not to guarantee ordered output.
IIRC, it currently has to do extra pushups to ensure that behavior
in ScalarArrayOp cases.  We've not bothered to expand the planner
infrastructure to distinguish "could, but doesn't" paths for btree
scans, but that's more about it not being a priority than because it
wouldn't make sense.  If we did put work into that, we'd probably
generate multiple indexscan Paths for the same index and same index
conditions, some of which are marked with sort ordering PathKeys and
some of which aren't (and have a flag that would eventually tell the
index AM not to bother with sorting at runtime).

> The general notion of a data type's sort order comes from its default
> btree operator class, so the whole idea of a generic sort order is
> deeply tied to the nbtree AM.

Right.  There hasn't been a reason to decouple that, just as our
notions of hashing are tied to the hash AM.  This doesn't entirely
foreclose other AMs that handle sorted output, but it constrains
them to use btree's opclasses to represent the ordering.

regards, tom lane




Re: Questions regarding Index AMs and natural ordering

2023-11-23 Thread Peter Geoghegan
On Thu, Nov 23, 2023 at 9:16 AM Matthias van de Meent
 wrote:
> For example, btree ignores any ordering scan keys that it is given in
> btrescan, which seems fine for btree because the ordering of a btree
> is static (and no other order than that is expected apart from its
> reverse order), but this becomes problematic for other indexes that
> could return ordered data but would prefer not to have to go through
> the motions of making sure the return order is 100% correct, rather
> than a k-sorted sequence, or just the matches to the quals (like
> GIST). Is returning index scan results in (reverse) natural order not
> optional but required with amcanorder? If it is required, why is the
> am indicator called 'canorder' instead of 'willorder', 'doesorder' or
> 'isordered'?

I don't know. I have a hard time imagining an index AM that is
amcanorder=true that isn't either nbtree, or something very similar
(so similar that it seems unlikely that anybody would actually go to
the trouble of implementing it from scratch).

You didn't mention support for merge joins. That's one of the defining
characteristics of an amcanorder=true index AM, since an
"ammarkpos/amrestrpos function need only be provided if the access
method supports ordered scans". It's hard to imagine how that could
work with a loosely ordered index. It seems to imply that the scan
must always work with a simple linear order.

Cases where the planner uses a merge join often involve an index path
with an "interesting sort order" that "enables" the merge join.
Perhaps most of the alternative plans (that were almost as cheap as
the merge join plan) would have had to scan the same index in the same
way anyway, so it ends up making sense to use a merge join. The merge
join can get ordered results from the index "at no extra cost". (All
of this is implicit, of course -- the actual reason why the planner
chose the merge join plan is because it worked out to be the cheapest
plan.)

My impression is that this structure is fairly well baked in -- which
the optimizer/README goes into. That is, the planner likes to think of
paths as having an intrinsic order that merge joins can take advantage
of -- merge joins tend to win by being "globally optimal" without
being "locally optimal". So generating extra index paths that don't
have any intrinsic order (but can be ordered using a special kind of
index scan) seems like it might be awkward to integrate.

I'm far from an expert on the planner, so take anything that I say
about it with a grain of salt.

> Alternatively, if an am should be using the order scan keys from
> .amrescan and natural order scans also get scan keys, is there some
> place I find the selection process for ordered index AMs, and how this
> ordering should be interpreted? There are no good examples available
> in core code because btree is special-cased, and there are no other
> in-tree AMs that have facilities where both `amcanorderbyop` and
> `amcanorder` are set.

The general notion of a data type's sort order comes from its default
btree operator class, so the whole idea of a generic sort order is
deeply tied to the nbtree AM. That's why we sometimes have btree
operator classes for types that you'd never actually want to index (at
least not using a btree index).

-- 
Peter Geoghegan




Re: Use index to estimate expression selectivity

2023-11-23 Thread Tomas Vondra



On 11/23/23 18:30, Tom Lane wrote:
> Bono Stebler  writes:
>> After discussing the issue on irc, it looks like it could be possible 
>> for the planner to use a partial index matching an expression exactly to 
>> estimate its selectivity.
> 
> I think going forward we're going to be more interested in extending
> CREATE STATISTICS than in adding special behaviors around indexes.
> An index is a pretty expensive thing to maintain if you really only
> want some statistics.  Contrariwise, if you need the index for
> functional reasons (perhaps to enforce some strange uniqueness
> constraint) but you don't like some decision the planner takes because
> of the existence of that index, you're kind of stuck.  So decoupling
> this stuff makes more sense from where I sit.
> 

I agree adding indexes if you only really want the statistics part would
be rather expensive, but I do think using indexes created for functional
reasons as a source of statistics is worth consideration.

Actually, I've been experimenting with using btree indexes to estimate
certain conditions (e.g. the simplest var=const), and the estimates
calculated from internal pages are quite useful. Maybe not as the primary
estimate, but to set a "safe" range for non-MCV items. For example, if the
traditional estimate says 1 row matches, but we see there are ~100 leaf
pages for that key, maybe we should bump up the estimate ...

But yeah, it may affect query planning in undesirable ways. Perhaps
we could have a "use for statistics" reloption or something ...
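
If we ever wanted that, it might look something like this (hypothetical
syntax, not implemented):

-- hypothetical, not implemented: keep the index for its functional
-- purpose but exclude it from planner estimation
ALTER INDEX some_idx SET (use_for_statistics = off);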


regards

-- 
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company




Re: Use index to estimate expression selectivity

2023-11-23 Thread Tom Lane
Bono Stebler  writes:
> After discussing the issue on irc, it looks like it could be possible 
> for the planner to use a partial index matching an expression exactly to 
> estimate its selectivity.

I think going forward we're going to be more interested in extending
CREATE STATISTICS than in adding special behaviors around indexes.
An index is a pretty expensive thing to maintain if you really only
want some statistics.  Contrariwise, if you need the index for
functional reasons (perhaps to enforce some strange uniqueness
constraint) but you don't like some decision the planner takes because
of the existence of that index, you're kind of stuck.  So decoupling
this stuff makes more sense from where I sit.

Having said that ...

> Here is a simplified version (thanks ysch) of the issue I am facing:
> https://dbfiddle.uk/flPq8-pj
> I have tried using CREATE STATISTICS as well but haven't found a way to 
> improve the planner estimation for that query.

I assume what you did was try to make stats on "synchronized_at IS
DISTINCT FROM updated_at"?  Yeah, it does not surprise me that we fail
to match that to this query.  The trouble with expression statistics
(and expression indexes) is that it's impractical to match every
subexpression of the query to every subexpression that might be
presented by CREATE STATISTICS: you soon get into exponential
behavior.  So there's a limited set of contexts where we look for
a match.

I experimented a bit and found that if you do have statistics on that,
then "WHERE (synchronized_at IS DISTINCT FROM updated_at) IS TRUE"
will consult the stats.  Might do as a hacky workaround.
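
Spelled out, the workaround might look like this (a sketch; the table is
patterned on the fiddle above, and expression statistics need PG 14+):

CREATE STATISTICS item_sync_stats
    ON ((synchronized_at IS DISTINCT FROM updated_at)) FROM item;
ANALYZE item;

-- writing the qual as "(expr) IS TRUE" lets the planner match the
-- expression statistics
SELECT * FROM item
WHERE (synchronized_at IS DISTINCT FROM updated_at) IS TRUE;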

regards, tom lane




Questions regarding Index AMs and natural ordering

2023-11-23 Thread Matthias van de Meent
Hi,

Over in [0] and [1] there are patches that touch on the topic of
'natural ordering' index retrieval, and [2] also touches on the topic.
For those patches, I've been looking at how the planner and executor
indicate to index AMs that they expect the output to be ordered, and
how this ordering should work.
I've mostly found how it works for index_key opr constant, but I've
yet to find a good mental model for how the planner handles indexes
that can expose the 'intrinsic order' of data, i.e. indexes with
`amcanorder=true`, because there is very little (if any) real
documentation on what is expected from indexes when it advertises
certain features, and how the executor signals to the AM that it wants
to make use of those features.

For example, btree ignores any ordering scan keys that it is given in
btrescan, which seems fine for btree because the ordering of a btree
is static (and no other order than that is expected apart from its
reverse order), but this becomes problematic for other indexes that
could return ordered data but would prefer not to have to go through
the motions of making sure the return order is 100% correct, rather
than a k-sorted sequence, or just the matches to the quals (like
GIST). Is returning index scan results in (reverse) natural order not
optional but required with amcanorder? If it is required, why is the
am indicator called 'canorder' instead of 'willorder', 'doesorder' or
'isordered'?

Alternatively, if an am should be using the order scan keys from
.amrescan and natural order scans also get scan keys, is there some
place I find the selection process for ordered index AMs, and how this
ordering should be interpreted? There are no good examples available
in core code because btree is special-cased, and there are no other
in-tree AMs that have facilities where both `amcanorderbyop` and
`amcanorder` are set.

I did read through indexam.sgml, but that does not give a clear answer
on this distinction of 'amcanorder' having required ordered results or
not, nor on how to interpret amrescan's orderbys argument. I also
looked at planner code where it interacts with amcanorder /
amcanorderbyop, but I couldn't wrap my head around its interactions
with indexes, either (more specifically, the ordering part of those
indexes) due to the complexity of the planner and the many layers that
the various concepts are passed through. The README in
backend/optimizer didn't answer this question for me, either.

Kind regards,

Matthias van de Meent
Neon (https://neon.tech)

[0] 
https://www.postgresql.org/message-id/flat/EB2AF704-70FC-4D73-A97A-A7884A0381B5%40kleczek.org
[1] 
https://www.postgresql.org/message-id/flat/CAH2-Wz%3DksvN_sjcnD1%2BBt-WtifRA5ok48aDYnq3pkKhxgMQpcw%40mail.gmail.com
[2] 
https://www.postgresql.org/message-id/flat/e70fa091-e338-1598-9de4-6d0ef6b693e2%40enterprisedb.com




Re: autovectorize page checksum code included elsewhere

2023-11-23 Thread Nathan Bossart
On Thu, Nov 23, 2023 at 05:50:48PM +0700, John Naylor wrote:
> On Thu, Nov 23, 2023 at 1:49 AM Nathan Bossart  
> wrote:
>> One half-formed idea I have is to introduce some sort of ./configure flag
>> that enables all the newer instructions that your CPU understands.
> 
> That's not doable,

It's not?

> but we should consider taking advantage of
> x86-64-v2, which RedHat 9 builds with. That would allow inlining CRC
> and popcount there.

Maybe we have something like --with-isa-level where you can specify
x86-64-v[1-4] or "auto" to mean "build whatever the current machine can
handle."  I can imagine packagers requiring v2 these days (it's probably
worth asking them), and that would not only allow compiling in SSE 4.2 on
many more machines, but it would also open up a path to supporting
AVX2/AVX512 and beyond.

> Not sure how to detect that easily.

I briefly looked around and didn't see a portable way to do so.  We might
have to exhaustively check the features, which doesn't seem like it'd be
too bad for x86_64, but I haven't looked too closely at other
architectures.

-- 
Nathan Bossart
Amazon Web Services: https://aws.amazon.com




Re: undetected deadlock in ALTER SUBSCRIPTION ... REFRESH PUBLICATION

2023-11-23 Thread Tomas Vondra


On 11/23/23 10:24, Amit Kapila wrote:
> On Wed, Nov 22, 2023 at 4:51 PM Tomas Vondra
>  wrote:
>>
>> On 11/22/23 11:38, Amit Kapila wrote:
>>>
>>> Okay. IIUC, what's going on here is that the apply worker acquires
>>> AccessShareLock on pg_subscription to update rel state for one of the
>>> tables say tbl-1, and then for another table say tbl-2, it started
>>> waiting for a state change via wait_for_relation_state_change(). I
>>> think here the fix is to commit the transaction before we go for a
>>> wait. I guess we need something along the lines of what is proposed in
>>> [1] though we have solved the problem in that thread in some other
>>> way..
>>>
>>
>> Possibly. I haven't checked if the commit might have some unexpected
>> consequences, but I can confirm I can no longer reproduce the deadlock
>> with the patch applied.
>>
> 
> Thanks for the verification. Offhand, I don't see any problem with
> doing a commit at that place but will try to think some more about it.
> I think we may want to call pgstat_report_stat(false) after commit to
> avoid a long delay in stats.
> 

Makes sense.

> I haven't verified but I think this will be a problem in
> back-branches as well.
> 

Yes. I haven't tried but I don't see why backbranches wouldn't have the
same issue.

regards

-- 
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company




Re: Catalog domain not-null constraints

2023-11-23 Thread Alvaro Herrera
On 2023-Nov-23, Peter Eisentraut wrote:

> This patch set applies the explicit catalog representation of not-null
> constraints introduced by b0e96f3119 for table constraints also to domain
> not-null constraints.

I like the idea of having domain not-null constraints appear in
pg_constraint.

> Since there is no inheritance or primary keys etc., this is much simpler and
> just applies the existing infrastructure to domains as well.

If you create a table with column of domain that has a NOT NULL
constraint, what happens?  I mean, is the table column marked
attnotnull, and how does it behave?  Is there a separate pg_constraint
row for the constraint in the table?  What happens if you do
ALTER TABLE ... DROP NOT NULL for that column?
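
In other words, a sketch of the scenario being asked about:

CREATE DOMAIN nn_int AS integer NOT NULL;
CREATE TABLE t (x nn_int);

-- is the column marked attnotnull, and is there a pg_constraint row?
SELECT attnotnull FROM pg_attribute
WHERE attrelid = 't'::regclass AND attname = 'x';

-- and what should this do, given the domain itself still forbids NULLs?
ALTER TABLE t ALTER COLUMN x DROP NOT NULL;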

-- 
Álvaro Herrera   48°01'N 7°57'E  —  https://www.EnterpriseDB.com/
"Por suerte hoy explotó el califont porque si no me habría muerto
 de aburrido"  (Papelucho)




Re: Catalog domain not-null constraints

2023-11-23 Thread Alvaro Herrera
On 2023-Nov-23, Aleksander Alekseev wrote:

> Interestingly enough according to the documentation this syntax is
> already supported [1][2], but the actual query will fail on `master`:
> 
> ```
> =# create domain connotnull integer;
> CREATE DOMAIN
> =# alter domain connotnull add not null value;
> ERROR:  unrecognized constraint subtype: 1
> ```

Hah, nice ... this only fails in this way on master, though, as a
side-effect of previous NOT NULL work during this cycle.  So if we take
Peter's patch, we don't need to worry about it.  In 16 it behaves
properly, with a normal syntax error.

> ```
> =# create domain connotnull1 integer;
> =# create domain connotnull2 integer;
> =# alter domain connotnull1 add not null value;
> =# alter domain connotnull2 set not null;
> =# \dD
> ERROR:  unexpected null value in cached tuple for catalog
> pg_constraint column conkey
> ```

This is also a master-only problem, as "add not null" is rejected in 16
with a syntax error (and obviously \dD doesn't fail).

> NOT VALID is not supported:
> 
> ```
> =# alter domain connotnull add not null value not valid;
> ERROR:  NOT NULL constraints cannot be marked NOT VALID
> ```

Yeah, it'll take more work to let NOT NULL constraints be marked NOT
VALID, both on domains and on tables.  It'll be a good feature for sure.
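
For comparison, a sketch of what already works for domain CHECK
constraints today, which NOT NULL would presumably come to mirror:

CREATE DOMAIN posint AS integer;
ALTER DOMAIN posint ADD CONSTRAINT posint_chk CHECK (VALUE > 0) NOT VALID;
ALTER DOMAIN posint VALIDATE CONSTRAINT posint_chk;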

-- 
Álvaro Herrera   48°01'N 7°57'E  —  https://www.EnterpriseDB.com/
"La victoria es para quien se atreve a estar solo"




Re: [HACKERS] psql casts aspersions on server reliability

2023-11-23 Thread Bruce Momjian
On Wed, Nov 22, 2023 at 10:25:14PM -0500, Bruce Momjian wrote:
> On Wed, Nov 22, 2023 at 07:38:52PM -0500, Tom Lane wrote:
> > Bruce Momjian  writes:
> > > On Wed, Sep 28, 2016 at 09:14:41AM -0400, Tom Lane wrote:
> > >> I could go along with just dropping the last sentence ("This 
> > >> probably...")
> > >> if the last error we got was FATAL level.  I don't find "unexpectedly"
> > >> to be problematic here: from the point of view of psql, and probably
> > >> of its user, the shutdown *was* unexpected.
> > 
> > > I looked at this thread from 2016 and I think the problem is the
> > > "abnormally" word, since if the server was shutdown by the administrator
> > > (most likely), it isn't abnormal.  Here is a patch to remove
> > > "abnormally".
> > 
> > I do not think this is an improvement.  The places you are changing
> > are reacting to a connection closure.  *If* we had previously gotten a
> > "FATAL: terminating connection due to administrator command" message,
> > then yeah the connection closure is expected; but if not, it isn't.
> > Your patch adds no check for that.  (As I remarked in 2016, we could
> > probably condition this on the elevel being FATAL, rather than
> > checking for specific error messages.)
> 
> Yes, you are correct.  Here is a patch that implements the FATAL test,
> though I am not sure I have the logic correct or backwards, and I don't
> know how to test this.  Thanks.

I developed the attached patch which seems to work better.  In testing
kill -3 on a backend or calling elog(FATAL) in the server for a
session, libpq's 'res' is NULL, meaning we don't have any status to
check for PGRES_FATAL_ERROR.  It is very possible that libpq just isn't
structured to have the PGRES_FATAL_ERROR at the point where we issue this
message, and this is not worth improving.

test=> select pg_sleep(100);
--> FATAL:  FATAL called

server closed the connection unexpectedly
--> This probably means the server terminated null
before or while processing the request.
The connection to the server was lost. Attempting reset: Succeeded.

-- 
  Bruce Momjian  https://momjian.us
  EDB  https://enterprisedb.com

  Only you can decide what is important to you.
diff --git a/src/interfaces/libpq/fe-misc.c b/src/interfaces/libpq/fe-misc.c
index 660cdec93c..64faad19df 100644
--- a/src/interfaces/libpq/fe-misc.c
+++ b/src/interfaces/libpq/fe-misc.c
@@ -749,8 +749,11 @@ retry4:
 	 */
 definitelyEOF:
 	libpq_append_conn_error(conn, "server closed the connection unexpectedly\n"
-			"\tThis probably means the server terminated abnormally\n"
-			"\tbefore or while processing the request.");
+			"\tThis probably means the server terminated%s\n"
+			"\tbefore or while processing the request.",
+			(conn->result == NULL) ? " null" :
+			(conn->result->resultStatus == PGRES_FATAL_ERROR) ?
+			"" : " abnormally");
 
 	/* Come here if lower-level code already set a suitable errorMessage */
 definitelyFailed:
diff --git a/src/interfaces/libpq/fe-protocol3.c b/src/interfaces/libpq/fe-protocol3.c
index 5613c56b14..03914b97fc 100644
--- a/src/interfaces/libpq/fe-protocol3.c
+++ b/src/interfaces/libpq/fe-protocol3.c
@@ -2158,6 +2158,7 @@ pqFunctionCall3(PGconn *conn, Oid fnid,
 if (pqGetErrorNotice3(conn, true))
 	continue;
 status = PGRES_FATAL_ERROR;
+fprintf(stderr, "Got 'E'\n");
 break;
 			case 'A':			/* notify message */
 /* handle notify and go back to processing return values */
diff --git a/src/interfaces/libpq/fe-secure-openssl.c b/src/interfaces/libpq/fe-secure-openssl.c
index f1192d28f2..f4c7f51b0a 100644
--- a/src/interfaces/libpq/fe-secure-openssl.c
+++ b/src/interfaces/libpq/fe-secure-openssl.c
@@ -206,8 +206,11 @@ rloop:
 if (result_errno == EPIPE ||
 	result_errno == ECONNRESET)
 	libpq_append_conn_error(conn, "server closed the connection unexpectedly\n"
-			"\tThis probably means the server terminated abnormally\n"
-			"\tbefore or while processing the request.");
+			  "\tThis probably means the server terminated%s\n"
+			  "\tbefore or while processing the request.",
+			  (conn->result == NULL) ? " null" :
+			  (conn->result->resultStatus == PGRES_FATAL_ERROR) ?
+			  "" : " abnormally");
 else
 	libpq_append_conn_error(conn, "SSL SYSCALL error: %s",
 			SOCK_STRERROR(result_errno,
@@ -306,8 +309,11 @@ pgtls_write(PGconn *conn, const void *ptr, size_t len)
 result_errno = SOCK_ERRNO;
 if (result_errno == EPIPE || result_errno == ECONNRESET)
 	libpq_append_conn_error(conn, "server closed the connection unexpectedly\n"
-			"\tThis probably means the server terminated abnormally\n"
-			"\tbefore or while processing the request.");
+			  "\tThis probably means the server terminated%s\n"
+			  "\tbefore or while processing the request.",
+

Re: Synchronizing slots from primary to standby

2023-11-23 Thread Drouvot, Bertrand

Hi,

On 11/23/23 6:13 AM, Amit Kapila wrote:

On Tue, Nov 21, 2023 at 4:35 PM Drouvot, Bertrand
 wrote:


On 11/21/23 10:32 AM, shveta malik wrote:

On Tue, Nov 21, 2023 at 2:02 PM shveta malik  wrote:





v37 fails to apply to HEAD due to a recent commit e83aa9f92fdd,
rebased the patches.  PFA v37_2 patches.


Thanks!

Regarding the promotion flow: if the primary is available and reachable, I don't
think we currently try to ensure that slots are in sync. I think we'd miss any
activity between the last sync and the promotion request, or am I missing
something?

If the primary is available and reachable shouldn't we launch a last round of
synchronization (skipping all the slots that are not in 'r' state)?



We may miss the last round, but there is no guarantee that we can
ensure everything is synced even if the primary is available, because
after our last sync there could be some more activity.


I don't think so thanks to the fact that we ensure that logical walsenders
on the primary wait for the physical standby.

Indeed that should prevent any decoding activity on the primary while the
promotion is in progress on the standby (at least as soon as the
walreceiver is shut down).

So that I think that a promotion flow like:

- walreceiver shutdown
- last round of sync
- sync-worker shutdown

Should ensure that slots are in sync (as logical slots on the primary
should not be able to advance as soon as the walreceiver is shut down
during the promotion).


I think it is the user's responsibility to promote a new primary when
the old one is not required for some reason.


Do you mean they should ensure something like?

1. no more activity on the primary
2. check that the slots are in sync with the primary
3. promote

but then they could also (without the new feature we're building):

1. create and advance slots manually (pg_replication_slot_advance) on the
standby to sync them up at regular intervals

and then before promotion:

2. ensure no more activity on the primary
3. a last round of advancing the slots manually (see the sketch below)
4. promote
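
A sketch of steps 1 and 3 (the slot name and LSN are illustrative; in
practice the LSN would be read from pg_replication_slots on the primary):

-- on the standby (logical slots on a standby need PG 16+)
SELECT pg_create_logical_replication_slot('my_slot', 'pgoutput');
SELECT pg_replication_slot_advance('my_slot', '0/3000060'::pg_lsn);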

I think that ensuring the slots are in sync during promotion (should the primary
be available) would provide added value as compared to the above scenarios.


It is not only slots that
can be out of sync, but we can even miss fetching some of the data. I
think this is quite similar to what we do for WAL where on finding the
promotion signal, we shut down Walreceiver and just replay any WAL
that was already received by walreceiver.



Also, the promotion
shouldn't create any problem w.r.t subscribers connecting to the new
primary because the slot's position is slightly behind what could be
requested by subscribers which means the corresponding data will be
available on the new primary.



Right.


Do you have something in mind that can create any problem if we don't
attempt additional fetching round after the promotion signal is
received?


It's not a "real" problem per se, but in the case of non-synced slots, I can
see 2 cases:

- publisher/subscriber case: I don't see any problem here, since after
 an "alter subscription XXX connection ''" logical replication 
should
start from the right place thanks to the replication origin associated to the
subscription.

- non publisher/subscriber case (say pg_recvlogical that does not make use of
replication origin) then:

a) data since the last sync and promotion would be decoded again
unless b) or c)
b) user manually advances the slot on the standby after promotion
c) user restarts the decoding with an appropriate --startpos option

It's for this non publisher/subscriber case that I think it would be
beneficial to try to ensure that the slots are in sync during the promotion.

Regards,

--
Bertrand Drouvot
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com




Does pg support to write a new buffer cache as an extension?

2023-11-23 Thread jacktby jacktby




Re: PoC: prefetching index leaf pages (for inserts)

2023-11-23 Thread Tomas Vondra



On 11/23/23 14:26, Heikki Linnakangas wrote:
> On 06/11/2023 19:05, Tomas Vondra wrote:
>> As for the path forward, I think the prefetching is demonstrably
>> beneficial. There are cases where it can't help or even harms
>> performance. I think the success depends on three areas:
>>
>> (a) reducing the costs of the prefetching - For example right now we
>> build the index tuples twice (once for prefetch, once for the insert),
>> but maybe there's a way to do that only once? There are also predicate
>> indexes, and so on.
>>
>> (b) being smarter about when to prefetch - For example if we only have
>> one "prefetchable" index, it's somewhat pointless to prefetch (for
>> single-row cases). And so on.
>>
>> (c) not prefetching when already cached - This is somewhat related to
>> the previous case, but perhaps it'd be cheaper to first check if the
>> data is already cached. For shared buffers it should not be difficult,
>> for page cache we could use preadv2 with RWF_NOWAIT flag. The question
>> is if this is cheap enough to be cheaper than just doing posix_fadvise
>> (which however only deals with shared buffers).
> 
> I don't like this approach. It duplicates the tree-descend code, and it
> also duplicates the work of descending the tree at runtime. And it only
> addresses index insertion; there are a lot of places that could benefit
> from prefetching or async execution like this.
> 

Yeah, I think that's a fair assessment, although I think the amount of
duplicate code is pretty small (and perhaps it could be refactored to a
common function, which I chose not to do in the PoC patch).

> I think we should think of this as async execution rather than
> prefetching. We don't have the general infrastructure for writing async
> code, but if we did, this would be much simpler. In an async programming
> model, like you have in many other languages like Rust, python or
> javascript, there would be no separate prefetching function. Instead,
> aminsert() would return a future that can pause execution if it needs to
> do I/O. Something like this:
> 
> aminsert_futures = NIL;
> /* create a future for each index insert */
> for ()
> {
>     aminsert_futures = lappend(aminsert_futures, aminsert(...));
> }
> /* wait for all the futures to finish */
> await aminsert_futures;
> 
> The async-aware aminsert function would run to completion quickly if all
> the pages are already in cache. If you get a cache miss, it would start
> an async I/O read for the page, and yield to the other insertions until
> the I/O completes.
> 
> We already support async execution of FDWs now, with the
> ForeignAsyncRequest() and ForeignAsyncConfigureWait() callbacks. Can we
> generalize that?
> 

Interesting idea. I kinda like it in principle; however, I'm not very
familiar with how our async execution works (and perhaps even with async
implementations in general), so I can't quite say how difficult it would
be to do something like that in an AM (instead of the executor).

Where exactly would the boundary be between who "creates" and who
"executes" the requests, and what would the flow of execution be? For the
FDW it seems fairly straightforward, because the boundary is local/remote,
and the async request is executed in a separate process.

But here everything would happen locally, so how would that work?

Imagine we're inserting a tuple into two indexes. There's a bunch of
index pages we may need to read for each index - first some internal
pages, then some leafs. Presumably we'd want to do each page read as
asynchronous, and allow transfer of control to the other index.

IIUC the async-aware aminsert() would execute an async request for the
first page it needs to read, with a callback that essentially does the
next step of index descent - reads the page, determines the next page to
read, and then do another async request. Then it'd sleep, which would
allow transfer of control to the aminsert() on the other index. And then
we'd do a similar thing for the leaf pages.

Or do I imagine things wrong?

The thing I like about this async approach is that it would allow
prefetching all index pages, while my PoC patch simply assumes all
internal pages are in cache and prefetches only the first leaf page.
That's much simpler in terms of control flow, but has clear limits.

I however wonder if there are concurrency issues. Imagine there's a
COPY, i.e. we're inserting a batch of tuples. Can you run the aminsert()
for all the tuples concurrently? Won't that have issues with the
different "async threads" modifying the index for the other threads?

If those concurrent "insert threads" would be an issue, maybe we could
make amprefetch() async-aware ...


regards

-- 
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company




Re: [HACKERS] pg_upgrade vs vacuum_cost_delay

2023-11-23 Thread Magnus Hagander
On Thu, Nov 23, 2023 at 5:23 AM Bruce Momjian  wrote:
>
> On Thu, Jun 16, 2016 at 04:45:14PM +0200, Magnus Hagander wrote:
> > On Thu, Jun 16, 2016 at 4:35 PM, Euler Taveira  wrote:
> >
> > On 16-06-2016 09:05, Magnus Hagander wrote:
> > > Shouldn't pg_upgrade turn off vacuum cost delay when it vacuums the 
> > new
> > > cluster? Not talking about the post-analyze script, but when it runs
> > > vacuumdb to analyze and freeze before loading the new schema, in
> > > prepare_new_cluster()? Those run during downtime, so it seems like 
> > you'd
> > > want those to run as fast as possible.
> > >
> > Doesn't --new-options do the job?
> >
> >
> > You could, but it seems like it should do it by default.
>
> Based on this seven year old post, I realized there are minimal
> directions in pg_upgrade docs about how to generate statistics quickly,
> so I created this patch to help.
>
> We do have docs on updating planner statistics:
>
> 
> https://www.postgresql.org/docs/current/routine-vacuuming.html#VACUUM-FOR-STATISTICS
>
> but that doesn't seem to cover cases where you are doing an upgrade or
> pg_dump restore.  Should I move this information into there instead?

Wow, that's... A while :)

I don't think that final sentence really helps much - for anybody who
doesn't know that functionality well already, it will just be
confusing. At the very least it should be a link that sends you to the
documentation of how that functionality works?

But beyond that, perhaps what we'd really want (now that vacuumdb has
gained more functionality, and is used instead of the custom script
all the way) to add is a parameter --no-cost-delay that would issue a
SET to turn it off for the run?
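
Until something like that exists, one way to get the effect (assuming
superuser, and that vacuum_cost_delay was raised in the configuration) is:

ALTER SYSTEM SET vacuum_cost_delay = 0;
SELECT pg_reload_conf();
-- ... run vacuumdb ...
ALTER SYSTEM RESET vacuum_cost_delay;
SELECT pg_reload_conf();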

-- 
 Magnus Hagander
 Me: https://www.hagander.net/
 Work: https://www.redpill-linpro.com/




Use index to estimate expression selectivity

2023-11-23 Thread Bono Stebler

Dear hackers,

After discussing the issue on irc, it looks like it could be possible 
for the planner to use a partial index matching an expression exactly to 
estimate its selectivity.


Here is a simplified version (thanks ysch) of the issue I am facing:

https://dbfiddle.uk/flPq8-pj
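
For those who don't follow the link, the shape of the problem is roughly
this (a sketch patterned on the fiddle):

-- a partial index whose predicate is exactly the query's WHERE clause;
-- the planner still uses a generic selectivity estimate for the qual
-- instead of exploiting the (small) index size
CREATE TABLE item (
    id bigint PRIMARY KEY,
    updated_at timestamptz NOT NULL,
    synchronized_at timestamptz
);
CREATE INDEX item_needs_sync ON item (id)
    WHERE synchronized_at IS DISTINCT FROM updated_at;

EXPLAIN SELECT * FROM item
WHERE synchronized_at IS DISTINCT FROM updated_at;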

I have tried using CREATE STATISTICS as well but haven't found a way to 
improve the planner estimation for that query.


I have worked around the problem for my specific use case but that 
behavior is counter-intuitive, is there any interest in improving it?


Thank you!





Re: ALTER COLUMN ... SET EXPRESSION to alter stored generated column's expression

2023-11-23 Thread Amul Sul
On Mon, Nov 20, 2023 at 1:12 PM Peter Eisentraut 
wrote:

> On 17.11.23 13:25, Amul Sul wrote:
> > To fix this we should be doing something like ALTER COLUMN TYPE and the
> pass
> > should be AT_PASS_ALTER_TYPE (rename it or invent a new one near to
> that) so
> > that in ATRewriteCatalogs(), we would execute ATPostAlterTypeCleanup().
> >
> > I simply tried that by doing blind copy of code from
> > ATExecAlterColumnType() in
> > 0002 patch.  We don't really need to do all the stuff such as re-adding
> > indexes, constraints etc, but I am out of time for today to figure out
> the
> > optimum code and I will be away from work in the first half of the coming
> > week and the week after that. Therefore, I thought of sharing an
> approach to
> > get comments/thoughts on the direction, thanks.
>
> The exact sequencing of this seems to be tricky.  It's clear that we
> need to do it earlier than at the end.  I also think it should be
> strictly after AT_PASS_ALTER_TYPE so that the new expression can refer
> to the new type of a column.  It should also be after AT_PASS_ADD_COL,
> so that the new expression can refer to any newly added column.  But
> then it's after AT_PASS_OLD_INDEX and AT_PASS_OLD_CONSTR, is that a
> problem?
>

AT_PASS_ALTER_TYPE and AT_PASS_ADD_COL cannot be used together: ALTER TYPE
cannot see the newly added column. I think we can adopt the same behaviour.

But we need to have ALTER SET EXPRESSION run after ALTER TYPE: if we added
the new generation expression for the current type (e.g. int) and then
altered the type (e.g. to text or numeric), that would be problematic in
ATRewriteTable(), where the new generation expression would generate values
for the old type while the actual type is something else. Therefore I have
added AT_PASS_SET_EXPRESSION, which executes after AT_PASS_ALTER_TYPE.
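
For context, the command shape under discussion (a sketch using the
patch's proposed syntax):

CREATE TABLE tbl (
    a int,
    b int GENERATED ALWAYS AS (a + 1) STORED
);
-- replace the generation expression; the table is rewritten so existing
-- rows are recomputed with the new expression
ALTER TABLE tbl ALTER COLUMN b SET EXPRESSION AS (a * 2);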

(It might be an option for the first version of this feature to not
> support altering columns that have constraints on them.  But we do need
> to support columns with indexes on them.  Does that work ok?  Does that
> depend on the relative order of AT_PASS_OLD_INDEX?)
>

I tried to reuse the code by borrowing from ALTER TYPE; see if that
looks good to you.

But I have concerns with that code reuse: we drop and re-add the indexes
and constraints, which seems unnecessary for SET EXPRESSION, where the
column attributes stay the same. I don't know why ALTER TYPE does that for
indexes, since finish_heap_swap() does reindexing anyway. We could skip
re-adding indexes for SET EXPRESSION, which would be fine, but we cannot
skip the re-addition of constraints, since rebuilding constraints for
checking might need a good amount of copied code, especially for foreign
key constraints.

Please have a look at the attached version, 0001 patch does the code
refactoring, and 0002 is the implementation, using the newly refactored
code to
re-add indexes and constraints for the validation. Added tests for the same.

Regards,
Amul
From e6418c3f36618c517b160ab71895975773d16f6c Mon Sep 17 00:00:00 2001
From: Amul Sul 
Date: Wed, 22 Nov 2023 18:23:56 +0530
Subject: [PATCH v5 1/2] Code refactor: separate function to find all dependent
 objects on a column

Move the code from ATExecAlterColumnType() that finds all the objects
that depend on the column into a separate function.

Also, renamed ATPostAlterTypeCleanup() and ATPostAlterTypeParse()
function for the general use.
---
 src/backend/commands/tablecmds.c | 474 ---
 1 file changed, 248 insertions(+), 226 deletions(-)

diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index 323d9bf8702..ccc152f54e9 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -557,14 +557,16 @@ static void ATPrepAlterColumnType(List **wqueue,
 static bool ATColumnChangeRequiresRewrite(Node *expr, AttrNumber varattno);
 static ObjectAddress ATExecAlterColumnType(AlteredTableInfo *tab, Relation rel,
 		   AlterTableCmd *cmd, LOCKMODE lockmode);
+static void RememberAllDependentForRebuilding(AlteredTableInfo *tab,
+			  Relation rel, AttrNumber attnum);
 static void RememberConstraintForRebuilding(Oid conoid, AlteredTableInfo *tab);
 static void RememberIndexForRebuilding(Oid indoid, AlteredTableInfo *tab);
 static void RememberStatisticsForRebuilding(Oid stxoid, AlteredTableInfo *tab);
-static void ATPostAlterTypeCleanup(List **wqueue, AlteredTableInfo *tab,
-   LOCKMODE lockmode);
-static void ATPostAlterTypeParse(Oid oldId, Oid oldRelId, Oid refRelId,
- char *cmd, List **wqueue, LOCKMODE lockmode,
- bool rewrite);
+static void ATPostAlterColumnCleanup(List **wqueue, AlteredTableInfo *tab,
+	 LOCKMODE lockmode);
+static void ATPostAlterColumnParse(Oid oldId, Oid oldRelId, Oid refRelId,
+   char *cmd, List **wqueue, LOCKMODE lockmode,
+   bool rewrite);
 static void RebuildConstraintComment(AlteredTableInfo *tab, int pass,
 	 Oid objid, Relation rel, List *domn

Re: Parallel CREATE INDEX for BRIN indexes

2023-11-23 Thread Tomas Vondra
On 11/23/23 13:33, Matthias van de Meent wrote:
> Hi,
> 
> On Wed, 22 Nov 2023 at 20:16, Tomas Vondra
>  wrote:
>>
>> On 11/20/23 20:48, Matthias van de Meent wrote:
>>> On Wed, 8 Nov 2023 at 12:03, Tomas Vondra  
>>> wrote:

 Hi,

 here's an updated patch, addressing the review comments, and reworking
 how the work is divided between the workers & leader etc.

>>>
>>> After code-only review, here are some comments:
>>>
 +++ b/src/backend/access/brin/brin.c
 [...]
 +/* Magic numbers for parallel state sharing */
+#define PARALLEL_KEY_BRIN_SHARED		UINT64CONST(0xA000000000000001)
+#define PARALLEL_KEY_TUPLESORT			UINT64CONST(0xA000000000000002)
>>>
>>> These shm keys use the same constants also in use in
>>> access/nbtree/nbtsort.c. While this shouldn't be an issue in normal
>>> operations, I'd prefer if we didn't actively introduce conflicting
>>> identifiers when we still have significant amounts of unused values
>>> remaining.
>>>
>>
>> Hmmm. Is there some rule of thumb how to pick these key values?
> 
> None that I know of.
> There is a warning in various places that define these constants that
> they take care to not conflict with plan node's node_id: parallel plan
> execution uses plain plan node IDs as keys, and as node_id is
> int-sized, any other key value that's created manually of value < 2^32
> should be sure that it can't be executed in a parallel backend.
> But apart from that one case, I can't find a convention, no.
> 

OK, in that case 0xB is fine.

+#define PARALLEL_KEY_QUERY_TEXT		UINT64CONST(0xA000000000000003)
>>>
>>> This is the fourth definition of a PARALLEL%_KEY_QUERY_TEXT, the
>>> others being in access/nbtree/nbtsort.c (value 0xA000000000000004, one
>>> more than brin's), backend/executor/execParallel.c
>>> (0xE000000000000008), and PARALLEL_VACUUM_KEY_QUERY_TEXT (0x3) (though
>>> I've not checked that their uses are exactly the same, I'd expect at
>>> least btree to match mostly, if not fully, 1:1).
>>> I think we could probably benefit from a less ad-hoc sharing of query
>>> texts. I don't think that needs to happen specifically in this patch,
>>> but I think it's something to keep in mind in future efforts.
>>>
>>
>> I'm afraid I don't quite get what you mean by "ad hoc sharing of query
>> texts". Are you saying we shouldn't propagate the query text to the
>> parallel workers? Why? Or what's the proper solution?
> 
> What I mean is that we have several different keys that all look like
> they contain the debug query string, and always for the same debugging
> purposes. For debugging, I think it'd be useful to use one well-known
> key, rather than N well-known keys in each of the N parallel
> subsystems.
> 
> But as mentioned, it doesn't need to happen in this patch, as that'd
> increase scope beyond brin/index ams.
> 

Agreed.

>>> I also noticed that this is likely to execute `union_tuples` many
>>> times when pages_per_range is coprime with the parallel table scan's
>>> block stride (or when we for other reasons have many tuples returned
>>> for each range); and this `union_tuples` internally allocates and
>>> deletes its own memory context for its deserialization of the 'b'
>>> tuple. I think we should just pass a scratch context instead, so that
>>> we don't have the overhead of continuously creating then deleting the
>>> same memory context
>>
>> Perhaps. Looking at the code, isn't it a bit strange how union_tuples
>> uses the context? It creates the context, calls brin_deform_tuple in
>> that context, but then the rest of the function (including datumCopy and
>> similar stuff) happens in the caller's context ...
> 
> The union operator may leak (lots of) memory, so I think it makes
> sense to keep a context around that can be reset after we've extracted
> the merge result.
> 

But does the current code actually achieve that? It does create a "brin
union" context, but then it only does this:

/* Use our own memory context to avoid retail pfree */
cxt = AllocSetContextCreate(CurrentMemoryContext,
"brin union",
ALLOCSET_DEFAULT_SIZES);
oldcxt = MemoryContextSwitchTo(cxt);
db = brin_deform_tuple(bdesc, b, NULL);
MemoryContextSwitchTo(oldcxt);

Surely that does not limit the amount of memory used by the actual union
functions in any way?
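
As a minimal sketch (function shape and names assumed, not an actual
patch), the scratch-context variant being suggested could look like
this: the caller creates the context once, and each call ends in a
cheap reset instead of a create/delete cycle:

```
/*
 * Hypothetical reworking of union_tuples(): all per-merge allocations,
 * including whatever the opclass union functions leak, go into a
 * caller-supplied scratch context that is reset before returning.
 */
static void
union_tuples(BrinDesc *bdesc, BrinMemTuple *a, BrinTuple *b,
			 MemoryContext scratch_cxt)
{
	BrinMemTuple *db;
	MemoryContext oldcxt;

	oldcxt = MemoryContextSwitchTo(scratch_cxt);
	db = brin_deform_tuple(bdesc, b, NULL);
	/* ... call the union support functions on a/db here, still in
	 * scratch_cxt, so their leaks are bounded ... */
	MemoryContextSwitchTo(oldcxt);

	/* datumCopy() the merged values into the caller's context, then: */
	MemoryContextReset(scratch_cxt);
}
```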

>> However, I don't think the number of union_tuples calls is likely to be
>> very high, especially for large tables. Because we split the table into
>> 2048 chunks, and then cap the chunk size by 8192. For large tables
>> (where this matters) we're likely close to 8192.
> 
> I agree that the merging part of the index creation is the last part,
> and usually has no high impact on the total performance of the reindex
> operation, but in memory-constrained environments releasing and then
> requesting the same chunk of memory over and over again just isn't
> great.

OK, I'll take a look at t

Re: PoC: prefetching index leaf pages (for inserts)

2023-11-23 Thread Heikki Linnakangas

On 06/11/2023 19:05, Tomas Vondra wrote:

As for the path forward, I think the prefetching is demonstrably
beneficial. There are cases where it can't help or even harms
performance. I think the success depends on three areas:

(a) reducing the costs of the prefetching - For example right now we
build the index tuples twice (once for prefetch, once for the insert),
but maybe there's a way to do that only once? There are also predicate
indexes, and so on.

(b) being smarter about when to prefetch - For example if we only have
one "prefetchable" index, it's somewhat pointless to prefetch (for
single-row cases). And so on.

(c) not prefetching when already cached - This is somewhat related to
the previous case, but perhaps it'd be cheaper to first check if the
data is already cached. For shared buffers it should not be difficult,
for page cache we could use preadv2 with RWF_NOWAIT flag. The question
is if this is cheap enough to be cheaper than just doing posix_fadvise
(which however only deals with shared buffers).


I don't like this approach. It duplicates the tree-descend code, and it 
also duplicates the work of descending the tree at runtime. And it only 
addresses index insertion; there are a lot of places that could benefit 
from prefetching or async execution like this.


I think we should think of this as async execution rather than 
prefetching. We don't have the general infrastructure for writing async 
code, but if we did, this would be much simpler. In an async programming 
model, like you have in many other languages like Rust, python or 
javascript, there would be no separate prefetching function. Instead, 
aminsert() would return a future that can pause execution if it needs to 
do I/O. Something like this:


aminsert_futures = NIL;
/* create a future for each index insert */
for (...)
{
    aminsert_futures = lappend(aminsert_futures, aminsert(...));
}
/* wait for all the futures to finish */
await aminsert_futures;

The async-aware aminsert function would run to completion quickly if all 
the pages are already in cache. If you get a cache miss, it would start 
an async I/O read for the page, and yield to the other insertions until 
the I/O completes.


We already support async execution of FDWs now, with the 
ForeignAsyncRequest() and ForeignAsyncConfigureWait() callbacks. Can we 
generalize that?


--
Heikki Linnakangas
Neon (https://neon.tech)





Re: Catalog domain not-null constraints

2023-11-23 Thread Aleksander Alekseev
Hi,

> This patch set applies the explicit catalog representation of not-null
> constraints introduced by b0e96f3119 for table constraints also to
> domain not-null constraints.

Interestingly enough, according to the documentation this syntax is
already supported [1][2], but the actual query fails on `master`:

```
=# create domain connotnull integer;
CREATE DOMAIN
=# alter domain connotnull add not null value;
ERROR:  unrecognized constraint subtype: 1
```

I wonder if we should reflect this limitation in the documentation
and/or show better error messages. This could be quite surprising to
the user. However if we change the documentation on the `master`
branch this patch will have to change it back.

I was curious about the semantic difference between `SET NOT NULL` and
`ADD NOT NULL value`. When I wanted to figure this out I discovered
something that seems to be a bug:

```
=# create domain connotnull1 integer;
=# create domain connotnull2 integer;
=# alter domain connotnull1 add not null value;
=# alter domain connotnull2 set not null;
=# \dD
ERROR:  unexpected null value in cached tuple for catalog
pg_constraint column conkey
```

Also, it turned out that I can do both `SET NOT NULL` and `ADD NOT
NULL value` for the same domain. Is that intended behavior? We should
either forbid it or cover this case with a test.

NOT VALID is not supported:

```
=# alter domain connotnull add not null value not valid;
ERROR:  NOT NULL constraints cannot be marked NOT VALID
```

... and this is correct: "NOT VALID is only accepted for CHECK
constraints" [1]. This code path however doesn't seem to be
test-covered even on `master`. While on it, I suggest fixing this.

[1]: https://www.postgresql.org/docs/current/sql-alterdomain.html
[2]: https://www.postgresql.org/docs/current/sql-createdomain.html

-- 
Best regards,
Aleksander Alekseev




Re: Parallel CREATE INDEX for BRIN indexes

2023-11-23 Thread Matthias van de Meent
Hi,

On Wed, 22 Nov 2023 at 20:16, Tomas Vondra
 wrote:
>
> On 11/20/23 20:48, Matthias van de Meent wrote:
>> On Wed, 8 Nov 2023 at 12:03, Tomas Vondra  
>> wrote:
>>>
>>> Hi,
>>>
>>> here's an updated patch, addressing the review comments, and reworking
>>> how the work is divided between the workers & leader etc.
>>>
>>
>> After code-only review, here are some comments:
>>
>>> +++ b/src/backend/access/brin/brin.c
>>> [...]
>>> +/* Magic numbers for parallel state sharing */
>>> +#define PARALLEL_KEY_BRIN_SHARED		UINT64CONST(0xA000000000000001)
>>> +#define PARALLEL_KEY_TUPLESORT			UINT64CONST(0xA000000000000002)
>>
>> These shm keys use the same constants also in use in
>> access/nbtree/nbtsort.c. While this shouldn't be an issue in normal
>> operations, I'd prefer if we didn't actively introduce conflicting
>> identifiers when we still have significant amounts of unused values
>> remaining.
>>
>
> Hmmm. Is there some rule of thumb how to pick these key values?

None that I know of.
There is a warning in various places that define these constants that
they take care to not conflict with plan node's node_id: parallel plan
execution uses plain plan node IDs as keys, and as node_id is
int-sized, any other key value that's created manually of value < 2^32
should be sure that it can't be executed in a parallel backend.
But apart from that one case, I can't find a convention, no.

>>> +#define PARALLEL_KEY_QUERY_TEXT		UINT64CONST(0xA000000000000003)
>>
>> This is the fourth definition of a PARALLEL%_KEY_QUERY_TEXT, the
>> others being in access/nbtree/nbtsort.c (value 0xA000000000000004, one
>> more than brin's), backend/executor/execParallel.c
>> (0xE000000000000008), and PARALLEL_VACUUM_KEY_QUERY_TEXT (0x3) (though
>> I've not checked that their uses are exactly the same, I'd expect at
>> least btree to match mostly, if not fully, 1:1).
>> I think we could probably benefit from a less ad-hoc sharing of query
>> texts. I don't think that needs to happen specifically in this patch,
>> but I think it's something to keep in mind in future efforts.
>>
>
> I'm afraid I don't quite get what you mean by "ad hoc sharing of query
> texts". Are you saying we shouldn't propagate the query text to the
> parallel workers? Why? Or what's the proper solution?

What I mean is that we have several different keys that all look like
they contain the debug query string, and always for the same debugging
purposes. For debugging, I think it'd be useful to use one well-known
key, rather than N well-known keys in each of the N parallel
subsystems.

But as mentioned, it doesn't need to happen in this patch, as that'd
increase scope beyond brin/index ams.

>>> +state->bs_numtuples = brinshared->indtuples;
>>
>> My IDE complains about bs_numtuples being an integer. This is a
>> pre-existing issue, but still valid: we can hit an overflow on tables
>> with pages_per_range=1 and relsize >= 2^31 pages. Extremely unlikely,
>> but problematic nonetheless.
>>
>
> True. I think I've been hesitant to make this a double because it seems
> a bit weird to do +1 with a double, and at some point (d == d+1). But
> this seems safe, we're guaranteed to be far away from that threshold.

Yes, ignoring practical constraints like page space, we "only" have
bitspace for 2^48 tuples in each (non-partitioned) relation, so
double's 53 significant bits should be more than enough to count
tuples.

>> I also noticed that this is likely to execute `union_tuples` many
>> times when pages_per_range is coprime with the parallel table scan's
>> block stride (or when we for other reasons have many tuples returned
>> for each range); and this `union_tuples` internally allocates and
>> deletes its own memory context for its deserialization of the 'b'
>> tuple. I think we should just pass a scratch context instead, so that
>> we don't have the overhead of continuously creating then deleting the
>> same memory context
>
> Perhaps. Looking at the code, isn't it a bit strange how union_tuples
> uses the context? It creates the context, calls brin_deform_tuple in
> that context, but then the rest of the function (including datumCopy and
> similar stuff) happens in the caller's context ...

The union operator may leak (lots of) memory, so I think it makes
sense to keep a context around that can be reset after we've extracted
the merge result.

> However, I don't think the number of union_tuples calls is likely to be
> very high, especially for large tables. Because we split the table into
> 2048 chunks, and then cap the chunk size by 8192. For large tables
> (where this matters) we're likely close to 8192.

I agree that the merging part of the index creation is the last part,
and usually has no high impact on the total performance of the reindex
operation, but in memory-constrained environments releasing and then
requesting the same chunk of memory over and over again just isn't
great.
Also note that parallel scan chunk sizes decrease when we're ab

Re: PL/pgSQL: Incomplete item Allow handling of %TYPE arrays, e.g. tab.col%TYPE[]

2023-11-23 Thread Quan Zongliang



On 2023/11/20 17:33, Pavel Stehule wrote:




I did some deeper check:

- I don't like too much parser's modification (I am sending alternative 
own implementation) - the SQL parser allows richer syntax, and for full 
functionality is only few lines more

Agree.



- original patch doesn't solve %ROWTYPE

(2023-11-20 10:04:36) postgres=# select * from foo;
┌────┬────┐
│ a  │ b  │
╞════╪════╡
│ 10 │ 20 │
│ 30 │ 40 │
└────┴────┘
(2 rows)

(2023-11-20 10:08:29) postgres=# do $$
declare v foo%rowtype[];
begin
   v := array(select row(a,b) from foo);
   raise notice '%', v;
end;
$$;
NOTICE:  {"(10,20)","(30,40)"}
DO


two little fixes
1. spelling mistake
  ARRAY [ icons ] --> ARRAY [ iconst ]
2. code bug
  if (!OidIsValid(dtype->typoid)) --> if (!OidIsValid(array_typeid))



- original patch doesn't solve type RECORD
the error message should be more intuitive, although the arrays of 
record type can be supported, but it probably needs bigger research.


(2023-11-20 10:10:34) postgres=# do $$
declare r record; v r%type[];
begin
   v := array(select row(a,b) from foo);
   raise notice '%', v;
end;
$$;
ERROR:  syntax error at or near "%"
LINE 2: declare r record; v r%type[];
                              ^
CONTEXT:  invalid type name "r%type[]"


Currently only scalar variables are supported.
This error is consistent with the error for r%type itself, and record
arrays are not currently supported either.
Support for r%type should be considered first. For now, r%type[] reports
the same error as record[].

I would prefer to implement that in a separate patch.


- missing documentation
My English is not good. I wrote it down, please correct it. Add a note 
in the "Record Types" documentation that arrays and "Copying Types" are 
not supported yet.




- I don't like using the word "partitioned" in the regress test name 
"partitioned_table". It is confusing

fixed



Regards

Pavel

diff --git a/doc/src/sgml/plpgsql.sgml b/doc/src/sgml/plpgsql.sgml
index 5977534a62..ad02c9f561 100644
--- a/doc/src/sgml/plpgsql.sgml
+++ b/doc/src/sgml/plpgsql.sgml
@@ -766,6 +766,30 @@ SELECT merge_fields(t.*) FROM table1 t WHERE ... ;

   
 
+  
+   Arrays of Copying Types and Row Types
+
+
+name variable%TYPE[];
+name table_name%ROWTYPE[];
+
+
+   
+Arrays of Copying Types and Row Types are defined by appending square brackets
+([]) to the %TYPE or %ROWTYPE.
+Their definition is similar to PostgreSQL's array types. It is possible to
+specify the exact size of the array. The keyword ARRAY can also be used.
+For example:
+
+user_id users.user_id%TYPE[4][2];
+user_id users.user_id%ROWTYPE ARRAY[4][];
+
+However, the current implementation ignores any supplied array size limits,
+i.e., the behavior is the same as for arrays of unspecified length.
+The current implementation does not enforce the declared number of
+dimensions either.
+   
+  
+
   
Record Types
 
@@ -794,6 +818,11 @@ SELECT merge_fields(t.*) FROM table1 t WHERE ... ;
 calling query is parsed, whereas a record variable can change its row
 structure on-the-fly.

+
+   
+RECORD does not support being defined as an array.
+"Copying Types" as shown in  is 
also not supported.
+   
   
 
   
diff --git a/src/pl/plpgsql/src/pl_comp.c b/src/pl/plpgsql/src/pl_comp.c
index a341cde2c1..a9cb15df6d 100644
--- a/src/pl/plpgsql/src/pl_comp.c
+++ b/src/pl/plpgsql/src/pl_comp.c
@@ -2095,6 +2095,29 @@ plpgsql_build_datatype(Oid typeOid, int32 typmod,
return typ;
 }
 
+/*
+ * Returns an array for type specified as argument.
+ */
+PLpgSQL_type *
+plpgsql_datatype_arrayof(PLpgSQL_type *dtype)
+{
+   Oid array_typeid;
+
+   if (dtype->typisarray)
+   return dtype;
+
+   array_typeid = get_array_type(dtype->typoid);
+
+   if (!OidIsValid(array_typeid))
+		ereport(ERROR,
+				(errcode(ERRCODE_UNDEFINED_OBJECT),
+				 errmsg("could not find array type for data type \"%s\"",
+						format_type_be(dtype->typoid))));
+
+	return plpgsql_build_datatype(array_typeid, dtype->atttypmod,
+								  dtype->collation, NULL);
+}
+
 /*
  * Utility subroutine to make a PLpgSQL_type struct given a pg_type entry
  * and additional details (see comments for plpgsql_build_datatype).
diff --git a/src/pl/plpgsql/src/pl_gram.y b/src/pl/plpgsql/src/pl_gram.y
index 6a09bfdd67..7778bffefc 100644
--- a/src/pl/plpgsql/src/pl_gram.y
+++ b/src/pl/plpgsql/src/pl_gram.y
@@ -2789,7 +2789,7 @@ read_datatype(int tok)
StringInfoData ds;
char   *type_name;
int startlocation;
-   PLpgSQL_type *result;
+   PLpgSQL_type *result = NULL;
int parenlevel = 0;
 
/* Should only be called while parsing DECLARE sections */
@@ -2817,15 +2817,11 @@ read_datatype(int tok)
 

RE: Random pg_upgrade test failure on drongo

2023-11-23 Thread Hayato Kuroda (Fujitsu)
Dear Alexander,

> 
> I can easily reproduce this failure on my workstation by running 5 tests
> 003_logical_slots in parallel inside Windows VM with it's CPU resources
> limited to 50%, like so:
> VBoxManage controlvm "Windows" cpuexecutioncap 50
> 
> set PGCTLTIMEOUT=180
> python3 -c "NUMITERATIONS=20;NUMTESTS=5;import os;tsts='';exec('for i in
> range(1,NUMTESTS+1):
> tsts+=f\"pg_upgrade_{i}/003_logical_slots \"'); exec('for i in
> range(1,NUMITERATIONS+1):print(f\"iteration {i}\");
> assert(os.system(f\"meson test --num-processes {NUMTESTS} {tsts}\") == 0)')"
> ...
> iteration 2
> ninja: Entering directory `C:\src\postgresql\build'
> ninja: no work to do.
> 1/5 postgresql:pg_upgrade_2 / pg_upgrade_2/003_logical_slots
> ERROR60.30s   exit status 25
> ...
> pg_restore: error: could not execute query: ERROR:  could not create file
> "base/1/2683": File exists
> ...

Great. I do not have such an environment, so I could not reproduce it. This
seems to suggest that the failure occurred because the system was busy.

> I agree with your analysis and would like to propose a PoC fix (see
> attached). With this patch applied, 20 iterations succeeded for me.

Thanks, here are some comments. I'm not quite sure about Windows, so I may say
something wrong.

* I'm not sure why the file/directory name is changed before doing an unlink.
  Could you add a description?
* IIUC, the important point is the latter part, which waits until the status
  changes. Based on that, can we remove the double rmtree() from
  cleanup_output_dirs()? It seems to have been added for a similar motivation.

```
+   loops = 0;
+   while (lstat(curpath, &st) < 0 && 
lstat_error_was_status_delete_pending())
+   {
+   if (++loops > 100)  /* time out after 10 sec */
+   return -1;
+   pg_usleep(10);  /* us */
+   }
```

Best Regards,
Hayato Kuroda
FUJITSU LIMITED


Re: WaitEventSet resource leakage

2023-11-23 Thread Heikki Linnakangas

On 22/11/2023 15:00, Alexander Lakhin wrote:

I can also confirm that the patches proposed (for master and back branches)
eliminate WES leakage as expected.

Thanks for the fix!

Maybe you would find appropriate to add the comment
/* Convenience wrappers over ResourceOwnerRemember/Forget */
above ResourceOwnerRememberWaitEventSet
just as it's added above ResourceOwnerRememberRelationRef,
ResourceOwnerRememberDSM, ResourceOwnerRememberFile, ...


Added that and fixed the Windows warning that Thomas pointed out. Pushed 
the ResourceOwner version to master, and PG_TRY-CATCH version to 14-16.


Thank you!


(As a side note, this fix doesn't resolve issue #17828 completely,
because that large a number of handles might also be consumed
legitimately.)


:-(

--
Heikki Linnakangas
Neon (https://neon.tech)





Re: [PATCH] psql: Add tab-complete for optional view parameters

2023-11-23 Thread Dean Rasheed
On Mon, 14 Aug 2023 at 18:34, David Zhang  wrote:
>
> it would be great to switch the order of the 3rd and the 4th line to make a
> better match for "CREATE" and "CREATE OR REPLACE".
>

I took a look at this, and I think it's probably neater to keep the
"AS SELECT" completion for CREATE [OR REPLACE] VIEW xxx WITH (*)
separate from the already existing support for "AS SELECT" without
WITH.

A couple of other points:

1. It looks slightly neater, and works better, to complete one word at
a time -- e.g., "WITH" then "(", instead of "WITH (", since the latter
doesn't work if the user has already typed "WITH".

2. It should also complete with "=" after the option, where appropriate.

3. CREATE VIEW should offer "local" and "cascaded" after
"check_option" (though there's no point in doing likewise for the
boolean options, since they default to true, if present, and false
otherwise).

Attached is an updated patch, incorporating those comments.

Barring any further comments, I think this is ready for commit.

Regards,
Dean
From 2f143cbfe185c723419a8fffcf5cbcd921f08ea7 Mon Sep 17 00:00:00 2001
From: Christoph Heiss 
Date: Mon, 7 Aug 2023 20:37:19 +0200
Subject: [PATCH v4] psql: Add tab completion for view options.

Add support for tab completion of WITH (...) options to CREATE VIEW,
and the corresponding SET/RESET (...) support to ALTER VIEW.

Christoph Heiss, reviewed by Melih Mutlu, Vignesh C, Jim Jones,
Mikhail Gribkov, David Zhang, and me.

Discussion: https://postgr.es/m/a2075c5a-66f9-a564-f038-9ac044b03...@c8h4.io
---
 src/bin/psql/tab-complete.c | 50 ++---
 1 file changed, 46 insertions(+), 4 deletions(-)

diff --git a/src/bin/psql/tab-complete.c b/src/bin/psql/tab-complete.c
index 006e10f5d2..049801186c 100644
--- a/src/bin/psql/tab-complete.c
+++ b/src/bin/psql/tab-complete.c
@@ -1329,6 +1329,13 @@ static const char *const table_storage_parameters[] = {
 	NULL
 };
 
+/* Optional parameters for CREATE VIEW and ALTER VIEW */
+static const char *const view_optional_parameters[] = {
+	"check_option",
+	"security_barrier",
+	"security_invoker",
+	NULL
+};
 
 /* Forward declaration of functions */
 static char **psql_completion(const char *text, int start, int end);
@@ -2216,8 +2223,7 @@ psql_completion(const char *text, int start, int end)
 		COMPLETE_WITH("TO");
 	/* ALTER VIEW  */
 	else if (Matches("ALTER", "VIEW", MatchAny))
-		COMPLETE_WITH("ALTER COLUMN", "OWNER TO", "RENAME",
-	  "SET SCHEMA");
+		COMPLETE_WITH("ALTER COLUMN", "OWNER TO", "RENAME", "RESET", "SET");
 	/* ALTER VIEW xxx RENAME */
 	else if (Matches("ALTER", "VIEW", MatchAny, "RENAME"))
 		COMPLETE_WITH_ATTR_PLUS(prev2_wd, "COLUMN", "TO");
@@ -2233,6 +2239,21 @@ psql_completion(const char *text, int start, int end)
 	/* ALTER VIEW xxx RENAME COLUMN yyy */
 	else if (Matches("ALTER", "VIEW", MatchAny, "RENAME", "COLUMN", MatchAnyExcept("TO")))
 		COMPLETE_WITH("TO");
+	/* ALTER VIEW xxx RESET ( */
+	else if (Matches("ALTER", "VIEW", MatchAny, "RESET"))
+		COMPLETE_WITH("(");
+	/* Complete ALTER VIEW xxx SET with "(" or "SCHEMA" */
+	else if (Matches("ALTER", "VIEW", MatchAny, "SET"))
+		COMPLETE_WITH("(", "SCHEMA");
+	/* ALTER VIEW xxx SET|RESET ( yyy [= zzz] ) */
+	else if (Matches("ALTER", "VIEW", MatchAny, "SET|RESET", "("))
+		COMPLETE_WITH_LIST(view_optional_parameters);
+	else if (Matches("ALTER", "VIEW", MatchAny, "SET", "(", MatchAny))
+		COMPLETE_WITH("=");
+	else if (Matches("ALTER", "VIEW", MatchAny, "SET", "(", "check_option", "="))
+		COMPLETE_WITH("local", "cascaded");
+	else if (Matches("ALTER", "VIEW", MatchAny, "SET", "(", "security_barrier|security_invoker", "="))
+		COMPLETE_WITH("true", "false");
 
 	/* ALTER MATERIALIZED VIEW  */
 	else if (Matches("ALTER", "MATERIALIZED", "VIEW", MatchAny))
@@ -3531,14 +3552,35 @@ psql_completion(const char *text, int start, int end)
 	}
 
 /* CREATE VIEW --- is allowed inside CREATE SCHEMA, so use TailMatches */
-	/* Complete CREATE [ OR REPLACE ] VIEW  with AS */
+	/* Complete CREATE [ OR REPLACE ] VIEW  with AS or WITH */
 	else if (TailMatches("CREATE", "VIEW", MatchAny) ||
 			 TailMatches("CREATE", "OR", "REPLACE", "VIEW", MatchAny))
-		COMPLETE_WITH("AS");
+		COMPLETE_WITH("AS", "WITH");
 	/* Complete "CREATE [ OR REPLACE ] VIEW  AS with "SELECT" */
 	else if (TailMatches("CREATE", "VIEW", MatchAny, "AS") ||
 			 TailMatches("CREATE", "OR", "REPLACE", "VIEW", MatchAny, "AS"))
 		COMPLETE_WITH("SELECT");
+	/* CREATE [ OR REPLACE ] VIEW  WITH ( yyy [= zzz] ) */
+	else if (TailMatches("CREATE", "VIEW", MatchAny, "WITH") ||
+			 TailMatches("CREATE", "OR", "REPLACE", "VIEW", MatchAny, "WITH"))
+		COMPLETE_WITH("(");
+	else if (TailMatches("CREATE", "VIEW", MatchAny, "WITH", "(") ||
+			 TailMatches("CREATE", "OR", "REPLACE", "VIEW", MatchAny, "WITH", "("))
+		COMPLETE_WITH_LIST(view_optional_parameters);
+	else if (TailMatches("CREATE", "VIEW", MatchAny, "WITH", "(", "check_option") ||
+			 TailMatches("CREATE", "OR", "REPLACE", "VIEW", MatchAny, 

Re: Random pg_upgrade test failure on drongo

2023-11-23 Thread Alexander Lakhin

Hello Kuroda-san,

21.11.2023 13:37, Hayato Kuroda (Fujitsu) wrote:

This email tells an update. The machine drongo failed the test a week ago [1]
and finally got logfiles. PSA files.

Oh, sorry. I forgot to attach the files. You can see pg_upgrade_server.log for now.



I can easily reproduce this failure on my workstation by running 5 tests
003_logical_slots in parallel inside Windows VM with it's CPU resources
limited to 50%, like so:
VBoxManage controlvm "Windows" cpuexecutioncap 50

set PGCTLTIMEOUT=180
python3 -c "NUMITERATIONS=20;NUMTESTS=5;import os;tsts='';exec('for i in range(1,NUMTESTS+1): 
tsts+=f\"pg_upgrade_{i}/003_logical_slots \"'); exec('for i in range(1,NUMITERATIONS+1):print(f\"iteration {i}\"); 
assert(os.system(f\"meson test --num-processes {NUMTESTS} {tsts}\") == 0)')"

...
iteration 2
ninja: Entering directory `C:\src\postgresql\build'
ninja: no work to do.
1/5 postgresql:pg_upgrade_2 / pg_upgrade_2/003_logical_slots ERROR   60.30s   exit status 25
...
pg_restore: error: could not execute query: ERROR:  could not create file 
"base/1/2683": File exists
...

I agree with your analysis and would like to propose a PoC fix (see
attached). With this patch applied, 20 iterations succeeded for me.

Best regards,
Alexanderdiff --git a/src/port/dirmod.c b/src/port/dirmod.c
index 07dd190cbc..5185648388 100644
--- a/src/port/dirmod.c
+++ b/src/port/dirmod.c
@@ -121,6 +121,8 @@ pgunlink(const char *path)
 	bool		is_lnk;
 	int			loops = 0;
 	struct stat st;
+	char		tmppath[MAX_PATH];
+	char	   *curpath = path;
 
 	/*
 	 * This function might be called for a regular file or for a junction
@@ -129,9 +131,14 @@ pgunlink(const char *path)
 	 * if we can unlink directly, since that's expected to be the most common
 	 * case.
 	 */
-	if (unlink(path) == 0)
-		return 0;
-	if (errno != EACCES)
+	snprintf(tmppath, sizeof(tmppath), "%s.tmp", path);
+	if (pgrename(path, tmppath) == 0)
+	{
+		if (unlink(tmppath) == 0)
+			return 0;
+		curpath = tmppath;
+	}
+	else if (errno != EACCES)
 		return -1;
 
 	/*
@@ -150,7 +157,7 @@ pgunlink(const char *path)
 	 * fail. We want to wait until the file truly goes away so that simple
 	 * recursive directory unlink algorithms work.
 	 */
-	if (lstat(path, &st) < 0)
+	if (lstat(curpath, &st) < 0)
 	{
 		if (lstat_error_was_status_delete_pending())
 			is_lnk = false;
@@ -167,7 +174,7 @@ pgunlink(const char *path)
 	 * someone else to close the file, as the caller might be holding locks
 	 * and blocking other backends.
 	 */
-	while ((is_lnk ? rmdir(path) : unlink(path)) < 0)
+	while ((is_lnk ? rmdir(curpath) : unlink(curpath)) < 0)
 	{
 		if (errno != EACCES)
 			return -1;
@@ -175,6 +182,14 @@ pgunlink(const char *path)
 			return -1;
 		pg_usleep(10);		/* us */
 	}
+
+	loops = 0;
+	while (lstat(curpath, &st) < 0 && lstat_error_was_status_delete_pending())
+	{
+		if (++loops > 100)		/* time out after 10 sec */
+			return -1;
+		pg_usleep(10);		/* us */
+	}
 	return 0;
 }
 


Re: Question about the Implementation of vector32_is_highbit_set on ARM

2023-11-23 Thread John Naylor
On Thu, Nov 23, 2023 at 4:29 PM Xiang Gao  wrote:
>
> Thank you for your detailed explanation.
> Can I do some testing and submit this patch?

Please do, thanks.




Re: autovectorize page checksum code included elsewhere

2023-11-23 Thread John Naylor
On Thu, Nov 23, 2023 at 1:49 AM Nathan Bossart  wrote:
>
> On Wed, Nov 22, 2023 at 02:54:13PM +0200, Ants Aasma wrote:
> > On Wed, 22 Nov 2023 at 11:44, John Naylor  wrote:
> >> Poking in those files a bit, I also see references to building with
> >> SSE 4.1. Maybe that's an avenue that we should pursue? (an indirect
> >> function call is surely worth it for page-sized data)
>
> Yes, I think we should, but we also need to be careful not to hurt
> performance on platforms that aren't able to benefit [0] [1].

Well, yes (see popcount using a direct function call on non-x86), but
I don't think it's as important for page-sized data. Also, sse4.1 is
~10 years old, I think.

> There are a couple of other threads about adding support for newer
> instructions [2] [3], and properly detecting the availability of these
> instructions seems to be a common obstacle.  We have a path forward for
> stuff that's already using a runtime check (e.g., CRC32C), but I think
> we're still trying to figure out what to do for things that must be inlined
> (e.g., simd.h).
>
> One half-formed idea I have is to introduce some sort of ./configure flag
> that enables all the newer instructions that your CPU understands.

That's not doable, but we should consider taking advantage of
x86-64-v2, which RedHat 9 builds with. That would allow inlining CRC
and popcount there. Not sure how to detect that easily.
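
One compile-time option, sketched here as an assumption rather than a
worked-out proposal: lean on the predefined target macros that GCC and
Clang set when the baseline already includes the instruction, e.g.
__POPCNT__ for -march=x86-64-v2 and above (the pg_popcount64 naming
below is illustrative):

```
#include <stdint.h>

#if defined(__POPCNT__)			/* defined by gcc/clang for x86-64-v2+ */
static inline int
pg_popcount64(uint64_t x)
{
	return __builtin_popcountll(x);	/* compiles to a single POPCNT */
}
#else
/* fall back to the existing runtime-dispatched implementation */
extern int pg_popcount64_slow(uint64_t x);
#define pg_popcount64(x) pg_popcount64_slow(x)
#endif
```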




Re: remaining sql/json patches

2023-11-23 Thread jian he
+/*
+ * Evaluate or return the step address to evaluate a coercion of a JSON item
+ * to the target type.  The former if the coercion must be done right away by
+ * calling the target type's input function, and for some types, by calling
+ * json_populate_type().
+ *
+ * Returns the step address to be performed next.
+ */
+void
+ExecEvalJsonCoercionViaPopulateOrIO(ExprState *state, ExprEvalStep *op,
+ ExprContext *econtext)

The comment seems not right: the function doesn't return anything; it just does the evaluation.

Some logic in ExecEvalJsonCoercionViaPopulateOrIO, like the if
(SOFT_ERROR_OCCURRED(escontext_p)) and if (!InputFunctionCallSafe)
{...} branches, seems to be validated twice;
ExecEvalJsonCoercionFinish also does it. I commented out the following
part, and the tests still passed.
/src/backend/executor/execExprInterp.c
4452: // if (SOFT_ERROR_OCCURRED(escontext_p))
4453: // {
4454: // post_eval->error.value = BoolGetDatum(true);
4455: // *op->resvalue = (Datum) 0;
4456: // *op->resnull = true;
4457: // }

4470: // post_eval->error.value = BoolGetDatum(true);
4471: // *op->resnull = true;
4472: // *op->resvalue = (Datum) 0;
4473: return;

Correct me if I'm wrong.
like in "empty array on empty empty object on error", the "empty
array" refers to constant literal '[]' the assumed data type is jsonb,
the "empty object" refers to const literal '{}', the assumed data type
is jsonb.

--these two queries will fail very early, before ExecEvalJsonExprPath.
SELECT JSON_QUERY(jsonb '{"a":[3,4]}', '$.a' RETURNING int4range
default '[1.1,2]' on error);
SELECT JSON_QUERY(jsonb '{"a":[3,4]}', '$.a' RETURNING int4range
default '[1.1,2]' on empty);

-these four will fail later, and will call
ExecEvalJsonCoercionViaPopulateOrIO twice.
SELECT JSON_QUERY(jsonb '{"a":[3,4]}', '$.z' RETURNING int4range empty
object on empty empty object on error);
SELECT JSON_QUERY(jsonb '{"a":[3,4]}', '$.z' RETURNING int4range empty
array on empty empty array on error);
SELECT JSON_QUERY(jsonb '{"a":[3,4]}', '$.z' RETURNING int4range empty
array on empty empty object on error);
SELECT JSON_QUERY(jsonb '{"a":[3,4]}', '$.z' RETURNING int4range empty
object on empty empty array on error);

-however these four will not fail.
SELECT JSON_QUERY(jsonb '{"a":[3,4]}', '$.z' RETURNING int4range empty
object on error);
SELECT JSON_QUERY(jsonb '{"a":[3,4]}', '$.z' RETURNING int4range empty
array on error);
SELECT JSON_QUERY(jsonb '{"a":[3,4]}', '$.z' RETURNING int4range empty
array on empty);
SELECT JSON_QUERY(jsonb '{"a":[3,4]}', '$.z' RETURNING int4range empty
object on empty);

should the last four query fail or just return null?




Re: [Proposal] global sequence implemented by snowflake ID

2023-11-23 Thread Michael Paquier
On Thu, Nov 23, 2023 at 10:18:59AM +, Hayato Kuroda (Fujitsu) wrote:
> * Implement as a variant of sequence access method. I found that sequence AM 
> was
>   proposed many years ago [5], but it has not been active now. It might be a
>   fundamental way but needs a huge works.

Well, that's what I can call a timely proposal.  I've been working
this week on a design for sequence AMs, while considering the cases
that the original thread wanted to handle (spoiler: there are a lot of
pieces in the original patch that are not necessary, other parts are
incorrect like dump/restore), what you are trying to do here, and more
complex scenarios in terms of globally-distributed sequences.  My plan
was to send that next week or the week after, in time for January's
CF.
--
Michael




Re: Synchronizing slots from primary to standby

2023-11-23 Thread Amit Kapila
On Wed, Nov 22, 2023 at 10:02 AM Zhijie Hou (Fujitsu)
 wrote:
>
> On Tuesday, November 21, 2023 5:33 PM shveta malik  
> wrote:
> >
> >
> > v37 fails to apply to HEAD due to a recent commit e83aa9f92fdd, rebased the
> > patches.  PFA v37_2 patches.
>
> Thanks for updating the patches.
>
> I'd like to discuss one issue related to the correct handling of the failover
> flag when executing "ALTER SUBSCRIPTION SET (slot_name = 'new_slot')".
>
> Since the command intends to use a new slot on the primary, the new slot needs
> to reflect the "failover" state that the subscription currently has. If the
> failoverstate of the Subscription is LOGICALREP_FAILOVER_STATE_ENABLED, then I
> can reset it to LOGICALREP_FAILOVER_STATE_PENDING and allow the apply worker 
> to
> handle it the way it is handled today (just like two_phase handling).
>
> But if the failoverstate is LOGICALREP_FAILOVER_STATE_DISABLED, the original
> idea is to call walrcv_alter_slot and alter the slot from the "ALTER
> SUBSCRIPTION" handling backend itself. This works if the slot is currently
> disabled. But the " ALTER SUBSCRIPTION SET (slot_name = 'new_slot')" command 
> is
> supported even if the subscription is enabled. If the subscription is enabled,
> then calling walrcv_alter_slot() fails because the slot is still acquired by
> apply worker.
>
> So, I am thinking do we need a new mechanism to change the failover flag to
> false on an enabled subscription ? For example, we could call 
> walrcv_alter_slot
> on startup of apply worker if AllTablesyncsReady(), for both true and false
> values of failover flag. This way, every time apply worker is started, it 
> calls
> walrcv_alter_slot to set the failover flag on the primary.
>

I think for the false case, we need to execute walrcv_alter_slot()
every time at the start of apply worker and it doesn't sound like an
ideal way to achieve it.

> Or we could just document that it is user's responsibility to match the 
> failover
> property in case it changes the slot_name.
>

Personally, I think we should document this behavior instead of
complicating the patch and the user anyway has a way to achieve it.

-- 
With Regards,
Amit Kapila.




Re: Change GUC hashtable to use simplehash?

2023-11-23 Thread John Naylor
On Thu, Nov 23, 2023 at 5:34 AM Andres Freund  wrote:

> It's worth noting that the limited range of the input values means that
> there's a lot of bias toward some bits being set ('a' to 'z' all start with
> 0b011).

We can take advantage of the limited range with a single additional
instruction: After "ch |= 0x20", do "ch -= ('a' - 1)". That'll shrink
letters and underscores to the range [1,31], which fits in 5 bits.
(Other characters are much less common in a guc name). That increases
randomness and allows 12 chars to be xor'd in before the first bits
rotate around.
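
A minimal sketch of that narrowing step, assuming a 64-bit
rotate-and-xor accumulator (the loop shape is illustrative, not the
actual GUC hash):

```
#include <stdint.h>

static inline uint64_t
rotl64(uint64_t x, int r)
{
	return (x << r) | (x >> (64 - r));
}

static uint64_t
hash_guc_name(const char *name)
{
	uint64_t	h = 0;

	for (; *name; name++)
	{
		uint64_t	ch = (unsigned char) *name;

		ch |= 0x20;				/* fold ASCII upper case to lower case */
		ch -= 'a' - 1;			/* 'a'..'z' -> 1..26, '_' -> 31: 5 bits */
		h = rotl64(h, 5) ^ ch;	/* 12 chars fit before bits wrap around */
	}
	return h;					/* feed into a finalizer, e.g. murmurhash64 */
}
```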

> If, which I mildly doubt, we can't afford to call murmurhash32 for every
> character, we could just call it for 32/5 input characters together.  Or we
> could just load up to 8 characters into an 64bit integer, can call
> murmurhash64.

I'll play around with this idea, as well.

> The fact that string_hash() is slow due to the strlen(), which causes us to
> process the input twice and which is optimized to also handle very long
> strings which typically string_hash() doesn't encounter, seems problematic far
> beyond this case. We use string_hash() in a *lot* of places, and that strlen()
> does regularly show up in profiles. We should fix that.

+1

> I think we ought to adjust our APIs around this:

> 1) The accumulator state of the hash functions should be exposed, so one can
>accumulate values into the hash state, without reducing the internal state
>to a single 32/64 bit variable.

If so, it might make sense to vendor a small, suitably licensed hash
function that already has these APIs.

While on the subject, it'd be good to have a clear separation between
in-memory and on-disk usage, so we can make breaking changes in the
former.




[Proposal] global sequence implemented by snowflake ID

2023-11-23 Thread Hayato Kuroda (Fujitsu)
Hi hackers,

I want to discuss a new feature for assigning a snowflake ID[1], which can be
cluster-wide unique numbers. Also, Snowflake ID can be allocated standalone.

# Use case

A typical use case is a multi-master system constructed by logical replication.
This feature allows multi-node system to use GENERATED values. IIUC, this is
desired in another thread [2].

When postgres is standalone, a sequence is quite often used as the default
value of the primary key. However, this cannot be done as-is on a multi-master
system, because a value generated on nodeA might already be in use on nodeB.
Logical decoding of sequences partially solves the issue, but it is not
sufficient - what about the case of asynchronous replication? Managing chunks
of values is worse.

# What is the format of a Snowflake ID?

A Snowflake ID has the following form:

[1 bit - unused] + [41-bit millisecond timestamp] + [10-bit machine ID] + [12-bit local sequence number]

The millisecond timestamp represents the time when the number is allocated,
i.e., the time nextval() is called. Using UNIX time seems the easiest way.

The machine ID can be an arbitrary number, but it is recommended that it be
unique in the system. A duplicated machine ID might trigger conflicts.
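
As a rough illustration of the composition (the helper names and the
choice of epoch are assumptions, not part of an actual patch):

```
#include <stdint.h>
#include <sys/time.h>

#define SNOWFLAKE_MACHINE_BITS	10
#define SNOWFLAKE_SEQ_BITS		12

/* milliseconds since some fixed custom epoch; 41 bits last ~69 years */
static int64_t
snowflake_now_ms(void)
{
	struct timeval tv;

	gettimeofday(&tv, NULL);
	return (int64_t) tv.tv_sec * 1000 + tv.tv_usec / 1000;
}

static int64_t
snowflake_compose(int64_t epoch_ms, uint32_t machine_id, uint32_t seq)
{
	return (epoch_ms << (SNOWFLAKE_MACHINE_BITS + SNOWFLAKE_SEQ_BITS)) |
		((int64_t) (machine_id & ((1 << SNOWFLAKE_MACHINE_BITS) - 1))
		 << SNOWFLAKE_SEQ_BITS) |
		(seq & ((1 << SNOWFLAKE_SEQ_BITS) - 1));
}
```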

## Characteristics of Snowflake IDs

Snowflake IDs can be generated as unique numbers standalone. According to the
old discussion, allocating value spaces to each node was considered [3], but
that requires communicating with other nodes, which brings extra difficulties
(e.g., which protocol would be used?).

Also, Snowflake IDs are roughly time-ordered. As Andres pointed out in the old
discussions [4], large indexes over random values perform worse; Snowflake IDs
avoid that situation.

Moreover, Snowflake IDs are 64-bit integers, shorter than UUIDs (128-bit).

# Implementation

There are several approaches for implementing a snowflake ID. For example:

* Implement it as a contrib module. The features needed for each component of
  a snowflake ID have already been implemented in core, so this is basically
  possible.
* Implement it as a variant of sequence access method. I found that a sequence
  AM was proposed many years ago [5], but that work is not active now. It might
  be the fundamental way, but it needs a huge amount of work.

The attached patch adds a minimal contrib module which can be used for testing
my proposal. Below is an example usage.

```
-- Create an extension
postgres=# CREATE EXTENSION snowflake_sequence ;
CREATE EXTENSION
-- Create a sequence which generates snowflake IDs
postgres=# SELECT snowflake_sequence.create_sequence('test_sequence');
 create_sequence 
-----------------
 
(1 row)
-- Get next snowflake ID
postgres=# SELECT snowflake_sequence.nextval('test_sequence');
       nextval       
---------------------
 3162329056562487297
(1 row)
```

How do you think?

[1]: https://github.com/twitter-archive/snowflake/tree/b3f6a3c6ca8e1b6847baa6ff42bf72201e2c2231
[2]: https://www.postgresql.org/message-id/1b25328f-5f4d-9b75-b3f2-f9d9931d1b9d%40postgresql.org
[3]: https://www.postgresql.org/message-id/CA%2BU5nMLSh4fttA4BhAknpCE-iAWgK%2BBG-_wuJS%3DEAcx7hTYn-Q%40mail.gmail.com
[4]: https://www.postgresql.org/message-id/201210161515.54895.andres%402ndquadrant.com
[5]: https://www.postgresql.org/message-id/flat/CA%2BU5nMLV3ccdzbqCvcedd-HfrE4dUmoFmTBPL_uJ9YjsQbR7iQ%40mail.gmail.com

Best Regards,
Hayato Kuroda
FUJITSU LIMITED



0001-initial-commit-for-snowflake_sequence.patch
Description: 0001-initial-commit-for-snowflake_sequence.patch


Re: Synchronizing slots from primary to standby

2023-11-23 Thread Ajin Cherian
On Tue, Nov 21, 2023 at 8:32 PM shveta malik  wrote:
>
> v37 fails to apply to HEAD due to a recent commit e83aa9f92fdd,
> rebased the patches.  PFA v37_2 patches.

Thanks for the patch. Some comments:
subscriptioncmds.c: CreateSubscription()
and tablesync.c: process_syncing_tables_for_apply()

 walrcv_create_slot(wrconn, opts.slot_name, false, twophase_enabled,
-                   CRS_NOEXPORT_SNAPSHOT, NULL);
-
-if (twophase_enabled)
-    UpdateTwoPhaseState(subid, LOGICALREP_TWOPHASE_STATE_ENABLED);
-
+                   failover_enabled, CRS_NOEXPORT_SNAPSHOT, NULL);

Either here or in libpqrcv_create_slot(), shouldn't you check whether the
remote server version supports the failover flag?


+
+/*
+ * If only the slot_name is specified, it is possible that the user
+ * intends to use an existing slot on the publisher, so here we enable
+ * failover for the slot if requested.
+ */
+else if (opts.slot_name && failover_enabled)
+{
+    walrcv_alter_slot(wrconn, opts.slot_name, opts.failover);
+    ereport(NOTICE,
+            (errmsg("enabled failover for replication slot \"%s\" on publisher",
+                    opts.slot_name)));
+}

Here, the code only alters the slot if failover = true. You could use
"else if (opts.slot_name && IsSet(opts.specified_opts, SUBOPT_FAILOVER))"
to check if the failover flag is specified, and alter for failover=false
as well. Also, shouldn't you check whether the server version supports
the ALTER_REPLICATION_SLOT command?

slot.c:
ReplicationSlotAlter()

+void
+ReplicationSlotAlter(const char *name, bool failover)
+{
+Assert(MyReplicationSlot == NULL);
+
+ReplicationSlotAcquire(name, true);
+
+if (SlotIsPhysical(MyReplicationSlot))
+ereport(ERROR,
+errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+errmsg("cannot use %s with a physical replication slot",
+   "ALTER_REPLICATION_SLOT"));

shouldn't you release the slot by calling ReplicationSlotRelease
before erroring out?

slot.c:
+/*
+ * A helper function to validate slots specified in standby_slot_names GUCs.
+ */
+static bool
+validate_standby_slots(char **newval)
+{
+char   *rawname;
+List   *elemlist;
+ListCell   *lc;
+
+/* Need a modifiable copy of string */
+rawname = pstrdup(*newval);

rawname is not always freed.

launcher.c:

+SlotSyncWorker->hdr.proc = MyProc;
+
+before_shmem_exit(slotsync_worker_detach, (Datum) 0);
+
+LWLockRelease(SlotSyncWorkerLock);
+}

before_shmem_exit() can error out leaving the lock acquired. Maybe you
should release the lock prior to calling before_shmem_exit() because
you don't need the lock there.

regards,
Ajin Cherian
Fujitsu Australia




Re: should check collations when creating partitioned index

2023-11-23 Thread Peter Eisentraut

On 20.11.23 17:25, Tom Lane wrote:

Peter Eisentraut  writes:

On 14.11.23 17:15, Tom Lane wrote:

I don't love the patch details though.  It seems entirely wrong to check
this before we check the opclass match.



Not sure why?  The order doesn't seem to matter?


The case that was bothering me was if we had a non-collated type
versus a collated type.  That would result in throwing an error
about collation mismatch, when complaining about the opclass seems
more apropos.  However, if we do this:


I see.  That means we shouldn't raise an error on a mismatch but just do
  if (key->partcollation[i] != collationIds[j])
  continue;


it might not matter much.


Here is an updated patch that works as indicated above.

The behavior if you try to create an index with mismatching collations 
now is that it will skip over the column and complain at the end with 
something like


ERROR:  0A000: unique constraint on partitioned table must include all 
partitioning columns
DETAIL:  UNIQUE constraint on table "t1" lacks column "b" which is part 
of the partition key.


which perhaps isn't intuitive, but I think it would be the same if you 
somehow tried to build an index with different operator classes than the 
partitioning.  I think these less-specific error messages are ok in such 
edge cases.From 869a66c429eb188ceafcbd972b6e46b63fce88f3 Mon Sep 17 00:00:00 2001
From: Peter Eisentraut 
Date: Thu, 23 Nov 2023 10:37:21 +0100
Subject: [PATCH v2] Check collation when creating partitioned index

When creating a partitioned index, the partition key must be a subset
of the index's columns.  But this currently doesn't check that the
collations between the partition key and the index definition match.
So you can construct a unique index that fails to enforce uniqueness.
(This would most likely involve a nondeterministic collation, so it
would have to be crafted explicitly and is not something that would
just happen by accident.)

This patch adds the required collation check.  As a result, any
previously allowed unique index that has a collation mismatch would no
longer be allowed to be created.

Discussion: 
https://www.postgresql.org/message-id/flat/3327cb54-f7f1-413b-8fdb-7a9dceebb938%40eisentraut.org
---
 src/backend/commands/indexcmds.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c
index 0b3b8e98b8..c7ecedbe3b 100644
--- a/src/backend/commands/indexcmds.c
+++ b/src/backend/commands/indexcmds.c
@@ -1011,10 +1011,13 @@ DefineIndex(Oid tableId,
 			{
 				if (key->partattrs[i] == indexInfo->ii_IndexAttrNumbers[j])
 				{
-					/* Matched the column, now what about the equality op? */
+					/* Matched the column, now what about the collation and equality op? */
 					Oid			idx_opfamily;
 					Oid			idx_opcintype;
 
+					if (key->partcollation[i] != collationIds[j])
+						continue;
+
 					if (get_opclass_opfamily_and_input_type(opclassIds[j],
 															&idx_opfamily,
 															&idx_opcintype))
-- 
2.42.1



RE: Question about the Implementation of vector32_is_highbit_set on ARM

2023-11-23 Thread Xiang Gao
On Date: Mon, 20 Nov 2023 16:05:43PM +0700, John Naylor wrote:

>On Wed, Nov 8, 2023 at 2:44 PM Xiang Gao  wrote:
>>  * function. We could instead adopt the behavior of Arm's vmaxvq_u32(), i.e.
>>  * check each 32-bit element, but that would require an additional mask
>>  * operation on x86.
>>  */
>
>> But I still don't understand why the vmaxvq_u32 intrinsic is not used on the arm platform.

>The current use case expects all 1's or all 0's in a 32-bit lane. If
>anyone tried using it for arbitrary values, vmaxvq_u32 could give a
>different answer than on x86 using _mm_movemask_epi8, so I think
>that's the origin of that comment. But it's still a maintenance hazard
>as is, since x86 wouldn't work for arbitrary values. It seems the path
>forward is to rename this function to vector32_is_any_lane_set(), as
>in the attached (untested on Arm). That would allow each
>implementation to use the most efficient path, whether it's by 8- or
>32-bit lanes. If we someday needed to look at only the high bits, we
>would need a new function that performed the necessary masking on x86.
>
>It's possible this method could shave cycles on Arm in some 8-bit lane
>cases where we don't actually care about the high bit specifically,
>since the movemask equivalent is slow on that platform, but I haven't
>looked yet.
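
For reference, a minimal sketch of the two lane-wise checks being
contrasted (standard intrinsics; the function name follows the proposed
renaming):

```
#include <stdbool.h>

#if defined(__aarch64__)
#include <arm_neon.h>

static inline bool
vector32_is_any_lane_set(uint32x4_t v)
{
	return vmaxvq_u32(v) != 0;	/* true iff any 32-bit lane is nonzero */
}
#elif defined(__x86_64__)
#include <emmintrin.h>

static inline bool
vector32_is_any_lane_set(__m128i v)
{
	/*
	 * Tests the high bit of each 8-bit lane, so this agrees with the
	 * NEON version only when lanes hold all-ones or all-zeros.
	 */
	return _mm_movemask_epi8(v) != 0;
}
#endif
```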

Thank you for your detailed explanation.
Can I do some testing and submit this patch?


Re: undetected deadlock in ALTER SUBSCRIPTION ... REFRESH PUBLICATION

2023-11-23 Thread Amit Kapila
On Wed, Nov 22, 2023 at 4:51 PM Tomas Vondra
 wrote:
>
> On 11/22/23 11:38, Amit Kapila wrote:
> >
> > Okay. IIUC, what's going on here is that the apply worker acquires
> > AccessShareLock on pg_subscription to update rel state for one of the
> > tables say tbl-1, and then for another table say tbl-2, it started
> > waiting for a state change via wait_for_relation_state_change(). I
> > think here the fix is to commit the transaction before we go for a
> > wait. I guess we need something along the lines of what is proposed in
> > [1] though we have solved the problem in that thread in some other
> > way..
> >
>
> Possibly. I haven't checked if the commit might have some unexpected
> consequences, but I can confirm I can no longer reproduce the deadlock
> with the patch applied.
>

Thanks for the verification. Offhand, I don't see any problem with
doing a commit at that place but will try to think some more about it.
I think we may want to call pgstat_report_stat(false) after commit to
avoid a long delay in stats.
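
For concreteness, a sketch of what that might look like in
process_syncing_tables_for_apply() (exact placement assumed; the
function and state names are the existing ones in tablesync.c):

```
/*
 * Commit before blocking, so the AccessShareLock on pg_subscription is
 * released, and flush stats so they are not delayed for the wait.
 */
if (started_tx)
{
	CommitTransactionCommand();
	pgstat_report_stat(false);
	started_tx = false;
}

wait_for_relation_state_change(rstate->relid, SUBREL_STATE_SYNCDONE);
```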

I haven't verified but I think this will be a problem in back-branches as well.

-- 
With Regards,
Amit Kapila.




Re: POC, WIP: OR-clause support for indexes

2023-11-23 Thread Andrei Lepikhov

On 21/11/2023 18:31, Alena Rybakina wrote:
Sorry, I lost your changes during the revision process; I have restored
them. I bumped the patch version, just in case, so that CI runs successfully.


I think the usage of nodeToString for the generation of clause hash is 
too expensive and buggy.
Also, in the code, you didn't resolve hash collisions. So, I've 
rewritten the patch a bit (see the attachment).
One more thing: I propose to enable transformation by default at least 
for quick detection of possible issues.
This code changes tests in many places. But, as I see it, it mostly 
demonstrates the positive effect of the transformation.


--
regards,
Andrei Lepikhov
Postgres Professional
From 5071d02426ac3430f4dd61a8ad32c2847ba6f8a5 Mon Sep 17 00:00:00 2001
From: "Andrey V. Lepikhov" 
Date: Thu, 23 Nov 2023 16:00:13 +0700
Subject: [PATCH] Transform OR clause to ANY expressions.

Replace (X=N1) OR (X=N2) ... with X = ANY(N1, N2) in the preliminary stage of
optimization, when we are still working with a tree expression.
Sometimes this can lead to a suboptimal plan. But we think it is better to have
an array of elements instead of a lot of OR clauses. There is room for further
optimization by decomposing that array into more optimal parts.
---
 src/backend/nodes/queryjumblefuncs.c  |  30 ++
 src/backend/parser/parse_expr.c   | 285 +-
 src/backend/utils/misc/guc_tables.c   |  10 +
 src/backend/utils/misc/postgresql.conf.sample |   1 +
 src/include/nodes/queryjumble.h   |   1 +
 src/include/parser/parse_expr.h   |   1 +
 src/test/regress/expected/create_index.out| 141 +++--
 src/test/regress/expected/create_view.out |   2 +-
 src/test/regress/expected/guc.out |   3 +-
 src/test/regress/expected/inherit.out |   2 +-
 src/test/regress/expected/join.out|  62 +++-
 src/test/regress/expected/partition_prune.out | 215 +++--
 src/test/regress/expected/rules.out   |  18 +-
 src/test/regress/expected/stats_ext.out   |  12 +-
 src/test/regress/expected/sysviews.out|   3 +-
 src/test/regress/expected/tidscan.out |  23 +-
 src/test/regress/sql/create_index.sql |  32 ++
 src/test/regress/sql/join.sql |  10 +
 src/test/regress/sql/partition_prune.sql  |  22 ++
 src/test/regress/sql/tidscan.sql  |   6 +
 src/tools/pgindent/typedefs.list  |   2 +
 21 files changed, 815 insertions(+), 66 deletions(-)

diff --git a/src/backend/nodes/queryjumblefuncs.c b/src/backend/nodes/queryjumblefuncs.c
index 281907a4d8..99207a8670 100644
--- a/src/backend/nodes/queryjumblefuncs.c
+++ b/src/backend/nodes/queryjumblefuncs.c
@@ -135,6 +135,36 @@ JumbleQuery(Query *query)
return jstate;
 }
 
+JumbleState *
+JumbleExpr(Expr *expr, uint64 *queryId)
+{
+   JumbleState *jstate = NULL;
+
+   Assert(queryId != NULL);
+
+   jstate = (JumbleState *) palloc(sizeof(JumbleState));
+
+   /* Set up workspace for query jumbling */
+   jstate->jumble = (unsigned char *) palloc(JUMBLE_SIZE);
+   jstate->jumble_len = 0;
+   jstate->clocations_buf_size = 32;
+   jstate->clocations = (LocationLen *)
+   palloc(jstate->clocations_buf_size * sizeof(LocationLen));
+   jstate->clocations_count = 0;
+   jstate->highest_extern_param_id = 0;
+
+   /* Compute query ID */
+   _jumbleNode(jstate, (Node *) expr);
+	*queryId = DatumGetUInt64(hash_any_extended(jstate->jumble,
+												jstate->jumble_len,
+												0));
+
+   if (*queryId == UINT64CONST(0))
+   *queryId = UINT64CONST(1);
+
+   return jstate;
+}
+
 /*
  * Enables query identifier computation.
  *
diff --git a/src/backend/parser/parse_expr.c b/src/backend/parser/parse_expr.c
index 64c582c344..d782642771 100644
--- a/src/backend/parser/parse_expr.c
+++ b/src/backend/parser/parse_expr.c
@@ -22,6 +22,7 @@
 #include "miscadmin.h"
 #include "nodes/makefuncs.h"
 #include "nodes/nodeFuncs.h"
+#include "nodes/queryjumble.h"
 #include "optimizer/optimizer.h"
 #include "parser/analyze.h"
 #include "parser/parse_agg.h"
@@ -43,6 +44,7 @@
 
 /* GUC parameters */
 bool   Transform_null_equals = false;
+bool   enable_or_transformation = true;
 
 
 static Node *transformExprRecurse(ParseState *pstate, Node *expr);
@@ -99,6 +101,287 @@ static Expr *make_distinct_op(ParseState *pstate, List *opname,
 static Node *make_nulltest_from_distinct(ParseState *pstate,
 										 A_Expr *distincta, Node *arg);
 
+typedef struct OrClauseGroupEntry
+{
+   Node   *node;
+   List   *consts;
+   Oid scalar_type;
+   Oid 

Re: psql not responding to SIGINT upon db reconnection

2023-11-23 Thread Heikki Linnakangas

On 22/11/2023 23:29, Tristan Partin wrote:

Ha, you're right. I had this working yesterday, but convinced myself it
didn't. I had a do while loop wrapping the blocking call. Here is a v4,
which seems to pass the tests that were pointed out to be failing
earlier.


Thanks! This suffers from a classic race condition:


+   if (cancel_pressed)
+   break;
+
+   sock = PQsocket(n_conn);
+   if (sock == -1)
+   break;
+
+   rc = pqSocketPoll(sock, for_read, !for_read, (time_t)-1);
+   Assert(rc != 0); /* Timeout is impossible. */
+   if (rc == -1)
+   {
+   success = false;
+   break;
+   }


If a signal arrives between the "if (cancel_pressed)" check and 
pqSocketPoll(), pqSocketPoll() will "miss" the signal and block 
indefinitely. There are three solutions to this:


1. Use a timeout, so that you wake up periodically to check for any 
missed signals. Easy solution, the downsides are that you will not react 
as quickly if the signal is missed, and you waste some cycles by waking 
up unnecessarily.


2. The Self-pipe trick: https://cr.yp.to/docs/selfpipe.html. We also use 
that in src/backend/storage/ipc/latch.c. It's portable, but somewhat 
complicated.


3. Use pselect() or ppoll(). They were created specifically to address 
this problem. Not fully portable, I think it's missing on Windows at least.


Maybe use pselect() if it's available, and select() with a timeout if 
it's not.
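
A minimal sketch of the pselect() variant, with SIGINT blocked except
during the wait so that the flag check and the sleep are atomic (names
and error handling are simplified):

```
#include <errno.h>
#include <signal.h>
#include <stdbool.h>
#include <sys/select.h>

extern volatile sig_atomic_t cancel_pressed;	/* set by the SIGINT handler */

static bool
wait_until_readable(int sock)
{
	sigset_t	block_mask,
				orig_mask;
	fd_set		input_mask;

	sigemptyset(&block_mask);
	sigaddset(&block_mask, SIGINT);
	sigprocmask(SIG_BLOCK, &block_mask, &orig_mask);

	for (;;)
	{
		if (cancel_pressed)
			break;

		FD_ZERO(&input_mask);
		FD_SET(sock, &input_mask);

		/* SIGINT can only be delivered inside pselect(), never lost */
		if (pselect(sock + 1, &input_mask, NULL, NULL, NULL, &orig_mask) > 0)
			break;				/* socket is ready */
		if (errno != EINTR)
		{
			sigprocmask(SIG_SETMASK, &orig_mask, NULL);
			return false;		/* real error */
		}
		/* EINTR: loop around and re-check cancel_pressed */
	}

	sigprocmask(SIG_SETMASK, &orig_mask, NULL);
	return true;
}
```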


--
Heikki Linnakangas
Neon (https://neon.tech)





Re: SQL:2011 application time

2023-11-23 Thread Peter Eisentraut

On 20.11.23 08:58, Peter Eisentraut wrote:

On 17.11.23 19:39, Paul Jungwirth wrote:
But I feel the overall approach is wrong: originally I used hardcoded 
"=" and "&&" operators, and you asked me to look them up by strategy 
number instead. But that leads to trouble with core gist types vs 
btree_gist types. The core gist opclasses use RT*StrategyNumbers, but 
btree_gist creates opclasses with BT*StrategyNumbers.


Ouch.

That also provides the answer to my question #2 here: 
https://www.postgresql.org/message-id/6f010a6e-8e20-658b-dc05-dc9033a694da%40eisentraut.org


I don't have a good idea about this right now.  Could we just change 
btree_gist perhaps?  Do we need a new API for this somewhere?


After further thought, I think the right solution is to change 
btree_gist (and probably also btree_gin) to use the common RT* strategy 
numbers.  The strategy numbers are the right interface to determine the 
semantics of index AM operators.  It's just that until now, nothing 
external has needed this information from gist indexes (unlike btree, 
hash), so it has been a free-for-all.


I don't see an ALTER OPERATOR CLASS command that could be used to 
implement this.  Maybe we could get away with a direct catalog UPDATE. 
Or we need to make some DDL for this.
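
A sketch of the direct catalog UPDATE route (hypothetical: the opfamily 
name is illustrative, and a real change would have to cover every 
btree_gist opfamily, presumably from an extension upgrade script):

-- Remap btree_gist's equality operator from the btree strategy number
-- (BTEqualStrategyNumber, 3) to the gist one (RTEqualStrategyNumber, 18)
-- for a single operator family.
UPDATE pg_amop
   SET amopstrategy = 18
 WHERE amopstrategy = 3
   AND amopfamily = (SELECT oid FROM pg_opfamily
                     WHERE opfname = 'gist_int4_ops');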


Alternatively, this could be the time to reconsider moving this into core.




Re: pgoutput incorrectly replaces missing values with NULL since PostgreSQL 15

2023-11-23 Thread Amit Kapila
On Thu, Nov 23, 2023 at 1:10 PM Nikhil Benesch  wrote:
>
> While working on Materialize's streaming logical replication from Postgres 
> [0],
> my colleagues Sean Loiselle and Petros Angelatos (CC'd) discovered today what
> appears to be a correctness bug in pgoutput, introduced in v15.
>
> The problem goes like this. A table with REPLICA IDENTITY FULL and some
> data in it...
>
> CREATE TABLE t (a int);
> ALTER TABLE t REPLICA IDENTITY FULL;
> INSERT INTO t VALUES (1), (2), (3), ...;
>
> ...undergoes a schema change to add a new column with a default:
>
> ALTER TABLE t ADD COLUMN b bool DEFAULT false NOT NULL;
>
> PostgreSQL is smart and does not rewrite the entire table during the schema
> change. Instead it updates the tuple description to indicate to future readers
> of the table that if `b` is missing, it should be filled in with the default
> value, `false`.
>
> Unfortunately, since v15, pgoutput mishandles missing attributes. If a
> downstream server is subscribed to changes from t via the pgoutput plugin, 
> when
> a row with a missing attribute is updated, e.g.:
>
> UPDATE t SET a = 2 WHERE a = 1
>
> pgoutput will incorrectly report b's value as NULL in the old tuple, rather 
> than
> false.
>

Thanks, I could reproduce this behavior. I'll look into your patch.

> Using the same example:
>
> old: a=1, b=NULL
> new: a=2, b=false
>
> The subscriber will ignore the update (as it has no row with values
> a=1, b=NULL), and thus the subscriber's copy of `t` will become out of sync 
> with
> the publisher's.
>
> I bisected the problem to 52e4f0cd4 [1], which introduced row filtering for
> publications. The problem appears to be the use of CreateTupleDescCopy where
> CreateTupleDescCopyConstr is required, as the former drops the constraints
> in the tuple description (specifically, the default value constraint) on the
> floor.
>
> I've attached a patch which both fixes the issue and includes a test. I've
> verified that the test fails against the current master and passes against
> the patched version.
>
> I'm relatively unfamiliar with the project norms here, but assuming the patch 
> is
> acceptable, this strikes me as important enough to warrant a backport to both
> v15 and v16.
>

Right.

-- 
With Regards,
Amit Kapila.
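
For reference, the change described above amounts to something like this at 
the affected call site in pgoutput.c (a sketch; the variable names are 
assumed, not quoted from the source):

	/*
	 * Sketch of the fix: copy the tuple descriptor together with its
	 * constraints, so the stored defaults for missing attributes
	 * survive into the old-tuple slot.
	 */
	oldtupdesc = CreateTupleDescCopyConstr(outdesc);	/* was CreateTupleDescCopy */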




Re: pg_upgrade and logical replication

2023-11-23 Thread vignesh C
On Thu, 23 Nov 2023 at 05:56, Peter Smith  wrote:
>
> Here are some review comments for patch v17-0001
>
> ==
> src/bin/pg_dump/pg_dump.c
>
> 1. getSubscriptionTables
>
> +/*
> + * getSubscriptionTables
> + *   Get information about subscription membership for dumpable tables. This
> + *   will be used only in binary-upgrade mode and for PG17 or later versions.
> + */
> +void
> +getSubscriptionTables(Archive *fout)
> +{
> + DumpOptions *dopt = fout->dopt;
> + SubscriptionInfo *subinfo = NULL;
> + SubRelInfo *subrinfo;
> + PQExpBuffer query;
> + PGresult   *res;
> + int i_srsubid;
> + int i_srrelid;
> + int i_srsubstate;
> + int i_srsublsn;
> + int ntups;
> + Oid last_srsubid = InvalidOid;
> +
> + if (dopt->no_subscriptions || !dopt->binary_upgrade ||
> + fout->remoteVersion < 17)
> + return;
>
> I still felt that the function comment ("used only in binary-upgrade
> mode and for PG17 or later") was misleading. IMO that sounds like it
> would be OK for PG17 regardless of the binary mode, but the code says
> otherwise.
>
> Assuming the code is correct, perhaps the comment should say:
> "... used only in binary-upgrade mode for PG17 or later versions."

Modified

> ~~~
>
> 2. dumpSubscriptionTable
>
> +/*
> + * dumpSubscriptionTable
> + *   Dump the definition of the given subscription table mapping. This will
> + *   be used only in binary-upgrade mode and for PG17 or later versions.
> + */
> +static void
> +dumpSubscriptionTable(Archive *fout, const SubRelInfo *subrinfo)
>
> (this is the same as the previous review comment #1)
>
> Assuming the code is correct, perhaps the comment should say:
> "... used only in binary-upgrade mode for PG17 or later versions."

Modified

> ==
> src/bin/pg_upgrade/check.c
>
> 3.
> +static void
> +check_old_cluster_subscription_state()
> +{
> + FILE*script = NULL;
> + char output_path[MAXPGPATH];
> + int ntup;
> + ClusterInfo *cluster = &old_cluster;
> +
> + prep_status("Checking for subscription state");
> +
> + snprintf(output_path, sizeof(output_path), "%s/%s",
> + log_opts.basedir,
> + "subs_invalid.txt");
> + for (int dbnum = 0; dbnum < cluster->dbarr.ndbs; dbnum++)
> + {
> + PGresult   *res;
> + DbInfo*active_db = &cluster->dbarr.dbs[dbnum];
> + PGconn*conn = connectToServer(cluster, active_db->db_name);
>
> There seems no need for an extra variable ('cluster') here when you
> can just reference 'old_cluster' directly in the code, the same as
> other functions in this file do all the time.

Modified

The v18 version patch attached at [1] has the changes for the same.
[1] - https://www.postgresql.org/message-id/CALDaNm3wyYY5ywFpCwUVW1_Di1af3WxeZggGEDQEu8qa58a7FQ%40mail.gmail.com




Re: pg_upgrade and logical replication

2023-11-23 Thread vignesh C
On Tue, 21 Nov 2023 at 07:11, Michael Paquier  wrote:
>
> On Mon, Nov 20, 2023 at 09:49:41AM +0530, Amit Kapila wrote:
> > On Tue, Nov 14, 2023 at 7:21 AM vignesh C  wrote:
> >> There are couple of things happening here: a) In the first part we
> >> take care of setting subscription relation to SYNCDONE and dropping
> >> the replication slot at publisher node, only if drop replication slot
> >> is successful the relation state will be set to SYNCDONE , if drop
> >> replication slot fails the relation state will still be in
> >> FINISHEDCOPY. So if there is a failure in the drop replication slot we
> >> will not have an issue as the tablesync worker will be in
> >> FINISHEDCOPYstate and this state is not allowed for upgrade. When the
> >> state is in SYNCDONE the tablesync slot will not be present. b) In the
> >> second part we drop the replication origin, even if there is a chance
> >> that drop replication origin fails due to some reason, there will be
> >> no problem as we do not copy the table sync replication origin to the
> >> new cluster while upgrading. Since the table sync replication origin
> >> is not copied to the new cluster there will be no replication origin
> >> leaks.
> >
> > And, this will work because in the SYNCDONE state, while removing the
> > origin, we are okay with missing origins. It seems not copying the
> > origin for tablesync workers in this state (SYNCDONE) relies on the
> > fact that currently, we don't use those origins once the system
> > reaches the SYNCDONE state but I am not sure it is a good idea to have
> > such a dependency and that upgrade assuming such things doesn't seems
> > ideal to me.
>
> Hmm, yeah, you mean the replorigin_drop_by_name() calls in
> tablesync.c.  I did not pay much attention about that in the code, but
> your point sounds sensible.
>
> (I have not been able to complete an analysis of the risks behind 's'
> to convince myself that it is entirely safe, but leaks are scary as
> hell if this gets automated across a large fleet of nodes..)
>
> > Personally, I think allowing an upgrade in  'i'
> > (initialize) state or 'r' (ready) state seems safe because in those
> > states either slots/origins don't exist or are dropped. What do you
> > think?
>
> I share a similar impression about 's'.  From a design point of view,
> making the conditions to reach harder in the first implementation
> makes the user experience stricter, but that's safer regarding leaks
> and it is still possible to relax these choices in the future
> depending on the improvement pieces we are able to figure out.

Based on the suggestions just to have safe init and ready state, I
have made the changes to handle the same in v18 version patch
attached.

Regards,
Vignesh
From 9b04ee88e58204aa8dbfd0a821225a5b0474512c Mon Sep 17 00:00:00 2001
From: Vignesh C 
Date: Mon, 30 Oct 2023 12:31:59 +0530
Subject: [PATCH v18] Preserve the full subscription's state during pg_upgrade

Previously, only the subscription metadata information was preserved.  Without
the list of relations and their state it's impossible to re-enable the
subscriptions without missing some records as the list of relations can only be
refreshed after enabling the subscription (and therefore starting the apply
worker).  Even if we added a way to refresh the subscription while enabling a
publication, we still wouldn't know which relations are new on the publication
side, and therefore should be fully synced, and which shouldn't.

To fix this problem, this patch teaches pg_dump to restore the content of
pg_subscription_rel from the old cluster by using
binary_upgrade_add_sub_rel_state SQL function. This is supported only
in binary upgrade mode.

The new SQL binary_upgrade_add_sub_rel_state function has the following
syntax:
SELECT binary_upgrade_add_sub_rel_state(subname text, relid oid, state char [,sublsn pg_lsn])

In the above, subname is the subscription name, relid is the relation
identifier, the state is the state of the relation, sublsn is subscription lsn
which is optional, and defaults to NULL/InvalidXLogRecPtr if not provided.
pg_dump will retrieve these values (subname, relid, state and sublsn) from the
old cluster.

The subscription's replication origin is needed to ensure that we don't
replicate anything twice.

To fix this problem, this patch teaches pg_dump to update the replication
origin along with create subscription by using
binary_upgrade_replorigin_advance SQL function to restore the
underlying replication origin remote LSN. This is supported only in
binary upgrade mode.

The new SQL binary_upgrade_replorigin_advance function has the following
syntax:
SELECT binary_upgrade_replorigin_advance(subname text, sublsn pg_lsn)

In the above, subname is the subscription name and sublsn is subscription lsn.
pg_dump will retrieve these values (subname and sublsn) from the old cluster.
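
For illustration, a dump of a subscription with one synced table could then
contain calls along these lines (the subscription name, relation OID and LSN
below are made-up values):

-- Hypothetical pg_dump output fragment in binary-upgrade mode.
SELECT binary_upgrade_add_sub_rel_state('mysub', 16418, 'r', '0/1A2B3C4');
SELECT binary_upgrade_replorigin_advance('mysub', '0/1A2B3C4');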

pg_upgrade will check that all the subscription relations are in 'i' (init) or
in 'r' (ready) state, and will error out if that's not the case.
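
The per-database check boils down to a query of roughly this shape (a
sketch, not the patch's exact text):

-- Report any subscription relation whose sync state is neither
-- 'i' (init) nor 'r' (ready).
SELECT s.subname, rel.srrelid::regclass AS relname, rel.srsubstate
FROM pg_subscription_rel rel
     JOIN pg_subscription s ON s.oid = rel.srsubid
WHERE rel.srsubstate NOT IN ('i', 'r');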

Re: How to stop autovacuum silently

2023-11-23 Thread Maxim Orlov
On Wed, 22 Nov 2023 at 21:13, Peter Geoghegan  wrote:

> Are you aware of commit e83ebfe6d7, which added a similar WARNING at
> the point when VACUUM overwrites a relfrozenxid/relminmxid "from the
> future"? It's a recent one.
>
Thank you for your reply!  I hadn't noticed it.  But in the case described
above, it doesn't produce any warnings.  My concern here is that with a
couple of updates, we can stop autovacuum implicitly without any warnings.


> Was pg_upgrade even run against this database? My guess is that the
> underlying problem was caused by the bug fixed by commit 74cf7d46.
>
I'm pretty sure it was, but, unfortunately, there is no way to confirm this
100%.  All I know is that they're using PG13 now.

-- 
Best regards,
Maxim Orlov.


Re: patch: improve "user mapping not found" error message

2023-11-23 Thread Peter Eisentraut

On 20.11.23 02:25, Ian Lawrence Barwick wrote:

Mon, 3 Jul 2023 18:22 Peter Eisentraut :


On 23.06.23 09:45, Ian Lawrence Barwick wrote:

   if (!HeapTupleIsValid(tp))
+ {
+ ForeignServer *server = GetForeignServer(serverid);
+
   ereport(ERROR,
   (errcode(ERRCODE_UNDEFINED_OBJECT),
-  errmsg("user mapping not found for \"%s\"",
- MappingUserName(userid)));
+  errmsg("user mapping not found for user \"%s\", server \"%s\"",
+ MappingUserName(userid),
+ server->servername)));
+ }


What if the foreign server does not exist either?  Then this would show
a "cache lookup failed" error message, which I think we should avoid.

There is existing logic for handling this in
get_object_address_usermapping().


Apologies, missed this response somewhere. Does the attached fix that?


Hmm, now that I look at this again, under what circumstances would the 
server not be found?  Maybe the first patch was right and it should give 
a "scary" error in that case, instead of just omitting it.
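
One way to keep the richer message without risking a "cache lookup failed" 
error would be roughly the following (a sketch, assuming 
GetForeignServerExtended() with FSV_MISSING_OK, which tolerates a 
concurrently dropped server):

	ForeignServer *server = GetForeignServerExtended(serverid,
													 FSV_MISSING_OK);

	if (server)
		ereport(ERROR,
				(errcode(ERRCODE_UNDEFINED_OBJECT),
				 errmsg("user mapping not found for user \"%s\", server \"%s\"",
						MappingUserName(userid), server->servername)));
	else
		ereport(ERROR,
				(errcode(ERRCODE_UNDEFINED_OBJECT),
				 errmsg("user mapping not found for \"%s\"",
						MappingUserName(userid))));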


In any case, this patch appears to be missing an update in the 
postgres_fdw test output.






confusion about Re: Write operations in parallel mode's update part.

2023-11-23 Thread jiye
Hi,


I found a discussion about parallel DML; it reads as follows:
PostgreSQL: Re: Write operations in parallel mode


Make updates and deletes parallel-restricted rather than
parallel-unsafe - i.e. allow updates and deletes but only in the
leader. This similarly would allow Update -> Gather -> whatever and
Delete -> Gather -> whatever. For this, you'd need a shared combo CID
hash so that workers can learn about new combo CIDs created by the
leader.


I have some questions about this.

When doing Update -> Gather -> whatever, all update jobs are done by the
leader, so the leader knows its own combo CID mapping; it is only the other
workers that cannot learn about it. So why must those workers know the
leader's combo CIDs? Why do those workers need to see the leader's updated
tuples? Could you give me some example cases or unusual scenarios for this
kind of parallel update?






jiye
jiye...@126.com

Re: [PATCH] Add CHECK_FOR_INTERRUPTS in scram_SaltedPassword loop.

2023-11-23 Thread Aleksander Alekseev
Hi,

> > If we want to add CHECK_FOR_INTERRUPTS inside the loop I think a brief
> > comment would be appropriate.
>
> This has been completed in patch v2 and it's ready for review.

Thanks!

> > I don't think it would be useful to limit this at an arbitrary point, 
> > iteration
> > count can be set per password and if someone wants a specific password to be
> > super-hard to brute force then why should we limit that?
> I agree with that. Maybe some users do want a super-hard password.
> RFC 7677 and RFC 5802 don't specify the maximum number of iterations.

That's a fairly good point. However, we are not obligated to avoid
implementing something merely because it is missing from the RFC. Also,
in fact, we already limit the number of iterations to INT_MAX.

If we decide to limit this number even further the actual problem is
to figure out what the new practical limit would be. Regardless of the
chosen number there is a possibility of breaking backward
compatibility for certain users.

For this reason I believe merging the proposed patch would be the
right move at this point. It doesn't make anything worse for the
existing users and solves a potential problem for some of them.
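
As a sketch of what the patch does (the function below is illustrative;
scram_SaltedPassword()'s real body performs HMAC rounds that are elided
here):

#include "postgres.h"
#include "miscadmin.h"

/*
 * The iteration count is supplied per password, so it can be made
 * arbitrarily large; checking for interrupts each round keeps the
 * loop cancellable without imposing a lower cap.
 */
static void
salt_password_rounds(int iterations)
{
	for (int i = 2; i <= iterations; i++)
	{
		CHECK_FOR_INTERRUPTS();
		/* ... one HMAC round of the SCRAM salting goes here ... */
	}
}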

-- 
Best regards,
Aleksander Alekseev




RE: CRC32C Parallel Computation Optimization on ARM

2023-11-23 Thread Xiang Gao
On Wed, 22 Nov 2023 15:06:18 -0600, Nathan Bossart wrote:

>> On Fri, 10 Nov 2023 10:36:08 -0600, Nathan Bossart wrote:
>>>+__attribute__((target("+crc+crypto")))
>>>
>>>I'm not sure we can assume that all compilers will understand this, and I'm
>>>not sure we need it.
>>
>> CFLAGS_CRC is "-march=armv8-a+crc". Generally, if -march is supported,
>> __attribute__ is also supported.

>IMHO we should stick with CFLAGS_CRC for now.  If we want to switch to
>using __attribute__((target("..."))), I think we should do so in a separate
>patch.  We are cautious about checking the availability of an attribute
>before using it (see c.h), and IIUC we'd need to verify that this works for
>all supported compilers that can target ARM before removing CFLAGS_CRC
>here.

I agree.

>> In addition, I am not sure about the source file pg_crc32c_armv8.c, if
>> CFLAGS_CRC and CFLAGS_CRYPTO are needed at the same time, how should it
>> be expressed in the makefile?
>
>pg_crc32c_armv8.o: CFLAGS += ${CFLAGS_CRC} ${CFLAGS_CRYPTO}

It does not work correctly: CFLAGS ends up as '-march=armv8-a+crc
-march=armv8-a+crypto', and only the last -march option takes effect,
i.e. '-march=armv8-a+crypto'.

We set a new variable CFLAGS_CRC_CRYPTO in configure.ac:

if test x"$CFLAGS_CRC" != x"" || test x"$CFLAGS_CRYPTO" != x""; then
  CFLAGS_CRC_CRYPTO='-march=armv8-a+crc+crypto'
fi

then in the makefile:
pg_crc32c_armv8.o: CFLAGS += $(CFLAGS_CRC_CRYPTO)

And same thing in meson.build.  In src/port/meson.build,

replace_funcs_pos = [
  # arm / aarch64
  ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C', 'crc_crypto'],
  ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 'crc_crypto'],
  ['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 'crc_crypto'],
  ['pg_crc32c_sb8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
]
'pg_crc32c_armv8' also needs 'crc_crypto' when 'USE_ARMV8_CRC32C'.

Looking forward to your feedback, thanks!
