Re: Synchronizing slots from primary to standby

2024-02-18 Thread Bertrand Drouvot
Hi,

On Sat, Feb 17, 2024 at 10:10:18AM +0530, Amit Kapila wrote:
> On Fri, Feb 16, 2024 at 4:10 PM shveta malik  wrote:
> >
> > On Thu, Feb 15, 2024 at 10:48 PM Bertrand Drouvot
> >  wrote:
> >
> > > 5 ===
> > >
> > > +   if (SlotSyncWorker->syncing)
> > > {
> > > -   SpinLockRelease(&SlotSyncCtx->mutex);
> > > +   SpinLockRelease(&SlotSyncWorker->mutex);
> > > ereport(ERROR,
> > > 
> > > errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
> > > errmsg("cannot synchronize replication 
> > > slots concurrently"));
> > > }
> > >
> > > worth adding a test in 040_standby_failover_slots_sync.pl for it?
> >
> > It will be very difficult to stabilize this test as we have to make
> > sure that the concurrent users (SQL function(s) and/or worker(s)) are
> > in that target function at the same time to hit it.
> >
> 
> Yeah, I also think it would be tricky to write a stable test. Maybe one
> can explore using the new injection point facility, but I don't think it
> is worth it for this error check, as it seems too straightforward to be
> broken by future changes.

Yeah, injection point would probably be the way to go. Agree that's probably
not worth adding such a test (we can change our mind later on if needed anyway).
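
For context, a minimal sketch of the concurrency the check guards against
(assuming pg_sync_replication_slots() as the SQL-level entry point on the
standby; the second caller must reach the function while the first is still
syncing, which is exactly the window that is hard to hit reliably):

-- session 1, on the standby:
SELECT pg_sync_replication_slots();

-- session 2, on the standby, while session 1 is still running:
SELECT pg_sync_replication_slots();
-- ERROR:  cannot synchronize replication slots concurrently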

Regards,

-- 
Bertrand Drouvot
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com




Re: Injection points: some tools to wait and wake

2024-02-18 Thread Michael Paquier
On Mon, Feb 19, 2024 at 03:01:40PM +0900, Michael Paquier wrote:
> 0002 is a polished version of the TAP test that makes use of this
> facility, providing coverage for the bug fixed by 7863ee4def65
> (reverting this commit causes the test to fail), where a restart point 
> runs across a promotion request.  The trick is to stop the
> checkpointer in the middle of a restart point and issue a promotion
> in-between.

The CF bot has been screaming at this one on Windows because the
process started with IPC::Run::start was not properly finished, so
attached is an updated version to bring that back to green.
--
Michael
From ef27d6eba619f5106915de6b05cbeb5294263c30 Mon Sep 17 00:00:00 2001
From: Michael Paquier 
Date: Mon, 19 Feb 2024 14:35:55 +0900
Subject: [PATCH v2 1/2] injection_points: Add routines to wait and wake
 processes

This commit is made of two parts:
- A new callback that can be attached to a process to make it wait on a
condition variable.  The condition checked is registered in shared
memory by the module injection_points.
- A new SQL function to update the shared state and broadcast the update
using a condition variable.

The shared state used by the module is registered using the DSM
registry, and is optional.
---
 .../injection_points--1.0.sql |  10 ++
 .../injection_points/injection_points.c   | 104 ++
 src/tools/pgindent/typedefs.list  |   1 +
 3 files changed, 115 insertions(+)

diff --git a/src/test/modules/injection_points/injection_points--1.0.sql b/src/test/modules/injection_points/injection_points--1.0.sql
index 5944c41716..20479991f2 100644
--- a/src/test/modules/injection_points/injection_points--1.0.sql
+++ b/src/test/modules/injection_points/injection_points--1.0.sql
@@ -24,6 +24,16 @@ RETURNS void
 AS 'MODULE_PATHNAME', 'injection_points_run'
 LANGUAGE C STRICT PARALLEL UNSAFE;
 
+--
+-- injection_points_wake()
+--
+-- Wakes a condition variable waited on in an injection point.
+--
+CREATE FUNCTION injection_points_wake()
+RETURNS void
+AS 'MODULE_PATHNAME', 'injection_points_wake'
+LANGUAGE C STRICT PARALLEL UNSAFE;
+
 --
 -- injection_points_detach()
 --
diff --git a/src/test/modules/injection_points/injection_points.c b/src/test/modules/injection_points/injection_points.c
index e843e6594f..3a319b1525 100644
--- a/src/test/modules/injection_points/injection_points.c
+++ b/src/test/modules/injection_points/injection_points.c
@@ -18,18 +18,67 @@
 #include "postgres.h"
 
 #include "fmgr.h"
+#include "storage/condition_variable.h"
 #include "storage/lwlock.h"
 #include "storage/shmem.h"
+#include "storage/dsm_registry.h"
 #include "utils/builtins.h"
 #include "utils/injection_point.h"
 #include "utils/wait_event.h"
 
 PG_MODULE_MAGIC;
 
+/* Shared state information for injection points. */
+typedef struct InjectionPointSharedState
+{
+	/* protects accesses to wait_counts */
+	slock_t		lock;
+
+	/* Counter advancing when injection_points_wake() is called */
+	int			wait_counts;
+
+	/*
+	 * Condition variable that can be used in an injection point, checking
+	 * upon wait_counts when waiting.
+	 */
+	ConditionVariable wait_point;
+} InjectionPointSharedState;
+
+/* Pointer to shared-memory state. */
+static InjectionPointSharedState *inj_state = NULL;
+
+/* Wait event when waiting on condition variable */
+static uint32 injection_wait_event = 0;
+
 extern PGDLLEXPORT void injection_error(const char *name);
 extern PGDLLEXPORT void injection_notice(const char *name);
+extern PGDLLEXPORT void injection_wait(const char *name);
 
 
+static void
+injection_point_init_state(void *ptr)
+{
+	InjectionPointSharedState *state = (InjectionPointSharedState *) ptr;
+
+	state->wait_counts = 0;
+	SpinLockInit(&state->lock);
+	ConditionVariableInit(&state->wait_point);
+}
+
+static void
+injection_init_shmem(void)
+{
+	bool		found;
+
+	if (inj_state != NULL)
+		return;
+
+	inj_state = GetNamedDSMSegment("injection_points",
+								   sizeof(InjectionPointSharedState),
+								   injection_point_init_state,
+								   &found);
+}
+
 /* Set of callbacks available to be attached to an injection point. */
 void
 injection_error(const char *name)
@@ -43,6 +92,38 @@ injection_notice(const char *name)
 	elog(NOTICE, "notice triggered for injection point %s", name);
 }
 
+/* Wait on a condition variable, awaken by injection_points_wake() */
+void
+injection_wait(const char *name)
+{
+	int			old_wait_counts;
+
+	if (inj_state == NULL)
+		injection_init_shmem();
+	if (injection_wait_event == 0)
+		injection_wait_event = WaitEventExtensionNew("injection_wait");
+
+	SpinLockAcquire(&inj_state->lock);
+	old_wait_counts = inj_state->wait_counts;
+	SpinLockRelease(&inj_state->lock);
+
+	/* And sleep.. */
+	ConditionVariablePrepareToSleep(&inj_state->wait_point);
+	for (;;)
+	{
+		int			new_wait_counts;
+
+		SpinLockAcquire(&inj_state->lock);
+		new_wait_counts = inj_state->wait_counts;
+		SpinLockRelease(&inj_state->lock);
+
+		if (old_wait_counts != new_wait_counts)
+			break;
+		

Re: Fix race condition in InvalidatePossiblyObsoleteSlot()

2024-02-18 Thread Bertrand Drouvot
Hi,

On Mon, Feb 19, 2024 at 03:14:07PM +0900, Michael Paquier wrote:
> On Thu, Feb 15, 2024 at 11:24:59AM +, Bertrand Drouvot wrote:
> > Thanks for looking at it!
> > 
> > Agree, using an assertion instead in v3 attached.
> 
> And you did not forget the PG_USED_FOR_ASSERTS_ONLY.

Thanks to the "CompilerWarnings" cirrus test ;-)

> 
> > > It seems important to document why we are saving this data here; while
> > > we hold the LWLock for repslots, before we perform any termination,
> > > and we want to prevent conflict reports from using incorrect data if
> > > the slot data got changed while the LWLock is temporarily released
> > > during a termination.
> > 
> > Yeah, comments added in v3.
> 
> The contents look rather OK, I may do some word-smithing for both.

Thanks!

> >> For example just after the kill()?  It seems to me
> >> that we should stall the checkpointer, perform a forced upgrade of the
> >> xmins, release the checkpointer and see the effects of the change in
> >> the second loop.  Now, modules/injection_points/ does not allow that,
> >> yet, but I've posted a patch along these lines that can stop a process
> >> and release it using a condition variable (should be 0006 on [1]).  I
> >> was planning to start a new thread with a patch posted for the next CF
> >> to add this kind of facility with a regression test for an old bug,
> >> the patch needs a few hours of love, first.  I should be able to post
> >> that next week.
> > 
> > Great, that looks like a good idea!
> 
> I've been finally able to spend some time on what I had in mind and
> posted it here for the next CF:
> https://www.postgresql.org/message-id/zdluxbk5hgpol...@paquier.xyz
> 
> You should be able to reuse that the same way I do in 0002 posted on
> the thread, where I'm causing the checkpointer to wait, then wake it
> up.

Thanks! I'll look at it.

> > Are we going to fix this on master and 16 stable first and then later on 
> > add a
> > test on master with the injection points?
> 
> Still, the other patch is likely going to take a couple of weeks
> before getting into the tree, so I have no objection to fix the bug
> first and backpatch, then introduce a test.  Things have proved that
> failures could show up in the buildfarm in this area, so more time
> running this code across two branches is not a bad concept, either.

Fully agree.

Regards,

-- 
Bertrand Drouvot
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com




Re: Show WAL write and fsync stats in pg_stat_io

2024-02-18 Thread Nazir Bilal Yavuz
Hi,

On Thu, 18 Jan 2024 at 04:22, Michael Paquier  wrote:
>
> On Wed, Jan 17, 2024 at 03:20:39PM +0300, Nazir Bilal Yavuz wrote:
> > I agree with your points. While the other I/O related work is
> > happening we can discuss what we should do in the variable op_byte
> > cases. Also, this is happening only for WAL right now but if we try to
> > extend pg_stat_io in the future, that problem possibly will rise
> > again. So, it could be good to come up with a general solution, not
> > only for WAL.
>
> Okay, I've marked the patch as RwF for this CF.

I wanted to inform you that commit 73f0a13266 changed all WALRead
calls to read variable bytes; previously, only the WAL receiver was
reading variable bytes.

-- 
Regards,
Nazir Bilal Yavuz
Microsoft




Re: Add pg_basetype() function to obtain a DOMAIN base type

2024-02-18 Thread jian he
On Sun, Feb 18, 2024 at 7:29 AM Tomas Vondra
 wrote:
>
>
> Also, now that I looked at the v2 patch again, I see it only really
> tweaked the pg_proc.dat entry, but the code still does PG_GETARG_OID (so
> the "any" bit is not really correct).
>

The PG_GETARG_OID part is indeed wrong, so I changed it to the following:

+Datum
+pg_basetype(PG_FUNCTION_ARGS)
+{
+ Oid oid;
+
+ oid =  get_fn_expr_argtype(fcinfo->flinfo, 0);
+ if (!SearchSysCacheExists1(TYPEOID, ObjectIdGetDatum(oid)))
+ PG_RETURN_NULL();
+
+ PG_RETURN_OID(getBaseType(oid));
+}

I still named the function pg_basetype; feel free to change it.

+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>pg_basetype</primary>
+        </indexterm>
+        <function>pg_basetype</function> ( <type>"any"</type> )
+        <returnvalue>regtype</returnvalue>
+       </para>
+       <para>
+        Returns the OID of the base type of a domain, or the argument's own
+        type if it is not a domain. If there is a chain of domain
+        dependencies, it recurses until it finds the base type.
+       </para>
Compared with pg_typeof's explanation, I feel pg_basetype's
explanation doesn't seem accurate. However, I don't know how to
rephrase it.
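
For illustration, a sketch of the recursive case under this patch
(mytext_child is a hypothetical domain-over-a-domain; since the function is
declared non-strict on "any", a typed NULL works as an argument):

CREATE DOMAIN mytext AS text;
CREATE DOMAIN mytext_child AS mytext;

SELECT pg_basetype(NULL::mytext_child);
 pg_basetype
-------------
 text
(1 row)
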
From a06f2de575da6e5fa45919c792f3dab2470f4927 Mon Sep 17 00:00:00 2001
From: jian he 
Date: Mon, 19 Feb 2024 15:01:19 +0800
Subject: [PATCH v3 1/1] Add pg_basetype("any") function

Currently, obtaining the base type of a domain involves a complex SQL query,
especially in the case of recursively defined domain types.

To solve this, use the already available getBaseType() function,
and expose it as the pg_basetype SQL function.

If the argument is not a domain type, return the type of the argument as is.
If the argument is a domain, pg_basetype will recurse the domain dependency
chain and return the innermost base type of the domain.

discussion:  https://postgr.es/m/CAGRrpzZSX8j=MQcbCSEisFA=ic=K3bknVfnFjAv1diVJxFHJvg@mail.gmail.com
---
 doc/src/sgml/func.sgml   | 25 +++
 src/backend/utils/adt/misc.c | 16 +
 src/include/catalog/pg_proc.dat  |  3 +++
 src/test/regress/expected/domain.out | 36 
 src/test/regress/sql/domain.sql  | 16 +
 5 files changed, 96 insertions(+)

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index cf3de803..5af0f4c6 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -24778,6 +24778,31 @@ SELECT pg_type_is_visible('myschema.widget'::regtype);

   
 
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>pg_basetype</primary>
+        </indexterm>
+        <function>pg_basetype</function> ( <type>"any"</type> )
+        <returnvalue>regtype</returnvalue>
+       </para>
+       <para>
+        Returns the OID of the base type of a domain, or the argument's own
+        type if it is not a domain. If there is a chain of domain
+        dependencies, it recurses until it finds the base type.
+       </para>
+       <para>
+        For example:
+<programlisting>
+CREATE DOMAIN mytext AS text;
+
+SELECT pg_basetype('mytext'::mytext);
+ pg_basetype
+-------------
+ text
+</programlisting>
+       </para>
+      </entry>
+     </row>
+
   

 
diff --git a/src/backend/utils/adt/misc.c b/src/backend/utils/adt/misc.c
index 2d7d7806..30f6a2e5 100644
--- a/src/backend/utils/adt/misc.c
+++ b/src/backend/utils/adt/misc.c
@@ -45,6 +45,7 @@
 #include "utils/fmgroids.h"
 #include "utils/lsyscache.h"
 #include "utils/ruleutils.h"
+#include "utils/syscache.h"
 #include "utils/timestamp.h"
 
 
@@ -566,6 +567,21 @@ pg_typeof(PG_FUNCTION_ARGS)
 	PG_RETURN_OID(get_fn_expr_argtype(fcinfo->flinfo, 0));
 }
 
+/*
+ * Return the base type of the argument.
+ * If the argument is not a domain type, return the argument's type as is.
+ */
+Datum
+pg_basetype(PG_FUNCTION_ARGS)
+{
+	Oid			oid;
+
+	oid =  get_fn_expr_argtype(fcinfo->flinfo, 0);
+	if (!SearchSysCacheExists1(TYPEOID, ObjectIdGetDatum(oid)))
+		PG_RETURN_NULL();
+
+	PG_RETURN_OID(getBaseType(oid));
+}
 
 /*
  * Implementation of the COLLATE FOR expression; returns the collation
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 9c120fc2..bbd6b411 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -3877,6 +3877,9 @@
 { oid => '1619', descr => 'type of the argument',
   proname => 'pg_typeof', proisstrict => 'f', provolatile => 's',
   prorettype => 'regtype', proargtypes => 'any', prosrc => 'pg_typeof' },
+{ oid => '6312', descr => 'get the base type of a domain',
+  proname => 'pg_basetype', proisstrict => 'f', provolatile => 's',
+  prorettype => 'regtype', proargtypes => 'any', prosrc => 'pg_basetype' },
 { oid => '3162',
   descr => 'collation of the argument; implementation of the COLLATION FOR expression',
   proname => 'pg_collation_for', proisstrict => 'f', provolatile => 's',
diff --git a/src/test/regress/expected/domain.out b/src/test/regress/expected/domain.out
index 6d94e844..13bf7877 100644
--- a/src/test/regress/expected/domain.out
+++ b/src/test/regress/expected/domain.out
@@ -1207,3 +1207,39 @@ create domain testdomain1 as int constraint unsigned check (value > 0);
 alter domain testdomain1 rename constraint unsigned to 

Add an option to skip loading missing publication to avoid logical replication failure

2024-02-18 Thread vignesh C
Hi,

Currently ALTER SUBSCRIPTION ... SET PUBLICATION will break logical
replication in certain cases. This can happen because the apply worker
gets restarted after SET PUBLICATION and uses the existing slot and
replication origin corresponding to the subscription. Now, it is
possible that before the restart the origin has not been updated and
the WAL start location points to a location prior to where the
publication exists, which can lead to a "publication does not exist"
error. Once this error occurs, the apply worker will never be able to
proceed and will keep failing with the same error.

There was discussion on this, and Amit had posted a patch to handle
it at [1]. Amit's patch does continue using a historic snapshot but
ignores publications that are not found for the purpose of computing
RelSyncEntry attributes. We won't mark such an entry as valid till all
the publications are loaded without anything missing. This means we
won't publish operations on tables corresponding to that publication
till we find such a publication, and that seems okay.
I have added an option skip_not_exist_publication to enable this
behavior only when skip_not_exist_publication is specified as true.
There is no change in the default behavior when
skip_not_exist_publication is specified as false.
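
As a usage sketch under the proposed patch (assuming the option name stays
skip_not_exist_publication), the subscription in the example below would be
created as:

CREATE SUBSCRIPTION test1 CONNECTION 'dbname=postgres host=localhost port=5432'
    PUBLICATION pub1, pub2 WITH (skip_not_exist_publication = true);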

But one thing to note with the patch (with the skip_not_exist_publication
option) is that replication of a few WAL entries will be skipped till the
publication is loaded, as in the example below:
-- Create table in publisher and subscriber
create table t1(c1 int);
create table t2(c1 int);

-- Create publications
create publication pub1 for table t1;
create publication pub2 for table t2;

-- Create subscription
create subscription test1 connection 'dbname=postgres host=localhost
port=5432' publication pub1, pub2;

-- Drop one publication
drop publication pub1;

-- Insert in the publisher
insert into t1 values(11);
insert into t2 values(21);

-- Select in subscriber
postgres=# select * from t1;
 c1
----
(0 rows)

postgres=# select * from t2;
 c1
----
 21
(1 row)

-- Create the dropped publication in publisher
create publication pub1 for table t1;

-- Insert in the publisher
insert into t1 values(12);
postgres=# select * from t1;
 c1
----
 11
 12
(2 rows)

-- Select data in subscriber
postgres=# select * from t1; -- record with value 11 will be missing in subscriber
 c1
----
 12
(1 row)

Thoughts?

[1] - 
https://www.postgresql.org/message-id/CAA4eK1%2BT-ETXeRM4DHWzGxBpKafLCp__5bPA_QZfFQp7-0wj4Q%40mail.gmail.com

Regards,
Vignesh
From 9b79dee26554ae4af9ff00948ab185482b071ba8 Mon Sep 17 00:00:00 2001
From: Vignesh C 
Date: Mon, 19 Feb 2024 10:20:02 +0530
Subject: [PATCH v1 1/2] Skip loading the publication if the publication does
 not exist.

Skip loading the publication if the publication does not exist.
---
 src/backend/replication/pgoutput/pgoutput.c | 28 +++--
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/src/backend/replication/pgoutput/pgoutput.c b/src/backend/replication/pgoutput/pgoutput.c
index 998f92d671..f7b6d0384d 100644
--- a/src/backend/replication/pgoutput/pgoutput.c
+++ b/src/backend/replication/pgoutput/pgoutput.c
@@ -82,7 +82,7 @@ static void pgoutput_stream_prepare_txn(LogicalDecodingContext *ctx,
 
 static bool publications_valid;
 
-static List *LoadPublications(List *pubnames);
+static List *LoadPublications(List *pubnames, bool *skipped);
 static void publication_invalidation_cb(Datum arg, int cacheid,
 		uint32 hashvalue);
 static void send_relation_and_attrs(Relation relation, TransactionId xid,
@@ -1703,9 +1703,13 @@ pgoutput_shutdown(LogicalDecodingContext *ctx)
 
 /*
  * Load publications from the list of publication names.
+ *
+ * Here, we just skip the publications that don't exist yet. 'skipped'
+ * will be true if we find any publication from the given list that doesn't
+ * exist.
  */
 static List *
-LoadPublications(List *pubnames)
+LoadPublications(List *pubnames, bool *skipped)
 {
 	List	   *result = NIL;
 	ListCell   *lc;
@@ -1713,9 +1717,12 @@ LoadPublications(List *pubnames)
 	foreach(lc, pubnames)
 	{
 		char	   *pubname = (char *) lfirst(lc);
-		Publication *pub = GetPublicationByName(pubname, false);
+		Publication *pub = GetPublicationByName(pubname, true);
 
-		result = lappend(result, pub);
+		if (pub)
+			result = lappend(result, pub);
+		else
+			*skipped = true;
 	}
 
 	return result;
@@ -1994,7 +2001,7 @@ get_rel_sync_entry(PGOutputData *data, Relation relation)
 	}
 
 	/* Validate the entry */
-	if (!entry->replicate_valid)
+	if (!entry->replicate_valid || !publications_valid)
 	{
 		Oid			schemaId = get_rel_namespace(relid);
 		List	   *pubids = GetRelationPublications(relid);
@@ -2011,6 +2018,7 @@ get_rel_sync_entry(PGOutputData *data, Relation relation)
 		bool		am_partition = get_rel_relispartition(relid);
 		char		relkind = get_rel_relkind(relid);
 		List	   *rel_publications = NIL;
+		bool		skipped_pub = false;
 
 		/* Reload publications if needed before use. 

Re: A new message seems missing a punctuation

2024-02-18 Thread Amit Kapila
On Mon, Feb 19, 2024 at 12:14 PM Amit Kapila  wrote:
>
> On Mon, Feb 19, 2024 at 11:42 AM Robert Haas  wrote:
> >
> > On Mon, Feb 19, 2024 at 11:04 AM Kyotaro Horiguchi
> >  wrote:
> > > Or I thought the values could be moved to DETAILS: line.
> >
> > Yeah, I think that's likely to be the right approach here. The details
> > aren't too clear to me.
> >
> > Does the primary error message really need to say "could not sync
> > slot"? If it will be obvious from context that we were trying to sync
> > a slot, then it would be fine to just say "ERROR: remote slot precedes
> > local slot".
> >
>
> As this is a LOG message, I feel one may need some more information on
> the context but it is not mandatory.
>
> > But I also don't quite understand what problem this is trying to
> > report. Is this slot-syncing code running on the primary or the
> > standby? If it's running on the primary, then surely it's expected
> > that the remote slot will precede the local one. And if it's running
> > on the standby, then the comments in
> > update_and_persist_local_synced_slot about waiting for the remote side
> > to catch up seem quite confusing, because surely we're chasing the
> > primary and not the other way around?
> >
>
> The local slot's restart_lsn could be ahead of the primary's for the very
> first sync, when the WAL corresponding to the remote's restart_lsn is
> not available on the standby (say, due to different WAL-related settings,
> the required WAL had already been removed when we first tried to sync the
> slot). For more details, you can refer to the comments atop slotsync.c
> starting from "If the WAL corresponding to the remote's restart_lsn
> ..."
>

Sorry, I gave the wrong reference, the comments I was referring to
start with: "If on physical standby, the WAL corresponding ...".
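
For anyone comparing these positions by hand, the values in question can be
inspected on each side with a standard catalog query (shown only for
illustration; 'sync_slot_name' stands for the slot being synchronized):

SELECT slot_name, restart_lsn, catalog_xmin
FROM pg_replication_slots
WHERE slot_name = 'sync_slot_name';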

-- 
With Regards,
Amit Kapila.




Re: pg_upgrade and logical replication

2024-02-18 Thread Amit Kapila
On Mon, Feb 19, 2024 at 6:54 AM Hayato Kuroda (Fujitsu)
 wrote:
>
> Thanks for reviewing! PSA new version.
>

Pushed this after making minor changes in the comments.

-- 
With Regards,
Amit Kapila.




Re: Add last_commit_lsn to pg_stat_database

2024-02-18 Thread Michael Paquier
On Sun, Feb 18, 2024 at 02:28:06AM +0100, Tomas Vondra wrote:
> Thanks for the updated patch. I don't have a clear opinion on the
> feature and whether this is the way to implement it, but I have two
> simple questions.

Some users I know of would be really happy to be able to get this
information for each database in a single view, so that feels natural
to plug the information into pg_stat_database.

> 1) Do we really need to modify RecordTransactionCommitPrepared and
> XactLogCommitRecord to return the LSN of the commit record? Can't we
> just use XactLastRecEnd? It's good enough for advancing
> replorigin_session_origin_lsn, why wouldn't it be good enough for this?

Or XactLastCommitEnd?  The changes in AtEOXact_PgStat() are not really
attractive for what's a static variable in all the backends.

> 2) Earlier in this thread it was claimed the function returning the
> last_commit_lsn is STABLE, but I have to admit it's not clear to me why
> that's the case :-( All the pg_stat_get_db_* functions are marked as
> stable, so I guess it's thanks to the pgstat system ...

The consistency of the shared stats data depends on
stats_fetch_consistency.  The default of "cache" makes sure that the
values don't change across a scan, until the end of a transaction.

I have not paid much attention to that until now, but note that it
would not be the case with "none", where the data is retrieved each time
it is requested.  So it seems to me that these functions should be
actually volatile, not stable, because they could deliver different
values across the same scan as an effect of the design behind
pgstat_fetch_entry() and a non-default stats_fetch_consistency.  I may
be missing something, of course.
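
A sketch of the scenario (using the existing pg_stat_get_db_xact_commit()
only as an example of a pg_stat_get_db_* function; with the non-default
setting, two reads in the same transaction may disagree):

SET stats_fetch_consistency = 'none';
BEGIN;
SELECT pg_stat_get_db_xact_commit(oid)
  FROM pg_database WHERE datname = current_database();
-- concurrent commits happen in other sessions
SELECT pg_stat_get_db_xact_commit(oid)
  FROM pg_database WHERE datname = current_database();  -- may now differ
COMMIT;
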
--
Michael


signature.asc
Description: PGP signature


Re: A new message seems missing a punctuation

2024-02-18 Thread Amit Kapila
On Mon, Feb 19, 2024 at 11:42 AM Robert Haas  wrote:
>
> On Mon, Feb 19, 2024 at 11:04 AM Kyotaro Horiguchi
>  wrote:
> > Or I thought the values could be moved to DETAILS: line.
>
> Yeah, I think that's likely to be the right approach here. The details
> aren't too clear to me.
>
> Does the primary error message really need to say "could not sync
> slot"? If it will be obvious from context that we were trying to sync
> a slot, then it would be fine to just say "ERROR: remote slot precedes
> local slot".
>

As this is a LOG message, I feel one may need some more information on
the context but it is not mandatory.

> But I also don't quite understand what problem this is trying to
> report. Is this slot-syncing code running on the primary or the
> standby? If it's running on the primary, then surely it's expected
> that the remote slot will precede the local one. And if it's running
> on the standby, then the comments in
> update_and_persist_local_synced_slot about waiting for the remote side
> to catch up seem quite confusing, because surely we're chasing the
> primary and not the other way around?
>

The local slot's restart_lsn could be ahead of the primary's for the very
first sync, when the WAL corresponding to the remote's restart_lsn is
not available on the standby (say, due to different WAL-related settings,
the required WAL had already been removed when we first tried to sync the
slot). For more details, you can refer to the comments atop slotsync.c
starting from "If the WAL corresponding to the remote's restart_lsn
..."

> But if we ignore all of that, then we could just do this:
>
> ERROR: could not sync slot information as remote slot precedes local slot
> DETAIL: Remote slot has LSN %X/%X and catalog xmin %u, but local slot
> has LSN %X/%X and catalog xmin %u.
>

This looks good to me but instead of ERROR here we want to use LOG.

-- 
With Regards,
Amit Kapila.




Re: Add system identifier to backup manifest

2024-02-18 Thread Amul Sul
On Mon, Feb 19, 2024 at 4:22 AM Michael Paquier  wrote:

> On Thu, Feb 15, 2024 at 05:41:46PM +0530, Robert Haas wrote:
> > On Thu, Feb 15, 2024 at 3:05 PM Amul Sul  wrote:
> > > Kindly have a look at the attached version.
> >
> > IMHO, 0001 looks fine, except probably the comment could be phrased a
> > bit more nicely.
>
> And the new option should be documented at the top of the init()
> routine for perldoc.
>

Added in the attached version.


> > That can be left for whoever commits this to
> > wordsmith. Michael, what are your plans?
>
> Not much, so feel free to not wait for me.  I've just read through the
> patch because I like the idea/feature :)
>

Thank you, that helped a lot.

> 0002 seems like a reasonable approach, but there's a hunk in the wrong
> > patch: 0004 modifies pg_combinebackup's check_control_files to use
> > get_dir_controlfile() rather than git_controlfile(), but it looks to
> > me like that should be part of 0002.
>

Fixed in the attached version.


> I'm slightly concerned about 0002 that silently changes the meaning of
> get_controlfile().  That would impact extension code without people
> knowing about it when compiling, just when they run their stuff under
> 17~.
>

Agreed, now they will get an error like _could not read file "": Is a
directory_. But, IIUC, that is what usually happens with a dev version, and
the extension needs to be updated for compatibility with the newer version
for the same reason.

Regards,
Amul
From 354014538b16579f005bd6f4ce771b1aa22b5e02 Mon Sep 17 00:00:00 2001
From: Amul Sul 
Date: Thu, 15 Feb 2024 12:10:23 +0530
Subject: [PATCH v8 1/4] Add option to force initdb in
 PostgreSQL::Test::Cluster::init()

---
 src/bin/pg_combinebackup/t/005_integrity.pl | 19 +++
 src/test/perl/PostgreSQL/Test/Cluster.pm| 15 ++-
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/src/bin/pg_combinebackup/t/005_integrity.pl b/src/bin/pg_combinebackup/t/005_integrity.pl
index 3b445d0e30f..5d425209211 100644
--- a/src/bin/pg_combinebackup/t/005_integrity.pl
+++ b/src/bin/pg_combinebackup/t/005_integrity.pl
@@ -18,18 +18,13 @@ $node1->init(has_archiving => 1, allows_streaming => 1);
 $node1->append_conf('postgresql.conf', 'summarize_wal = on');
 $node1->start;
 
-# Set up another new database instance. We don't want to use the cached
-# INITDB_TEMPLATE for this, because we want it to be a separate cluster
-# with a different system ID.
-my $node2;
-{
-	local $ENV{'INITDB_TEMPLATE'} = undef;
-
-	$node2 = PostgreSQL::Test::Cluster->new('node2');
-	$node2->init(has_archiving => 1, allows_streaming => 1);
-	$node2->append_conf('postgresql.conf', 'summarize_wal = on');
-	$node2->start;
-}
+# Set up another new database instance with the force_initdb option. We don't
+# want to initialize the database system by copying the initdb template here,
+# because we want it to be a separate cluster with a different system ID.
+my $node2 = PostgreSQL::Test::Cluster->new('node2');
+$node2->init(force_initdb => 1, has_archiving => 1, allows_streaming => 1);
+$node2->append_conf('postgresql.conf', 'summarize_wal = on');
+$node2->start;
 
 # Take a full backup from node1.
 my $backup1path = $node1->backup_dir . '/backup1';
diff --git a/src/test/perl/PostgreSQL/Test/Cluster.pm b/src/test/perl/PostgreSQL/Test/Cluster.pm
index 07da74cf562..2b4f9a48365 100644
--- a/src/test/perl/PostgreSQL/Test/Cluster.pm
+++ b/src/test/perl/PostgreSQL/Test/Cluster.pm
@@ -495,6 +495,10 @@ a directory that's only accessible to the current user to ensure that.
 On Windows, we use SSPI authentication to ensure the same (by pg_regress
 --config-auth).
 
+force_initdb => 1 will force the cluster to be initialized by initdb. Otherwise,
+if available and if there aren't any parameters, a previously initdb'd cluster
+is copied and used as a template.
+
 WAL archiving can be enabled on this node by passing the keyword parameter
 has_archiving => 1. This is disabled by default.
 
@@ -517,6 +521,7 @@ sub init
 
 	local %ENV = $self->_get_env();
 
+	$params{force_initdb} = 0 unless defined $params{force_initdb};
 	$params{allows_streaming} = 0 unless defined $params{allows_streaming};
 	$params{has_archiving} = 0 unless defined $params{has_archiving};
 
@@ -529,14 +534,14 @@ sub init
 	mkdir $self->backup_dir;
 	mkdir $self->archive_dir;
 
-	# If available and if there aren't any parameters, use a previously
-	# initdb'd cluster as a template by copying it. For a lot of tests, that's
-	# substantially cheaper. Do so only if there aren't parameters, it doesn't
-	# seem worth figuring out whether they affect compatibility.
+	# For a lot of tests, it's substantially cheaper to copy a previously
+	# initdb'd cluster as a template. Do so only if force_initdb => 0 and there
+	# aren't any parameters; it doesn't seem worth figuring out whether they
+	# affect compatibility.
 	#
 	# There's very similar code in pg_regress.c, but we can't easily
 	# deduplicate it until we require perl at build time.
-	

Re: Fix race condition in InvalidatePossiblyObsoleteSlot()

2024-02-18 Thread Michael Paquier
On Thu, Feb 15, 2024 at 11:24:59AM +, Bertrand Drouvot wrote:
> Thanks for looking at it!
> 
> Agree, using an assertion instead in v3 attached.

And you did not forget the PG_USED_FOR_ASSERTS_ONLY.

> > It seems important to document why we are saving this data here; while
> > we hold the LWLock for repslots, before we perform any termination,
> > and we want to prevent conflict reports from using incorrect data if
> > the slot data got changed while the LWLock is temporarily released
> > during a termination.
> 
> Yeah, comments added in v3.

The contents look rather OK, I may do some word-smithing for both.

>> For example just after the kill()?  It seems to me
>> that we should stall the checkpointer, perform a forced upgrade of the
>> xmins, release the checkpointer and see the effects of the change in
>> the second loop.  Now, modules/injection_points/ does not allow that,
>> yet, but I've posted a patch along these lines that can stop a process
>> and release it using a condition variable (should be 0006 on [1]).  I
>> was planning to start a new thread with a patch posted for the next CF
>> to add this kind of facility with a regression test for an old bug,
>> the patch needs a few hours of love, first.  I should be able to post
>> that next week.
> 
> Great, that looks like a good idea!

I've been finally able to spend some time on what I had in mind and
posted it here for the next CF:
https://www.postgresql.org/message-id/zdluxbk5hgpol...@paquier.xyz

You should be able to reuse that the same way I do in 0002 posted on
the thread, where I'm causing the checkpointer to wait, then wake it
up.

> Are we going to fix this on master and 16 stable first and then later on add a
> test on master with the injection points?

Still, the other patch is likely going to take a couple of weeks
before getting into the tree, so I have no objection to fix the bug
first and backpatch, then introduce a test.  Things have proved that
failures could show up in the buildfarm in this area, so more time
running this code across two branches is not a bad concept, either.
--
Michael


signature.asc
Description: PGP signature


Re: A new message seems missing a punctuation

2024-02-18 Thread Robert Haas
On Mon, Feb 19, 2024 at 11:04 AM Kyotaro Horiguchi
 wrote:
> Or I thought the values could be moved to DETAILS: line.

Yeah, I think that's likely to be the right approach here. The details
aren't too clear to me.

Does the primary error message really need to say "could not sync
slot"? If it will be obvious from context that we were trying to sync
a slot, then it would be fine to just say "ERROR: remote slot precedes
local slot".

But I also don't quite understand what problem this is trying to
report. Is this slot-syncing code running on the primary or the
standby? If it's running on the primary, then surely it's expected
that the remote slot will precede the local one. And if it's running
on the standby, then the comments in
update_and_persist_local_synced_slot about waiting for the remote side
to catch up seem quite confusing, because surely we're chasing the
primary and not the other way around?

But if we ignore all of that, then we could just do this:

ERROR: could not sync slot information as remote slot precedes local slot
DETAIL: Remote slot has LSN %X/%X and catalog xmin %u, but local slot
has LSN %X/%X and catalog xmin %u.

which would fix the original complaint, and my point about using
English rather than just printing a bunch of values.

-- 
Robert Haas
EDB: http://www.enterprisedb.com




Injection points: some tools to wait and wake

2024-02-18 Thread Michael Paquier
Hi all,
(Ashutosh in CC as he was involved in the discussion last time.)

I have proposed on the original thread related to injection points to
add more facilities to be able to wait at an arbitrary point and wake
the waiting process at will, making it possible to control the order of
actions taken in a test:
https://www.postgresql.org/message-id/ZTiV8tn_MIb_H2rE%40paquier.xyz

I didn't do that in the other thread for lack of time, but here is a patch
set to complete what I wanted, using a condition variable to wait and
wake processes:
- State is in shared memory, using a DSM tracked by the registry and
an integer counter.
- Callback to wait on a condition variable.
- SQL function to update the shared state and broadcast the update to
the condition variable.
- Use a custom wait event to track the wait in pg_stat_activity.

0001 requires no backend changes, only additions to the test module
injection_points, so it could be backpatched assuming that the
backend is able to support injection points.  This could be expanded
into using more variables and/or states, but I don't really see a
point in introducing more without a reason to do so, and I have no
need for more at the moment.
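
As a usage sketch (the injection point name 'restart-point-before-promote'
is only illustrative; 'wait' is the new callback action added by 0001):

-- make the named point sleep on the condition variable when reached
SELECT injection_points_attach('restart-point-before-promote', 'wait');
-- ... run the code path; the process now waits at the point ...
SELECT injection_points_wake();   -- bump wait_counts and broadcast
SELECT injection_points_detach('restart-point-before-promote');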

0002 is a polished version of the TAP test that makes use of this
facility, providing coverage for the bug fixed by 7863ee4def65
(reverting this commit causes the test to fail), where a restart point 
runs across a promotion request.  The trick is to stop the
checkpointer in the middle of a restart point and issue a promotion
in-between.

Thoughts and comments are welcome.
--
Michael
From ef27d6eba619f5106915de6b05cbeb5294263c30 Mon Sep 17 00:00:00 2001
From: Michael Paquier 
Date: Mon, 19 Feb 2024 14:35:55 +0900
Subject: [PATCH 1/2] injection_points: Add routines to wait and wake processes

This commit is made of two parts:
- A new callback that can be attached to a process to make it wait on a
condition variable.  The condition checked is registered in shared
memory by the module injection_points.
- A new SQL function to update the shared state and broadcast the update
using a condition variable.

The shared state used by the module is registered using the DSM
registry, and is optional.
---
 .../injection_points--1.0.sql |  10 ++
 .../injection_points/injection_points.c   | 104 ++
 src/tools/pgindent/typedefs.list  |   1 +
 3 files changed, 115 insertions(+)

diff --git a/src/test/modules/injection_points/injection_points--1.0.sql b/src/test/modules/injection_points/injection_points--1.0.sql
index 5944c41716..20479991f2 100644
--- a/src/test/modules/injection_points/injection_points--1.0.sql
+++ b/src/test/modules/injection_points/injection_points--1.0.sql
@@ -24,6 +24,16 @@ RETURNS void
 AS 'MODULE_PATHNAME', 'injection_points_run'
 LANGUAGE C STRICT PARALLEL UNSAFE;
 
+--
+-- injection_points_wake()
+--
+-- Wakes a condition variable waited on in an injection point.
+--
+CREATE FUNCTION injection_points_wake()
+RETURNS void
+AS 'MODULE_PATHNAME', 'injection_points_wake'
+LANGUAGE C STRICT PARALLEL UNSAFE;
+
 --
 -- injection_points_detach()
 --
diff --git a/src/test/modules/injection_points/injection_points.c b/src/test/modules/injection_points/injection_points.c
index e843e6594f..3a319b1525 100644
--- a/src/test/modules/injection_points/injection_points.c
+++ b/src/test/modules/injection_points/injection_points.c
@@ -18,18 +18,67 @@
 #include "postgres.h"
 
 #include "fmgr.h"
+#include "storage/condition_variable.h"
 #include "storage/lwlock.h"
 #include "storage/shmem.h"
+#include "storage/dsm_registry.h"
 #include "utils/builtins.h"
 #include "utils/injection_point.h"
 #include "utils/wait_event.h"
 
 PG_MODULE_MAGIC;
 
+/* Shared state information for injection points. */
+typedef struct InjectionPointSharedState
+{
+	/* protects accesses to wait_counts */
+	slock_t		lock;
+
+	/* Counter advancing when injection_points_wake() is called */
+	int			wait_counts;
+
+	/*
+	 * Condition variable that can be used in an injection point, checking
+	 * upon wait_counts when waiting.
+	 */
+	ConditionVariable wait_point;
+} InjectionPointSharedState;
+
+/* Pointer to shared-memory state. */
+static InjectionPointSharedState *inj_state = NULL;
+
+/* Wait event when waiting on condition variable */
+static uint32 injection_wait_event = 0;
+
 extern PGDLLEXPORT void injection_error(const char *name);
 extern PGDLLEXPORT void injection_notice(const char *name);
+extern PGDLLEXPORT void injection_wait(const char *name);
 
 
+static void
+injection_point_init_state(void *ptr)
+{
+	InjectionPointSharedState *state = (InjectionPointSharedState *) ptr;
+
+	state->wait_counts = 0;
+	SpinLockInit(&state->lock);
+	ConditionVariableInit(&state->wait_point);
+}
+
+static void
+injection_init_shmem(void)
+{
+	bool		found;
+
+	if (inj_state != NULL)
+		return;
+
+	inj_state = GetNamedDSMSegment("injection_points",
+								   sizeof(InjectionPointSharedState),
+								   injection_point_init_state,
+								   &found);
+}
+
 

Re: PGC_SIGHUP shared_buffers?

2024-02-18 Thread Robert Haas
On Mon, Feb 19, 2024 at 2:05 AM Andres Freund  wrote:
> We probably should address that independently of making shared_buffers
> PGC_SIGHUP. The queue gets absurdly large once s_b hits a few GB. It's not
> that much memory compared to the buffer blocks themselves, but a sync queue of
> many millions of entries just doesn't make sense. And a few hundred MB for
> that isn't nothing either, even if it's just a fraction of the space for the
> buffers. It makes checkpointer more susceptible to OOM as well, because
> AbsorbSyncRequests() allocates an array to copy all requests into local
> memory.

Sure, that could just be capped, if it makes sense. Although given the
thrust of this discussion, it might be even better to couple it to
something other than the size of shared_buffers.

> I'd say the vast majority of postgres instances in production run with less
> than 1GB of s_b. Just because numbers wise the majority of instances are
> running on small VMs and/or many PG instances are running on one larger
> machine.  There are a lot of instances where the total available memory is
> less than 2GB.

Whoa. That is not my experience at all. If I've ever seen such a small
system since working at EDB (since 2010!) it was just one where the
initdb-time default was never changed.

I can't help wondering if we should have some kind of memory_model
GUC, measured in T-shirt sizes or something. We've coupled a bunch of
things to shared_buffers mostly as a way of distinguishing small
systems from large ones. But if we want to make shared_buffers
dynamically changeable and we don't want to make all that other stuff
dynamically changeable, decoupling those calculations might be an
important thing to do.

On a really small system, do we even need the ability to dynamically
change shared_buffers at all? If we do, then I suspect the granule
needs to be small. But does someone want to take a system with <1GB of
shared_buffers and then scale it way, way up? I suppose it would be
nice to have the option. But you might have to make some choices, like
pick either a 16MB granule or a 128MB granule or a 1GB granule at
startup time and then stick with it? I don't know, I'm just
spitballing here, because I don't know what the real design is going
to look like yet.

> > Don't you have to still move buffers entirely out of the region you
> > want to unmap?
>
> Sure. But you can unmap at the granularity of a hardware page (there is some
> fragmentation cost on the OS / hardware page table level
> though). Theoretically you could unmap individual 8kB pages.

I thought there were problems, at least on some operating systems, if
the address space mappings became too fragmented. At least, I wouldn't
expect that you could use huge pages for shared_buffers and still
unmap little tiny bits. How would that even work?

-- 
Robert Haas
EDB: http://www.enterprisedb.com




Re: About a recently-added message

2024-02-18 Thread shveta malik
On Mon, Feb 19, 2024 at 11:10 AM Amit Kapila  wrote:
>
> On Thu, Feb 15, 2024 at 11:49 AM Kyotaro Horiguchi
>  wrote:
> >
> > At Thu, 15 Feb 2024 09:22:23 +0530, shveta malik  
> > wrote in
> > >
> > > +1 on changing the msg(s) suggested way. Please find the patch for the
> > > same. It also removes double quotes around the variable names
> >
> > Thanks for the discussion.
> >
> > With a translator hat on, I would be happy if I could determine
> > whether a word requires translation with minimal background
> > information. In this case, a translator needs to know which values
> > wal_level can take. It's relatively easy in this case, but I'm not
> > sure if this is always the case. Therefore, I would be slightly
> > happier if "logical" were double-quoted.
> >
>
> I see that we use "logical" in double quotes in various error
> messages. For example: "wal_level must be set to \"replica\" or
> \"logical\" at server start". So following that we can use the double
> quotes here as well.

Okay. Now that we will have double quotes for "logical", do you
prefer the existing way of giving the error msg or the changed one?

Existing:
errmsg("bad configuration for slot synchronization"),
errhint("wal_level must be >= logical."));

errmsg("bad configuration for slot synchronization"),
errhint("%s must be defined.", "primary_conninfo"));

The changed one:
errmsg("slot synchronization requires wal_level >= logical"));

errmsg("slot synchronization requires %s to be defined",
  "primary_conninfo"));

thanks
Shveta




Re: Do away with zero-padding assumption before WALRead()

2024-02-18 Thread Kyotaro Horiguchi
At Mon, 19 Feb 2024 11:02:39 +0530, Bharath Rupireddy 
 wrote in 
> > After all, the patch looks good to me.
> 
> Thanks. It was committed -
> https://git.postgresql.org/gitweb/?p=postgresql.git;a=commitdiff;h=73f0a1326608ac3a7d390706fdeec59fe4dc42c0.

Yeah. I realized that after I had already sent the mail. No harm done :p

regards.

-- 
Kyotaro Horiguchi
NTT Open Source Software Center




Re: About a recently-added message

2024-02-18 Thread Amit Kapila
On Thu, Feb 15, 2024 at 11:49 AM Kyotaro Horiguchi
 wrote:
>
> At Thu, 15 Feb 2024 09:22:23 +0530, shveta malik  
> wrote in
> >
> > +1 on changing the msg(s) suggested way. Please find the patch for the
> > same. It also removes double quotes around the variable names
>
> Thanks for the discussion.
>
> With a translator hat on, I would be happy if I could determine
> whether a word requires translation with minimal background
> information. In this case, a translator needs to know which values
> wal_level can take. It's relatively easy in this case, but I'm not
> sure if this is always the case. Therefore, I would be slightly
> happier if "logical" were double-quoted.
>

I see that we use "logical" in double quotes in various error
messages. For example: "wal_level must be set to \"replica\" or
\"logical\" at server start". So following that we can use the double
quotes here as well.

-- 
With Regards,
Amit Kapila.




Re: A new message seems missing a punctuation

2024-02-18 Thread Kyotaro Horiguchi
At Mon, 19 Feb 2024 10:31:33 +0530, Robert Haas  wrote 
in 
> On Mon, Feb 19, 2024 at 10:10 AM Kyotaro Horiguchi
>  wrote:
> > A recent commit (7a424ece48) added the following message:
> >
> > > could not sync slot information as remote slot precedes local slot:
> > > remote slot "%s": LSN (%X/%X), catalog xmin (%u) local slot: LSN
> > > (%X/%X), catalog xmin (%u)
> >
> > Since it is a bit overloaded but doesn't have a separator between
> > "catalog xmin (%u)" and "local slot:", it is somewhat cluttered. Don't
> > we need something, for example a semicolon there to improve
> > readability and reduce clutter?
> 
> I think maybe we should even revise the message even more. In most
> places we do not just print out a whole bunch of values, but use a
> sentence to connect them.

Mmm. Something like this?:

"could not sync slot information: local slot LSN (%X/%X) or xmin(%u)
 behind remote (%X/%X, %u)"

Or I thought the values could be moved to DETAILS: line.

By the way, the code around the message is as follows.

> /*
>  * The remote slot didn't catch up to locally reserved position.
>  *
>  * We do not drop the slot because the restart_lsn can be ahead of the
>  * current location when recreating the slot in the next cycle. It may
>  * take more time to create such a slot. Therefore, we keep this slot
>  * and attempt the synchronization in the next cycle.
>  *
>  * XXX should this be changed to elog(DEBUG1) perhaps?
>  */
> ereport(LOG,
>   errmsg("could not sync slot information as remote slot precedes 
> local slot:"
>  " remote slot \"%s\": LSN (%X/%X), 
> catalog xmin (%u) local slot: LSN (%X/%X), catalog xmin (%u)",

If we change this message to DEBUG1, a timeout feature to show a
WARNING message might be needed for DBAs to notice that something bad
is ongoing. However, it doesn't seem appropriate as a LOG message to
me. Perhaps, a LOG message should be more like:

> "LOG:  waiting for local slot to catch up to remote"
> "DETAIL:  remote slot LSN (%X/%X); "

regards.

-- 
Kyotaro Horiguchi
NTT Open Source Software Center


Re: Do away with zero-padding assumption before WALRead()

2024-02-18 Thread Bharath Rupireddy
On Mon, Feb 19, 2024 at 8:26 AM Kyotaro Horiguchi
 wrote:
>
> On
> the flip side, SimpleXLogPageRead always reads a whole page and
> returns XLOG_BLCKSZ. However, as you know, the returned buffer doesn't
> contain random garbage bytes.

Is this assumption true when wal_init_zero is off? I think when
wal_init_zero is off, the last few bytes of the last page from the WAL
file may contain garbage bytes i.e. not zero bytes, no?

> Therefore, it's safe as long as the
> caller doesn't access beyond the returned count. As a result, the
> description you pointed out seems to be enough.

Right.

> After all, the patch looks good to me.

Thanks. It was committed -
https://git.postgresql.org/gitweb/?p=postgresql.git;a=commitdiff;h=73f0a1326608ac3a7d390706fdeec59fe4dc42c0.

--
Bharath Rupireddy
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com




Re: A new message seems missing a punctuation

2024-02-18 Thread shveta malik
On Mon, Feb 19, 2024 at 10:31 AM Robert Haas  wrote:
>
> On Mon, Feb 19, 2024 at 10:10 AM Kyotaro Horiguchi
>  wrote:
> > A recent commit (7a424ece48) added the following message:
> >
> > > could not sync slot information as remote slot precedes local slot:
> > > remote slot "%s": LSN (%X/%X), catalog xmin (%u) local slot: LSN
> > > (%X/%X), catalog xmin (%u)
> >
> > Since it is a bit overloaded but doesn't have a separator between
> > "catalog xmin (%u)" and "local slot:", it is somewhat cluttered. Don't
> > we need something, for example a semicolon there to improve
> > readability and reduce clutter?
>
> I think maybe we should even revise the message even more. In most
> places we do not just print out a whole bunch of values, but use a
> sentence to connect them.

I have tried to reword the msg, please have a look.

thanks
Shveta


v1-0001-Reword-LOG-msg-for-slot-sync.patch
Description: Binary data


Re: Removing unneeded self joins

2024-02-18 Thread Andrei Lepikhov

On 18/2/2024 23:18, Alexander Korotkov wrote:

On Sun, Feb 18, 2024 at 5:04 PM Alexander Korotkov  wrote:

On Sun, Feb 18, 2024 at 3:00 PM Alexander Lakhin  wrote:

09.01.2024 01:09, Alexander Korotkov wrote:

Fixed in 30b4955a46.


Please look at the following query which fails with an error since
d3d55ce57:

create table t (i int primary key);

select t3.i from t t1
  join t t2 on t1.i = t2.i,
  lateral (select t1.i limit 1) t3;

ERROR:  non-LATERAL parameter required by subquery


Thank you for spotting.  I'm looking at this.


Attached is a draft patch fixing this query.  Could you, please, recheck?
I reviewed this patch. Why do you check only the target list? I guess 
these links can be everywhere. See the patch in the attachment with the 
elaborated test and slightly changed code.


--
regards,
Andrei Lepikhov
Postgres Professional
From 7f94a3c96fd410522b87e570240cdb96b300dd31 Mon Sep 17 00:00:00 2001
From: "Andrey V. Lepikhov" 
Date: Mon, 19 Feb 2024 12:17:55 +0700
Subject: [PATCH] Replace relids in lateral subquery target list during SJE

---
 src/backend/optimizer/plan/analyzejoins.c | 29 ++-
 src/test/regress/expected/join.out| 44 +++
 src/test/regress/sql/join.sql | 12 +++
 3 files changed, 84 insertions(+), 1 deletion(-)

diff --git a/src/backend/optimizer/plan/analyzejoins.c b/src/backend/optimizer/plan/analyzejoins.c
index e494acd51a..072298f66c 100644
--- a/src/backend/optimizer/plan/analyzejoins.c
+++ b/src/backend/optimizer/plan/analyzejoins.c
@@ -395,7 +395,34 @@ remove_rel_from_query(PlannerInfo *root, RelOptInfo *rel,
}
 
/* Update lateral references. */
-   replace_varno((Node *) otherrel->lateral_vars, relid, subst);
+   if (root->hasLateralRTEs)
+   {
+   RangeTblEntry *rte = root->simple_rte_array[rti];
+   ReplaceVarnoContext ctx = {.from = relid, .to = subst};
+
+   if (rte->lateral)
+   {
+   replace_varno((Node *) otherrel->lateral_vars, relid, subst);
+
+   /*
+* Although we pass root->parse through the cleanup procedure,
+* parse->rtable and rte contain refs to different copies of
+* the subquery.
+*/
+   if (otherrel->rtekind == RTE_SUBQUERY)
+   query_tree_walker(rte->subquery, replace_varno_walker, &ctx,
+ QTW_EXAMINE_SORTGROUP);
+#ifdef USE_ASSERT_CHECKING
+   /* Just check possibly hidden non-replaced relids */
+   Assert(!bms_is_member(relid, pull_varnos(root, (Node *) rte->tablesample)));
+   Assert(!bms_is_member(relid, pull_varnos(root, (Node *) rte->functions)));
+   Assert(!bms_is_member(relid, pull_varnos(root, (Node *) rte->tablefunc)));
+   Assert(!bms_is_member(relid, pull_varnos(root, (Node *) rte->values_lists)));
+#endif
+   }
+   }
+
+
}
 
/*
diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out
index 0c2cba8921..d560a4a6b9 100644
--- a/src/test/regress/expected/join.out
+++ b/src/test/regress/expected/join.out
@@ -6349,6 +6349,50 @@ on true;
->  Seq Scan on int8_tbl y
 (7 rows)
 
+-- Test processing target lists in lateral subqueries
+explain (verbose, costs off)
+SELECT t3.a FROM sj t1, sj t2,
+LATERAL (SELECT t1.a WHERE t1.a <> 1
+GROUP BY (t1.a) HAVING t1.a > 0 ORDER BY t1.a LIMIT 1) t3,
+LATERAL (SELECT t1.a,t3.a WHERE t1.a <> t3.a+t2.a
+GROUP BY (t3.a) HAVING t1.a > t3.a*t3.a+t2.a/t1.a LIMIT 2) t4,
+LATERAL (SELECT * FROM sj TABLESAMPLE bernoulli(t1.a/t2.a)
+REPEATABLE (t1.a+t2.a)) t5,
+LATERAL generate_series(1, t1.a + t2.a) AS t6
+WHERE t1.a = t2.a;
+                          QUERY PLAN
+--------------------------------------------------------------
+ Nested Loop
+   Output: (t2.a)
+   ->  Nested Loop
+ Output: t2.a, (t2.a)
+ ->  Nested Loop
+   Output: t2.a, (t2.a)
+   ->  Nested Loop
+ Output: t2.a, (t2.a)
+ ->  Seq Scan on public.sj t2
+   Output: t2.a, t2.b, t2.c
+   Filter: (t2.a IS NOT NULL)
+ ->  Limit
+   Output: (t2.a)
+   ->  Group
+ Output: t2.a
+   

Re: A new message seems missing a punctuation

2024-02-18 Thread Robert Haas
On Mon, Feb 19, 2024 at 10:10 AM Kyotaro Horiguchi
 wrote:
> A recent commit (7a424ece48) added the following message:
>
> > could not sync slot information as remote slot precedes local slot:
> > remote slot "%s": LSN (%X/%X), catalog xmin (%u) local slot: LSN
> > (%X/%X), catalog xmin (%u)
>
> Since it is a bit overloaded but doesn't have a separator between
> "catalog xmin (%u)" and "local slot:", it is somewhat cluttered. Don't
> we need something, for example a semicolon there to improve
> readability and reduce clutter?

I think maybe we should even revise the message even more. In most
places we do not just print out a whole bunch of values, but use a
sentence to connect them.

-- 
Robert Haas
EDB: http://www.enterprisedb.com




A new message seems missing a punctuation

2024-02-18 Thread Kyotaro Horiguchi
A recent commit (7a424ece48) added the following message:

> could not sync slot information as remote slot precedes local slot:
> remote slot "%s": LSN (%X/%X), catalog xmin (%u) local slot: LSN
> (%X/%X), catalog xmin (%u)

Since it is a bit overloaded but doesn't have a separator between
"catalog xmin (%u)" and "local slot:", it is somewhat cluttered. Don't
we need something, for example a semicolon there to improve
readability and reduce clutter?

regards.

-- 
Kyotaro Horiguchi
NTT Open Source Software Center




RE: Synchronizing slots from primary to standby

2024-02-18 Thread Zhijie Hou (Fujitsu)
On Monday, February 19, 2024 11:39 AM shveta malik  
wrote:
> 
> On Sun, Feb 18, 2024 at 7:40 PM Zhijie Hou (Fujitsu) 
> wrote:
> >
> > On Friday, February 16, 2024 6:41 PM shveta malik 
> wrote:
> > Thanks for the patch. Here are few comments:
> 
> Thanks for the comments.
> 
> >
> > 2.
> >
> > +static bool
> > +validate_remote_info(WalReceiverConn *wrconn, int elevel)
> > ...
> > +
> > +   return (!remote_in_recovery && primary_slot_valid);
> >
> > The primary_slot_valid could be uninitialized in this function.
> 
> return (!remote_in_recovery && primary_slot_valid);
> 
> Here if remote_in_recovery is true, it will not even read primary_slot_valid. 
> It
> will read primary_slot_valid only if remote_in_recovery is false and in such a
> case primary_slot_valid will always be initialized in the else block above, 
> let me
> know if you still feel we shall initialize this to some default?

I understand that it will not be used, but some compiler could report a WARNING
for the uninitialized variable. The cfbot[1] seems to complain about this as well.

[1] https://cirrus-ci.com/task/5416851522453504

Best Regards,
Hou zj


Re: pg_upgrade and logical replication

2024-02-18 Thread vignesh C
On Mon, 19 Feb 2024 at 06:54, Hayato Kuroda (Fujitsu)
 wrote:
>
> Dear Vignesh,
>
> Thanks for reviewing! PSA new version.
>
> >
> > Thanks for the updated patch, Few suggestions:
> > 1)  This can be moved to keep it similar to other tests:
> > +# Setup a disabled subscription. The upcoming test will check the
> > +# pg_createsubscriber won't work, so it is sufficient.
> > +$publisher->safe_psql('postgres', "CREATE PUBLICATION regress_pub1");
> > +$old_sub->safe_psql('postgres',
> > +   "CREATE SUBSCRIPTION regress_sub1 CONNECTION '$connstr'
> > PUBLICATION regress_pub1 WITH (enabled = false)"
> > +);
> > +
> > +$old_sub->stop;
> > +
> > +# --
> > +# Check that pg_upgrade fails when max_replication_slots configured in the 
> > new
> > +# cluster is less than the number of subscriptions in the old cluster.
> > +# --
> > +$new_sub->append_conf('postgresql.conf', "max_replication_slots = 0");
> > +
> > +# pg_upgrade will fail because the new cluster has insufficient
> > +# max_replication_slots.
> > +command_checks_all(
> > +   [
> > +   'pg_upgrade', '--no-sync', '-d', $old_sub->data_dir,
> > +   '-D', $new_sub->data_dir, '-b', $oldbindir,
> > +   '-B', $newbindir, '-s', $new_sub->host,
> > +   '-p', $old_sub->port, '-P', $new_sub->port,
> > +   $mode, '--check',
> > +   ],
> >
> > like below and the extra comment can be removed:
> > +# --
> > +# Check that pg_upgrade fails when max_replication_slots configured in the 
> > new
> > +# cluster is less than the number of subscriptions in the old cluster.
> > +# --
> > +# Create a disabled subscription.
> > +$publisher->safe_psql('postgres', "CREATE PUBLICATION regress_pub1");
> > +$old_sub->safe_psql('postgres',
> > +   "CREATE SUBSCRIPTION regress_sub1 CONNECTION '$connstr'
> > PUBLICATION regress_pub1 WITH (enabled = false)"
> > +);
> > +
> > +$old_sub->stop;
> > +
> > +$new_sub->append_conf('postgresql.conf', "max_replication_slots = 0");
> > +
> > +# pg_upgrade will fail because the new cluster has insufficient
> > +# max_replication_slots.
> > +command_checks_all(
> > +   [
> > +   'pg_upgrade', '--no-sync', '-d', $old_sub->data_dir,
> > +   '-D', $new_sub->data_dir, '-b', $oldbindir,
> > +   '-B', $newbindir, '-s', $new_sub->host,
> > +   '-p', $old_sub->port, '-P', $new_sub->port,
> > +   $mode, '--check',
> > +   ],
>
> Partially fixed. I moved the creation part to below but comments were kept.
>
> > 2) This comment can be slightly changed:
> > +# Change configuration as well not to start the initial sync automatically
> > +$new_sub->append_conf('postgresql.conf',
> > +   "max_logical_replication_workers = 0");
> >
> > to:
> > Change configuration so that initial table sync does not get
> > started automatically
>
> Fixed.
>
> > 3) The old comments were slightly better:
> > # Resume the initial sync and wait until all tables of subscription
> > # 'regress_sub5' are synchronized
> > $new_sub->append_conf('postgresql.conf',
> > "max_logical_replication_workers = 10");
> > $new_sub->restart;
> > $new_sub->safe_psql('postgres', "ALTER SUBSCRIPTION regress_sub5
> > ENABLE");
> > $new_sub->wait_for_subscription_sync($publisher, 'regress_sub5');
> >
> > Like:
> > # Enable the subscription
> > $new_sub->safe_psql('postgres', "ALTER SUBSCRIPTION regress_sub5
> > ENABLE");
> >
> > # Wait until all tables of subscription 'regress_sub5' are synchronized
> > $new_sub->wait_for_subscription_sync($publisher, 'regress_sub5');
>
> Per comments from Amit [1], I did not change.

Thanks for the updated patch, I don't have any more comments.

Regards,
Vignesh




Re: Emitting JSON to file using COPY TO

2024-02-18 Thread jian he
On Fri, Jan 19, 2024 at 4:10 PM Masahiko Sawada  wrote:
>
> if (opts_out->json_mode && is_from)
> ereport(ERROR, ...);
>
> if (!opts_out->json_mode && opts_out->force_array)
> ereport(ERROR, ...);
>
> Also these checks can be moved close to other checks at the end of
> ProcessCopyOptions().
>
> ---
> @@ -3395,6 +3395,10 @@ copy_opt_item:
> {
> $$ = makeDefElem("format", (Node *) makeString("csv"), 
> @1);
> }
> +   | JSON
> +   {
> +   $$ = makeDefElem("format", (Node *) makeString("json"), 
> @1);
> +   }
> | HEADER_P
> {
> $$ = makeDefElem("header", (Node *) makeBoolean(true), 
> @1);
> @@ -3427,6 +3431,10 @@ copy_opt_item:
> {
> $$ = makeDefElem("encoding", (Node *) makeString($2), @1);
> }
> +   | FORCE ARRAY
> +   {
> +   $$ = makeDefElem("force_array", (Node *)
> makeBoolean(true), @1);
> +   }
> ;
>
> I believe we don't need to support new options in old-style syntax.
>
you are right about the force_array case.
we don't need to add force_array related changes in gram.y.


On Wed, Jan 31, 2024 at 9:26 PM Alvaro Herrera  wrote:
>
> On 2024-Jan-23, jian he wrote:
>
> > > +   | FORMAT_LA copy_generic_opt_arg
> > > +   {
> > > +   $$ = makeDefElem("format", $2, @1);
> > > +   }
> > > ;
> > >
> > > I think it's not necessary. "format" option is already handled in
> > > copy_generic_opt_elem.
> >
> > test it, I found out this part is necessary.
> > because a query with WITH like `copy (select 1)  to stdout with
> > (format json, force_array false); ` will fail.
>
> Right, because "FORMAT JSON" is turned into FORMAT_LA JSON by parser.c
> (see base_yylex there).  I'm not really sure but I think it might be
> better to make it "| FORMAT_LA JSON" instead of invoking the whole
> copy_generic_opt_arg syntax.  Not because of performance, but just
> because it's much clearer what's going on.
>
I am not sure what alternative you are referring to.
I've rebased the patch and made some cosmetic changes.
Now I think it's pretty neat.
You can make your change based on it; then I may understand the
alternative you are referring to.
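
For reference, the usage the patch below enables looks like this (per the
v9 syntax; the output is one JSON object per row):

COPY (SELECT 1 AS id, 'foo' AS name) TO STDOUT WITH (FORMAT json);
-- {"id":1,"name":"foo"}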
From b3d3d6023f96aa7971a0663d8c0bd6de50e877a5 Mon Sep 17 00:00:00 2001
From: jian he 
Date: Mon, 19 Feb 2024 10:37:18 +0800
Subject: [PATCH v9 1/2] Add another COPY format: json

this format is only allowed in COPY TO operation.
discussion: https://postgr.es/m/CALvfUkBxTYy5uWPFVwpk_7ii2zgT07t3d-yR_cy4sfrrLU%3Dkcg%40mail.gmail.com
discussion: https://postgr.es/m/6a04628d-0d53-41d9-9e35-5a8dc302c34c@joeconway.com
---
 doc/src/sgml/ref/copy.sgml |   5 ++
 src/backend/commands/copy.c|  13 +++
 src/backend/commands/copyto.c  | 125 -
 src/backend/parser/gram.y  |   8 ++
 src/backend/utils/adt/json.c   |   5 +-
 src/include/commands/copy.h|   1 +
 src/include/utils/json.h   |   2 +
 src/test/regress/expected/copy.out |  54 +
 src/test/regress/sql/copy.sql  |  38 +
 9 files changed, 208 insertions(+), 43 deletions(-)

diff --git a/doc/src/sgml/ref/copy.sgml b/doc/src/sgml/ref/copy.sgml
index 55764fc1..ef9e4729 100644
--- a/doc/src/sgml/ref/copy.sgml
+++ b/doc/src/sgml/ref/copy.sgml
@@ -214,9 +214,14 @@ COPY { <replaceable class="parameter">table_name</replaceable> [ ( <replaceable
       <literal>text</literal>,
       <literal>csv</literal> (Comma Separated Values),
+      <literal>json</literal> (JavaScript Object Notation),
       or <literal>binary</literal>.
       The default is <literal>text</literal>.
      </para>
+     <para>
+      The <literal>json</literal> option is allowed only in
+      <command>COPY TO</command>.
+     </para>

 
diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c
index cc0786c6..5d5b733d 100644
--- a/src/backend/commands/copy.c
+++ b/src/backend/commands/copy.c
@@ -480,6 +480,8 @@ ProcessCopyOptions(ParseState *pstate,
  /* default format */ ;
 			else if (strcmp(fmt, "csv") == 0)
 opts_out->csv_mode = true;
+			else if (strcmp(fmt, "json") == 0)
+opts_out->json_mode = true;
 			else if (strcmp(fmt, "binary") == 0)
 opts_out->binary = true;
 			else
@@ -716,6 +718,11 @@ ProcessCopyOptions(ParseState *pstate,
 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
  errmsg("cannot specify HEADER in BINARY mode")));
 
+	if (opts_out->json_mode && opts_out->header_line)
+		ereport(ERROR,
+(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot specify HEADER in JSON mode")));
+
 	/* Check quote */
 	if (!opts_out->csv_mode && opts_out->quote != NULL)
 		ereport(ERROR,
@@ -793,6 +800,12 @@ ProcessCopyOptions(ParseState *pstate,
 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
  errmsg("COPY FREEZE cannot be used with COPY TO")));
 
+	/* Check json format  */
+	if (opts_out->json_mode && is_from)
+		ereport(ERROR,
+(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot use JSON mode in COPY FROM")));
+
 	if 

Re: Synchronizing slots from primary to standby

2024-02-18 Thread shveta malik
On Sun, Feb 18, 2024 at 7:40 PM Zhijie Hou (Fujitsu)
 wrote:
>
> On Friday, February 16, 2024 6:41 PM shveta malik  
> wrote:
> Thanks for the patch. Here are few comments:

Thanks for the comments.

>
> 2.
>
> +static bool
> +validate_remote_info(WalReceiverConn *wrconn, int elevel)
> ...
> +
> +   return (!remote_in_recovery && primary_slot_valid);
>
> The primary_slot_valid could be uninitialized in this function.

return (!remote_in_recovery && primary_slot_valid);

Here if remote_in_recovery is true, it will not even read
primary_slot_valid. It will read primary_slot_valid only if
remote_in_recovery is false and in such a case primary_slot_valid will
always be initialized in the else block above, let me know if you
still feel we shall initialize this to some default?

thanks
Shveta




Re: Speeding up COPY TO for uuids and arrays

2024-02-18 Thread Michael Paquier
On Sat, Feb 17, 2024 at 12:24:33PM -0800, Andres Freund wrote:
> I wonder if we should move the core part for converting to hex to numutils.c,
> we already have code for the inverse. There does seem to be further
> optimization potential in the conversion, and that seems better done somewhere
> central rather than one type's output function. OTOH, it might not be worth
> it, given the need to add the dashes.

I'd tend to live with the current location of the code, but I'm OK if
people feel differently on this one, so I'm OK with what Laurenz is
proposing. 
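
For reference, the conversion core being discussed is small; a generic
sketch of the lookup-table approach (not Laurenz's patch or the committed
code), with uuid_out's extra twist being the dashes it must interleave:

#include <stddef.h>
#include <stdint.h>

static const char hextbl[] = "0123456789abcdef";

/* Write 2 * len hex digits (plus a terminating NUL) for len input bytes. */
static void
bytes_to_hex(const uint8_t *src, size_t len, char *dst)
{
	for (size_t i = 0; i < len; i++)
	{
		*dst++ = hextbl[src[i] >> 4];
		*dst++ = hextbl[src[i] & 0x0f];
	}
	*dst = '\0';
}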

>> - Patch 0003 speeds up array_out a bit by avoiding some zero
>>   byte writes.  The measured speed gain is under 2%.
> 
> Makes sense.

+1.
--
Michael




Re: Returning non-terminated string in ECPG Informix-compatible function

2024-02-18 Thread Michael Paquier
On Thu, Feb 15, 2024 at 05:17:17PM +0700, Oleg Tselebrovskiy wrote:
> Thanks for review!

dt_common.c is quite amazing, the APIs that we have in it rely on
strcpy() but we have no idea of the length of the buffer string given
in input to store the result.  This would require breaking the
existing APIs or inventing new ones to be able to plug some safer
strlcpy() calls.  Not sure if it's really worth bothering.  For now,
I've applied the OOM checks on HEAD and the fix with the null
termination on all stable branches.
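
For illustration, the safer shape such an API would need (a sketch; the
point is precisely that the existing dt_common.c functions take no
destination length, so this would break their signatures):

#include <stdio.h>

/* Copy src into dst without overflowing, always NUL-terminating. */
static void
copy_result(char *dst, size_t dstlen, const char *src)
{
	snprintf(dst, dstlen, "%s", src);	/* portable stand-in for strlcpy() */
}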
--
Michael




Re: Do away with zero-padding assumption before WALRead()

2024-02-18 Thread Kyotaro Horiguchi
At Mon, 19 Feb 2024 11:56:22 +0900 (JST), Kyotaro Horiguchi 
 wrote in 
> Yeah, perhaps I was overly concerned. The removed comment made me
> think that someone could add code relying on the incorrect assumption
> that the remaining bytes beyond the returned count are cleared out. On
> the flip side, SimpleXLogPageRead always reads a whole page and
> returns XLOG_BLCKSZ. However, as you know, the returned buffer doesn't
> contain random garbage bytes. Therefore, it's safe as long as the

Forgot to mention that there is a case involving non-initialized
pages, but it doesn't affect the correctness of the description you
pointed out.

regards.

-- 
Kyotaro Horiguchi
NTT Open Source Software Center




Re: Do away with zero-padding assumption before WALRead()

2024-02-18 Thread Kyotaro Horiguchi
At Fri, 16 Feb 2024 19:50:00 +0530, Bharath Rupireddy 
 wrote in 
> On Fri, Feb 16, 2024 at 7:10 AM Kyotaro Horiguchi
>  wrote:
> > 1. It's useless to copy the whole page regardless of the 'count'. It's
> >   enough to copy only up to the 'count'. The patch looks good in this
> >   regard.
> 
> Yes, it's not needed to copy the whole page. Per my understanding
> about page transfers between disk and OS page cache - upon OS page
> cache miss, the whole page (of disk block size) gets fetched from disk
> even if we just read 'count' bytes (< disk block size).

Right, but with a possibly different block size. Anyway, that behavior
doesn't affect the correctness of this change. (It could affect
performance later if that were not the case, though.)

> > 2. Maybe we need a comment that states the page_read callback
> >   functions leave garbage bytes beyond the returned count, due to
> >   partial copying without clearing the unused portion.
> 
> Isn't the comment around page_read callback at
> https://git.postgresql.org/gitweb/?p=postgresql.git;a=blob;f=src/include/access/xlogreader.h;h=2e9e5f43eb2de1ca9ba81afe76d21357065c61aa;hb=d57b7cc3338e9d9aa1d7c5da1b25a17c5a72dcce#l78
> enough?
> 
> "The callback shall return the number of bytes read (never more than
> XLOG_BLCKSZ), or -1 on failure."

Yeah, perhaps I was overly concerned. The removed comment made me
think that someone could add code relying on the incorrect assumption
that the remaining bytes beyond the returned count are cleared out. On
the flip side, SimpleXLogPageRead always reads a whole page and
returns XLOG_BLCKSZ. However, as you know, the returned buffer doesn't
contain random garbage bytes. Therefore, it's safe as long as the
caller doesn't access beyond the returned count. As a result, the
description you pointed out seems to be enough.
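
In code form, the contract amounts to the following sketch (simplified
signature and invented names, not the actual xlogreader API):

#include <string.h>

typedef int (*page_read_fn) (void *state, char *readBuf, int reqLen);

static int
read_wal_bytes(page_read_fn page_read, void *state,
			   char *readBuf, char *dest, int reqLen)
{
	int			nread = page_read(state, readBuf, reqLen);

	if (nread < 0)
		return -1;

	/* Bytes past nread are undefined: copy only what was returned. */
	memcpy(dest, readBuf, nread);
	return nread;
}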

After all, the patch looks good to me.

regards.

-- 
Kyotaro Horiguchi
NTT Open Source Software Center


Re: Thoughts about NUM_BUFFER_PARTITIONS

2024-02-18 Thread Japin Li


On Mon, 19 Feb 2024 at 00:56, Tomas Vondra  
wrote:
> On 2/18/24 03:30, Li Japin wrote:
>>
>> It seems we need to change MAX_SIMUL_LWLOCKS if we enlarge
>> NUM_BUFFER_PARTITIONS, but I didn't find any comments describing the
>> relation between MAX_SIMUL_LWLOCKS and NUM_BUFFER_PARTITIONS. Am I
>> missing something?
>
> IMHO the relationship is pretty simple - MAX_SIMUL_LWLOCKS needs to be
> higher than NUM_BUFFER_PARTITIONS, so that the backend can acquire all
> the partition locks if needed.
>

Thanks for the explanation!  Got it.
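
Stated as code, the invariant amounts to something like this (the values
match current sources, but are shown here purely for illustration):

#define NUM_BUFFER_PARTITIONS	128		/* lwlock.h */
#define MAX_SIMUL_LWLOCKS		200		/* lwlock.c */

_Static_assert(MAX_SIMUL_LWLOCKS > NUM_BUFFER_PARTITIONS,
			   "a backend must be able to hold every partition lock at once");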

> There's other places that acquire a bunch of locks, and all of them need
> to be careful not to exceed MAX_SIMUL_LWLOCKS. For example gist has
> GIST_MAX_SPLIT_PAGES.
>
>
> regards




Re: [PoC] Improve dead tuple storage for lazy vacuum

2024-02-18 Thread Masahiko Sawada
On Fri, Feb 16, 2024 at 12:41 PM John Naylor  wrote:
>
> On Fri, Feb 16, 2024 at 10:05 AM Masahiko Sawada  
> wrote:
> > > v61-0007: Runtime-embeddable tids -- Optional for v17, but should
> > > reduce memory regressions, so should be considered. Up to 3 tids can
> > > be stored in the last level child pointer. It's not polished, but I'll
> > > only proceed with that if we think we need this. "flags" iis called
> > > that because it could hold tidbitmap.c booleans (recheck, lossy) in
> > > the future, in addition to reserving space for the pointer tag. Note:
> > > I hacked the tests to only have 2 offsets per block to demo, but of
> > > course both paths should be tested.
> >
> > Interesting. I've run the same benchmark tests we did[1][2] (the
> > median of 3 runs):
> >
> > monotonically ordered int column index:
> >
> > master: system usage: CPU: user: 14.91 s, system: 0.80 s, elapsed: 15.73 s
> > v-59: system usage: CPU: user: 9.67 s, system: 0.81 s, elapsed: 10.50 s
> > v-62: system usage: CPU: user: 1.94 s, system: 0.69 s, elapsed: 2.64 s
>
> Hmm, that's strange -- this test is intended to delete all records
> from the last 20% of the blocks, so I wouldn't expect any improvement
> here, only in the sparse case. Maybe something is wrong. All the more
> reason to put it off...

Okay, let's dig it deeper later.

>
> > I'm happy to see a huge improvement. While it's really fascinating to
> > me, I'm concerned about the time left until the feature freeze. We
> > need to polish both tidstore and vacuum integration patches in 5
> > weeks. Personally I'd like to have it as a separate patch for now, and
> > focus on completing the main three patches since we might face some
> > issues after pushing these patches. I think with 0007 patch it's a big
> > win but it's still a win even without 0007 patch.
>
> Agreed to not consider it for initial commit. I'll hold on to it for
> some future time.
>
> > > 2. Management of memory contexts. It's pretty verbose and messy. I
> > > think the abstraction could be better:
> > > A: tidstore currently passes CurrentMemoryContext to RT_CREATE, so we
> > > can't destroy or reset it. That means we have to do a lot of manual
> > > work.
> > > B: Passing "max_bytes" to the radix tree was my idea, I believe, but
> > > it seems the wrong responsibility. Not all uses will have a
> > > work_mem-type limit, I'm guessing. We only use it for limiting the max
> > > block size, and aset's default 8MB is already plenty small for
> > > vacuum's large limit anyway. tidbitmap.c's limit is work_mem, so
> > > smaller, and there it makes sense to limit the max blocksize this way.
> > > C: The context for values has complex #ifdefs based on the value
> > > length/varlen, but it's both too much and not enough. If we get a bump
> > > context, how would we shoehorn that in for values for vacuum but not
> > > for tidbitmap?
> > >
> > > Here's an idea: Have vacuum (or tidbitmap etc.) pass a context to
> > > TidStoreCreate(), and then to RT_CREATE. That context will contain the
> > > values (for local mem), and the node slabs will be children of the
> > > value context. That way, measuring memory usage and free-ing can just
> > > call with this parent context, and let recursion handle the rest.
> > > Perhaps the passed context can also hold the radix-tree struct, but
> > > I'm not sure since I haven't tried it. What do you think?
> >
> > If I understand your idea correctly, RT_CREATE() creates the context
> > for values as a child of the passed context and the node slabs as
> > children of the value context. That way, measuring memory usage can
> > just call with the value context. It sounds like a good idea. But it
> > was not clear to me how to address point B and C.
>
> For B & C, vacuum would create a context to pass to TidStoreCreate,
> and it wouldn't need to bother changing max block size. RT_CREATE
> would use that directly for leaves (if any), and would only create
> child slab contexts under it. It would not need to know about
> max_bytes. Modifyng your diagram a bit, something like:
>
> - caller-supplied radix tree memory context (the 3 structs -- and
> leaves, if any) (aset (or future bump?))
> - node slab contexts
>
> This might only be workable with aset, if we need to individually free
> the structs. (I haven't studied this, it was a recent idea)
> It's simpler, because with small fixed length values, we don't need to
> detect that and avoid creating a leaf context. All leaves would live
> in the same context as the structs.
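
For concreteness, the proposed layout might look like this sketch (the
memory-context calls are real APIs; the names and sizes are invented):

	MemoryContext rt_cxt = AllocSetContextCreate(CurrentMemoryContext,
												 "radix tree",
												 ALLOCSET_DEFAULT_SIZES);
	Size		node_size = 64; /* illustrative; one slab per node class */

	/* the three structs -- and leaves, if any -- live directly in rt_cxt */
	MemoryContext node_slab = SlabContextCreate(rt_cxt,
												"radix tree nodes",
												SLAB_DEFAULT_BLOCK_SIZE,
												node_size);

	/* measuring and freeing then recurse from the single parent */
	Size		total = MemoryContextMemAllocated(rt_cxt, true);

	MemoryContextDelete(rt_cxt);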

Thank you for the explanation.

I think that vacuum and tidbitmap (and future users) would end up
having the same max block size calculation. And it seems slightly odd
layering to me that max-block-size-specified context is created on
vacuum (or tidbitmap) layer, a varlen-value radix tree is created by
tidstore layer, and the passed context is used for leaves (if
varlen-value is used) on radix tree layer. Another idea is to create a
max-block-size-specified 

Re: Reducing memory consumed by RestrictInfo list translations in partitionwise join planning

2024-02-18 Thread Andrei Lepikhov

On 19/2/2024 06:05, Tomas Vondra wrote:
> However, this is a somewhat extreme example - it's joining 5 tables,
> each with 1000 partitions, using a partition-wise join. It reduces the
> amount of memory, but the planning time is still quite high (and
> essentially the same as without the patch). So it's not like it'd make
> them significantly more practical ... do we have any other ideas/plans
> how to improve that?

The planner principle of cleaning up all allocated structures after the
optimization stage simplifies development and code. But, if we want to
achieve horizontal scalability on many partitions, we should introduce a
per-partition memory context and reset it in between. GEQO already has a
short-lived memory context, making designing extensions a bit more
challenging but nothing too painful.
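
A rough sketch of that idea (plan_child_join and npartitions are invented
placeholders for the real per-partition planning work):

	MemoryContext part_cxt = AllocSetContextCreate(CurrentMemoryContext,
												   "per-partition planning",
												   ALLOCSET_DEFAULT_SIZES);
	int			npartitions = 1000;

	for (int i = 0; i < npartitions; i++)
	{
		MemoryContext oldcxt = MemoryContextSwitchTo(part_cxt);

		plan_child_join(root, i);	/* hypothetical per-partition work */

		MemoryContextSwitchTo(oldcxt);
		MemoryContextReset(part_cxt);	/* discard everything at once */
	}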


--
regards,
Andrei Lepikhov
Postgres Professional





RE: pg_upgrade and logical replication

2024-02-18 Thread Hayato Kuroda (Fujitsu)
Dear Vignesh,

Thanks for reviewing! PSA new version.

> 
> Thanks for the updated patch, Few suggestions:
> 1)  This can be moved to keep it similar to other tests:
> +# Setup a disabled subscription. The upcoming test will check the
> +# pg_createsubscriber won't work, so it is sufficient.
> +$publisher->safe_psql('postgres', "CREATE PUBLICATION regress_pub1");
> +$old_sub->safe_psql('postgres',
> +   "CREATE SUBSCRIPTION regress_sub1 CONNECTION '$connstr'
> PUBLICATION regress_pub1 WITH (enabled = false)"
> +);
> +
> +$old_sub->stop;
> +
> +# --
> +# Check that pg_upgrade fails when max_replication_slots configured in the 
> new
> +# cluster is less than the number of subscriptions in the old cluster.
> +# --
> +$new_sub->append_conf('postgresql.conf', "max_replication_slots = 0");
> +
> +# pg_upgrade will fail because the new cluster has insufficient
> +# max_replication_slots.
> +command_checks_all(
> +   [
> +   'pg_upgrade', '--no-sync', '-d', $old_sub->data_dir,
> +   '-D', $new_sub->data_dir, '-b', $oldbindir,
> +   '-B', $newbindir, '-s', $new_sub->host,
> +   '-p', $old_sub->port, '-P', $new_sub->port,
> +   $mode, '--check',
> +   ],
> 
> like below and the extra comment can be removed:
> +# --
> +# Check that pg_upgrade fails when max_replication_slots configured in the 
> new
> +# cluster is less than the number of subscriptions in the old cluster.
> +# --
> +# Create a disabled subscription.
> +$publisher->safe_psql('postgres', "CREATE PUBLICATION regress_pub1");
> +$old_sub->safe_psql('postgres',
> +   "CREATE SUBSCRIPTION regress_sub1 CONNECTION '$connstr'
> PUBLICATION regress_pub1 WITH (enabled = false)"
> +);
> +
> +$old_sub->stop;
> +
> +$new_sub->append_conf('postgresql.conf', "max_replication_slots = 0");
> +
> +# pg_upgrade will fail because the new cluster has insufficient
> +# max_replication_slots.
> +command_checks_all(
> +   [
> +   'pg_upgrade', '--no-sync', '-d', $old_sub->data_dir,
> +   '-D', $new_sub->data_dir, '-b', $oldbindir,
> +   '-B', $newbindir, '-s', $new_sub->host,
> +   '-p', $old_sub->port, '-P', $new_sub->port,
> +   $mode, '--check',
> +   ],

Partially fixed. I moved the creation part to below but comments were kept.

> 2) This comment can be slightly changed:
> +# Change configuration as well not to start the initial sync automatically
> +$new_sub->append_conf('postgresql.conf',
> +   "max_logical_replication_workers = 0");
> 
> to:
> Change configuration so that initial table sync does not get
> started automatically

Fixed.

> 3) The old comments were slightly better:
> # Resume the initial sync and wait until all tables of subscription
> # 'regress_sub5' are synchronized
> $new_sub->append_conf('postgresql.conf',
> "max_logical_replication_workers = 10");
> $new_sub->restart;
> $new_sub->safe_psql('postgres', "ALTER SUBSCRIPTION regress_sub5
> ENABLE");
> $new_sub->wait_for_subscription_sync($publisher, 'regress_sub5');
> 
> Like:
> # Enable the subscription
> $new_sub->safe_psql('postgres', "ALTER SUBSCRIPTION regress_sub5
> ENABLE");
> 
> # Wait until all tables of subscription 'regress_sub5' are synchronized
> $new_sub->wait_for_subscription_sync($publisher, 'regress_sub5');

Per comments from Amit [1], I did not change.

[1]: 
https://www.postgresql.org/message-id/CAA4eK1Ls%2BRmJtTvOgaRXd%2BeHSY3x-KUE%3DsfEGQoU-JF_UzA62A%40mail.gmail.com

Best Regards,
Hayato Kuroda
FUJITSU LIMITED
https://www.fujitsu.com/ 



v5-0001-Fix-testcase.patch
Description: v5-0001-Fix-testcase.patch


Re: Why is pq_begintypsend so slow?

2024-02-18 Thread Sutou Kouhei
Hi,

In <20240218200906.zvihkrs46yl2j...@awork3.anarazel.de>
  "Re: Why is pq_begintypsend so slow?" on Sun, 18 Feb 2024 12:09:06 -0800,
  Andres Freund  wrote:

>> [1] 
>> https://www.postgresql.org/message-id/flat/20240215.153421.96888103784986803.kou%40clear-code.com#34df359e6d255795d16814ce138cc995

>> Do we need one FunctionCallInfoBaseData for each attribute?
>> How about sharing one FunctionCallInfoBaseData by all
>> attributes like [1]?
> 
> That makes no sense to me. You're throwing away most of the possible gains by
> having to update the FunctionCallInfo fields on every call. You're saving
> negligible amounts of memory at a substantial runtime cost.

The number of updated fields is the same in your approach and
in [1]:

Your approach: 6 (I think that "fcinfo->isnull = false" is
needed though.)

+   fcinfo->args[0].value = CStringGetDatum(string);
+   fcinfo->args[0].isnull = false;
+   fcinfo->args[1].value = ObjectIdGetDatum(attr->typioparam);
+   fcinfo->args[1].isnull = false;
+   fcinfo->args[2].value = Int32GetDatum(attr->typmod);
+   fcinfo->args[2].isnull = false;

[1]: 6 (including "fcinfo->isnull = false")

+   fcinfo->flinfo = flinfo;
+   fcinfo->context = escontext;
+   fcinfo->isnull = false;
+   fcinfo->args[0].value = CStringGetDatum(str);
+   fcinfo->args[1].value = ObjectIdGetDatum(typioparam);
+   fcinfo->args[2].value = Int32GetDatum(typmod);


>> Inlining InputFunctionCallSafe() here to use pre-initialized
>> fcinfo will decrease maintainability. Because we abstract
>> InputFunctionCall family in fmgr.c. If we define a
>> InputFunctionCall variant here, we need to change both of
>> fmgr.c and here when InputFunctionCall family is changed.
>> How about defining details in fmgr.c and call it here
>> instead like [1]?
> 
> I'm not sure I buy that that is a problem. It's not like my approach was
> actually bypassing fmgr abstractions alltogether - instead it just used the
> lower level APIs, because it's a performance sensitive area.

[1] provides some optimized abstractions, which are
implemented with lower level APIs, without breaking the
abstractions.

Note that I don't have a strong opinion how to implement
this optimization. If other developers think this approach
makes sense for this optimization, I don't object it.

>> @@ -1966,7 +1992,7 @@ CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo 
>> *flinfo,
>>  if (fld_size == -1)
>>  {
>>  *isnull = true;
>> -return ReceiveFunctionCall(flinfo, NULL, typioparam, typmod);
>> +return ReceiveFunctionCall(fcinfo->flinfo, NULL, 
>> attr->typioparam, attr->typmod);
>> 
>> Why pre-initialized fcinfo isn't used here?
> 
> Because it's a prototype and because I don't think it's a common path.

How about adding a comment why we don't need to optimize
this case?


As I said above, I don't have a strong opinion on how to
implement this optimization. It seems that you prefer your
approach, so I withdraw [1]. Could you complete this
optimization? Can we proceed with making COPY format
extensible without it? It seems that it'll take a little
time to complete this optimization because your patch is
still WIP, and it seems that you can work on it after COPY
format is made extensible.


Thanks,
-- 
kou




Re: What about Perl autodie?

2024-02-18 Thread Andrew Dunstan



On 2024-02-14 We 11:52, Peter Eisentraut wrote:
> On 08.02.24 16:53, Tom Lane wrote:
>> Daniel Gustafsson writes:
>>> On 8 Feb 2024, at 08:01, Peter Eisentraut wrote:
>>>> I suppose we could start using it for completely new scripts.
>>>
>>> +1, it would be nice to eventually be able to move to it everywhere
>>> so starting now with new scripts may make the eventual transition
>>> smoother.
>>
>> I'm still concerned about people carelessly using autodie-reliant
>> code in places where they shouldn't.  I offer two safer ways
>> forward:
>>
>> 1. Wait till v16 is the oldest supported branch, and then migrate
>> both HEAD and back branches to using autodie.
>>
>> 2. Don't wait, migrate them all now.  This would mean requiring
>> Perl 5.10.1 or later to run the TAP tests, even in back branches.
>>
>> I think #2 might not be all that radical.  We have nothing older
>> than 5.14.0 in the buildfarm, so we don't really have much grounds
>> for claiming that 5.8.3 will work today.  And 5.10.1 came out in
>> 2009, so how likely is it that anyone cares anymore?
>
> A gentler way might be to start using some perlcritic policies like
> InputOutput::RequireCheckedOpen or the more general
> InputOutput::RequireCheckedSyscalls and add explicit error checking at
> the sites it points out.  And then if we start using autodie in the
> future, any inappropriate backpatching of calls lacking error checks
> would be caught.





Yeah, that should work.
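
For illustration, the kind of call sites those policies distinguish (a
sketch, not code from the tree):

use strict;
use warnings;

open(my $unchecked, '<', 'data.txt');	# flagged by RequireCheckedOpen
open(my $checked, '<', 'data.txt')
  or die "could not open data.txt: $!";	# passes

# With autodie, the explicit check becomes unnecessary:
#   use autodie;
#   open(my $fh, '<', 'data.txt');	# dies on failure by itself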


cheers


andrew

--
Andrew Dunstan
EDB: https://www.enterprisedb.com





Re: Running the fdw test from the terminal crashes into the core-dump

2024-02-18 Thread Amit Langote
On Mon, Feb 19, 2024 at 4:44 AM Alvaro Herrera  wrote:
> On 2024-Feb-18, Tom Lane wrote:
>
> > So I'd blame this on faulty handling of the zero-partitions case in
> > the RTEPermissionInfo refactoring.  (I didn't bisect to prove that
> > a61b1f748 is exactly where it broke, but I do see that the query
> > successfully does nothing in v15.)
>
> You're right, this is the commit that broke it.  It's unclear to me if
> Amit is available to look at it, so I'll give this a look tomorrow if he
> isn't.

I'll look at this today.

-- 
Thanks, Amit Langote




Re: Reducing memory consumed by RestrictInfo list translations in partitionwise join planning

2024-02-18 Thread Tomas Vondra
Hi,

After taking a look at the patch optimizing SpecialJoinInfo allocations,
I decided to take a quick look at this one too. I don't have any
specific comments on the code yet, but it seems quite a bit more complex
than the other patch ... it's introducing a HTAB into the optimizer,
surely that has costs too.

I started by doing the same test as with the other patch, comparing
master to the two patches (applied independently and both). And I got
about this (in MB):

  tables    master    sjinfo    rinfo    both
  --------------------------------------------
       2        37        36       34      33
       3       138       129      122     113
       4       421       376      363     318
       5      1495      1254     1172     931

Unlike the SpecialJoinInfo patch, I haven't seen any reduction in
planning time for this one.

The reduction in allocated memory is nice. I wonder what's allocating
the remaining memory, and we'd have to do to reduce it further.

However, this is a somewhat extreme example - it's joining 5 tables,
each with 1000 partitions, using a partition-wise join. It reduces the
amount of memory, but the planning time is still quite high (and
essentially the same as without the patch). So it's not like it'd make
them significantly more practical ... do we have any other ideas/plans
how to improve that?
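
For reference, the shape of the benchmark being described (a
reconstruction, not the exact script; shown with 3 joined tables instead
of 5, using PG 17's EXPLAIN (MEMORY) to report planner memory):

SET enable_partitionwise_join = on;

CREATE TABLE t (a int, b int) PARTITION BY HASH (a);

DO $$
BEGIN
  FOR i IN 0..999 LOOP
    EXECUTE format('CREATE TABLE t_%s PARTITION OF t
                    FOR VALUES WITH (MODULUS 1000, REMAINDER %s)', i, i);
  END LOOP;
END $$;

EXPLAIN (MEMORY, COSTS OFF)
SELECT * FROM t t1 JOIN t t2 USING (a) JOIN t t3 USING (a);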

AFAIK we don't expect this to improve "regular" cases with a modest
number of tables / partitions, etc. But could it make them slower in
some cases?


regards

-- 
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company




Re: Add system identifier to backup manifest

2024-02-18 Thread Michael Paquier
On Thu, Feb 15, 2024 at 05:41:46PM +0530, Robert Haas wrote:
> On Thu, Feb 15, 2024 at 3:05 PM Amul Sul  wrote:
> > Kindly have a look at the attached version.
> 
> IMHO, 0001 looks fine, except probably the comment could be phrased a
> bit more nicely.

And the new option should be documented at the top of the init()
routine for perldoc.

> That can be left for whoever commits this to
> wordsmith. Michael, what are your plans?

Not much, so feel free to not wait for me.  I've just read through the
patch because I like the idea/feature :)

> 0002 seems like a reasonable approach, but there's a hunk in the wrong
> patch: 0004 modifies pg_combinebackup's check_control_files to use
> get_dir_controlfile() rather than get_controlfile(), but it looks to
> me like that should be part of 0002.

I'm slightly concerned about 0002, which silently changes the meaning of
get_controlfile().  That would impact extension code without people
knowing about it at compile time; they would only notice when they run
their stuff under 17~.
--
Michael




Re: Why is pq_begintypsend so slow?

2024-02-18 Thread Michael Paquier
On Sun, Feb 18, 2024 at 12:09:06PM -0800, Andres Freund wrote:
> On 2024-02-18 17:38:09 +0900, Sutou Kouhei wrote:
>> @@ -1966,7 +1992,7 @@ CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo 
>> *flinfo,
>>  if (fld_size == -1)
>>  {
>>  *isnull = true;
>> -return ReceiveFunctionCall(flinfo, NULL, typioparam, typmod);
>> +return ReceiveFunctionCall(fcinfo->flinfo, NULL, 
>> attr->typioparam, attr->typmod);
>> 
>> Why pre-initialized fcinfo isn't used here?
> 
> Because it's a prototype and because I don't think it's a common path.

0008 and 0010 (a bit) are the only patches of the set that touch some
of the areas that would be impacted by the refactoring to use
callbacks in the COPY code; still, I don't see anything that could not
be changed in what's updated here, the one-row callback in COPY FROM
being the most touched.  So I don't quite see why each effort could
not happen on its own?

Or Andres, do you think that any improvements you've been proposing in
this area should happen before we consider refactoring the COPY code
to plug in the callbacks?  I'm a bit confused by the situation, TBH.
--
Michael




Re: Patch: Add parse_type Function

2024-02-18 Thread Erik Wienhold
On 2024-02-18 20:00 +0100, Pavel Stehule wrote:
> The overhead of parse_type_and_format can be related to higher planning
> time. PL/pgSQL can assign a composite without using a FROM clause.

Thanks, didn't know that this makes a difference.  In that case both
variants are on par.

BEGIN;

CREATE FUNCTION format_with_parse_type(text)
RETURNS text
LANGUAGE plpgsql
STABLE STRICT
AS $$
DECLARE
p record := parse_type($1);
BEGIN
RETURN format_type(p.typid, p.typmod);
END
$$;

CREATE FUNCTION format_with_to_regtypmod(text)
RETURNS text
LANGUAGE plpgsql
STABLE STRICT
AS $$
BEGIN
RETURN format_type(to_regtype($1), to_regtypmod($1));
END
$$;

COMMIT;

Results:

SELECT format_with_parse_type('interval second(0)');

pgbench (17devel)
transaction type: format_with_parse_type.sql
scaling factor: 1
query mode: simple
number of clients: 1
number of threads: 1
maximum number of tries: 1
duration: 10 s
number of transactions actually processed: 253530
number of failed transactions: 0 (0.000%)
latency average = 0.039 ms
initial connection time = 1.846 ms
tps = 25357.551681 (without initial connection time)

SELECT format_with_to_regtypmod('interval second(0)');

pgbench (17devel)
transaction type: format_with_to_regtypmod.sql
scaling factor: 1
query mode: simple
number of clients: 1
number of threads: 1
maximum number of tries: 1
duration: 10 s
number of transactions actually processed: 257942
number of failed transactions: 0 (0.000%)
latency average = 0.039 ms
initial connection time = 1.544 ms
tps = 25798.015526 (without initial connection time)

-- 
Erik




Re: PGC_SIGHUP shared_buffers?

2024-02-18 Thread Andres Freund
Hi,

On 2024-02-18 17:06:09 +0530, Robert Haas wrote:
> On Sat, Feb 17, 2024 at 12:38 AM Andres Freund  wrote:
> > IMO the ability to *shrink* shared_buffers dynamically and cheaply is more
> > important than growing it in a way, except that they are related of
> > course. Idling hardware is expensive, thus overcommitting hardware is very
> > attractive (I count "serverless" as part of that). To be able to overcommit
> > effectively, unused long-lived memory has to be released. I.e. shared 
> > buffers
> > needs to be shrinkable.
> 
> I see your point, but people want to scale up, too. Of course, those
> people will have to live with what we can practically implement.

Sure, I didn't intend to say that scaling up isn't useful.


> > Perhaps worth noting that there are two things limiting the size of shared
> > buffers: 1) the available buffer space 2) the available buffer *mapping*
> > space. I think making the buffer mapping resizable is considerably harder 
> > than
> > the buffers themselves. Of course pre-reserving memory for a buffer mapping
> > suitable for a huge shared_buffers is more feasible than pre-allocating all
> > that memory for the buffers themselves. But it' still mean youd have a 
> > maximum
> > set at server start.
> 
> We size the fsync queue based on shared_buffers too. That's a lot less
> important, though, and could be worked around in other ways.

We probably should address that independently of making shared_buffers
PGC_SIGHUP. The queue gets absurdly large once s_b hits a few GB. It's not
that much memory compared to the buffer blocks themselves, but a sync queue of
many millions of entries just doesn't make sense. And a few hundred MB for
that isn't nothing either, even if it's just a fraction of the space for the
buffers. It makes checkpointer more susceptible to OOM as well, because
AbsorbSyncRequests() allocates an array to copy all requests into local
memory.


> > Such a scheme still leaves you with a dependent memory read for a quite
> > frequent operation. It could turn out to not matter hugely if the mapping
> > array is cache resident, but I don't know if we can realistically bank on
> > that.
> 
> I don't know, either. I was hoping you did. :-)
> 
> But we can rig up a test pretty easily, I think. We can just create a
> fake mapping that gives the same answers as the current calculation
> and then beat on it. Of course, if testing shows no difference, there
> is the small problem of knowing whether the test scenario was right;
> and it's also possible that an initial impact could be mitigated by
> removing some gratuitously repeated buffer # -> buffer address
> mappings. Still, I think it could provide us with a useful baseline.
> I'll throw something together when I have time, unless someone beats
> me to it.

I think such a test would be useful, although I also don't know how confident
we would be if we saw positive results. Probably depends a bit on the
generated code and how plausible it is to not see regressions.
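
A sketch of that experiment (all names invented): make buffer lookup go
through an indirection table that yields exactly today's address, then
benchmark the two paths under a buffer-access-heavy workload.

#include <stddef.h>

#define BLCKSZ 8192

static char *buffer_base;		/* start of the buffer pool */
static char **buffer_map;		/* buffer_map[n] == buffer_base + n * BLCKSZ */

static inline char *
block_direct(int n)
{
	return buffer_base + (size_t) n * BLCKSZ;	/* today: pure arithmetic */
}

static inline char *
block_mapped(int n)
{
	return buffer_map[n];		/* adds one dependent memory read */
}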


> > I'm also somewhat concerned about the coarse granularity being problematic. 
> > It
> > seems like it'd lead to a desire to make the granule small, causing 
> > slowness.
> 
> How many people set shared_buffers to something that's not a whole
> number of GB these days?

I'd say the vast majority of postgres instances in production run with less
than 1GB of s_b. Just because numbers wise the majority of instances are
running on small VMs and/or many PG instances are running on one larger
machine.  There are a lot of instances where the total available memory is
less than 2GB.


> I mean I bet it happens, but in practice if you rounded to the nearest GB,
> or even the nearest 2GB, I bet almost nobody would really care. I think it's
> fine to be opinionated here and hold the line at a relatively large granule,
> even though in theory people could want something else.

I don't believe that at all unfortunately.


> > One big advantage of a scheme like this is that it'd be a step towards a 
> > NUMA
> > aware buffer mapping and replacement. Practically everything beyond the size
> > of a small consumer device these days has NUMA characteristics, even if not
> > "officially visible". We could make clock sweeps (or a better victim buffer
> > selection algorithm) happen within each "chunk", with some additional
> > infrastructure to choose which of the chunks to search a buffer in. Using a
> > chunk on the current numa node, except when there is a lot of imbalance
> > between buffer usage or replacement rate between chunks.
> 
> I also wondered whether this might be a useful step toward allowing
> different-sized buffers in the same buffer pool (ducks, runs away
> quickly). I don't have any particular use for that myself, but it's a
> thing some people probably want for some reason or other.

I still think that that's something that will just cause a significant cost in
complexity, and secondarily also runtime overhead, at a comparatively marginal
gain.



Re: Why is pq_begintypsend so slow?

2024-02-18 Thread Andres Freund
Hi,

On 2024-02-18 17:38:09 +0900, Sutou Kouhei wrote:
> In <20240218015955.rmw5mcmobt5hb...@awork3.anarazel.de>
>   "Re: Why is pq_begintypsend so slow?" on Sat, 17 Feb 2024 17:59:55 -0800,
>   Andres Freund  wrote:
> 
> > v3-0008-wip-make-in-out-send-recv-calls-in-copy.c-cheaper.patch
> 
> It seems that this is an alternative approach of [1].

Note that what I posted was a very lightly polished rebase of a ~4 year old
patchset.

> [1] 
> https://www.postgresql.org/message-id/flat/20240215.153421.96888103784986803.kou%40clear-code.com#34df359e6d255795d16814ce138cc995
> 
> 
> +typedef struct CopyInAttributeInfo
> +{
> + AttrNumber  num;
> + const char *name;
> +
> + Oid typioparam;
> + int32   typmod;
> +
> + FmgrInfoin_finfo;
> + union
> + {
> + FunctionCallInfoBaseData fcinfo;
> + charfcinfo_data[SizeForFunctionCallInfo(3)];
> + }   in_fcinfo;
> 
> Do we need one FunctionCallInfoBaseData for each attribute?
> How about sharing one FunctionCallInfoBaseData by all
> attributes like [1]?

That makes no sense to me. You're throwing away most of the possible gains by
having to update the FunctionCallInfo fields on every call. You're saving
negligible amounts of memory at a substantial runtime cost.
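
Concretely, the hot path under the quoted scheme looks roughly like this
sketch (simplified; it assumes flinfo, args[1] (typioparam) and args[2]
(typmod) were filled in once when the CopyInAttributeInfo was set up):

static Datum
input_via_prepared_fcinfo(CopyInAttributeInfo *attr, char *string)
{
	FunctionCallInfo fcinfo = &attr->in_fcinfo.fcinfo;

	/* only the per-row fields change here */
	fcinfo->args[0].value = CStringGetDatum(string);
	fcinfo->args[0].isnull = false;
	fcinfo->isnull = false;

	return FunctionCallInvoke(fcinfo);
}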


> @@ -956,20 +956,47 @@ NextCopyFrom(CopyFromState cstate, ExprContext *econtext,
>  
>  			values[m] = ExecEvalExpr(defexprs[m], econtext, &nulls[m]);
>  		}
> -
> -		/*
> -		 * If ON_ERROR is specified with IGNORE, skip rows with soft
> -		 * errors
> -		 */
> -		else if (!InputFunctionCallSafe(&in_functions[m],
> -										string,
> -										typioparams[m],
> -										att->atttypmod,
> -										(Node *) cstate->escontext,
> -										&values[m]))
> 
> Inlining InputFunctionCallSafe() here to use pre-initialized
> fcinfo will decrease maintainability. Because we abstract
> InputFunctionCall family in fmgr.c. If we define a
> InputFunctionCall variant here, we need to change both of
> fmgr.c and here when InputFunctionCall family is changed.
> How about defining details in fmgr.c and call it here
> instead like [1]?

I'm not sure I buy that that is a problem. It's not like my approach was
actually bypassing fmgr abstractions alltogether - instead it just used the
lower level APIs, because it's a performance sensitive area.


> @@ -1966,7 +1992,7 @@ CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo 
> *flinfo,
>   if (fld_size == -1)
>   {
>   *isnull = true;
> - return ReceiveFunctionCall(flinfo, NULL, typioparam, typmod);
> + return ReceiveFunctionCall(fcinfo->flinfo, NULL, 
> attr->typioparam, attr->typmod);
> 
> Why pre-initialized fcinfo isn't used here?

Because it's a prototype and because I don't think it's a common path.

Greetings,

Andres Freund




Re: Running the fdw test from the terminal crashes into the core-dump

2024-02-18 Thread Alvaro Herrera
On 2024-Feb-18, Tom Lane wrote:

> So I'd blame this on faulty handling of the zero-partitions case in
> the RTEPermissionInfo refactoring.  (I didn't bisect to prove that
> a61b1f748 is exactly where it broke, but I do see that the query
> successfully does nothing in v15.)

You're right, this is the commit that broke it.  It's unclear to me if
Amit is available to look at it, so I'll give this a look tomorrow if he
isn't.

-- 
Álvaro Herrera   48°01'N 7°57'E  —  https://www.EnterpriseDB.com/
"Porque francamente, si para saber manejarse a uno mismo hubiera que
rendir examen... ¿Quién es el machito que tendría carnet?"  (Mafalda)




Re: Transaction timeout

2024-02-18 Thread Andrey M. Borodin
Alexander, thanks for pushing this! This is small but very awaited feature.

> On 16 Feb 2024, at 02:08, Andres Freund  wrote:
> 
> Isn't this test going to be very fragile on busy / slow machines? What if the
> pg_sleep() takes one second, because there were other tasks to schedule?  I'd
> be surprised if this didn't fail under valgrind, for example.

Even tests that previously seemed bullet-proof in CI have exhibited some
failures on the buildfarm. Currently there are 5 failures through this weekend.
The failing tests exercise the interaction of idle_in_transaction_session_timeout
vs transaction_timeout (session 5), and rescheduling of transaction_timeout
(session 6). Symptoms:

[0] The transaction timeout fires while it is still being scheduled. It seems
like the SET was running too long.
 step s6_begin: BEGIN ISOLATION LEVEL READ COMMITTED;
 step s6_tt: SET statement_timeout = '1s'; SET transaction_timeout = '10ms';
+s6: FATAL:  terminating connection due to transaction timeout
 step checker_sleep: SELECT pg_sleep(0.1);

[1] transaction timeout 10ms is not detected after 1s
step s6_check: SELECT count(*) FROM pg_stat_activity WHERE application_name = 
'isolation/timeouts/s6';
 count
 -
-0
+1

[2] transaction timeout is not detected in both session 5 and session 6.

So far no single animal has reported failures twice, so it's hard to say
anything about frequency. But this seems to be a significant source of failures.

So far I have these ideas:

1. Remove test sessions 5 and 6. But it seems a little strange that session 3
did not fail at all (it tests the interaction of statement_timeout and
transaction_timeout); this test is very similar to test session 5...
2. Increase wait times.
step checker_sleep  { SELECT pg_sleep(0.1); }
This seems not long enough to observe the timed-out backend in
pg_stat_activity. But it won't help with [0].
3. Reuse the waiting INJECTION_POINT from [3] to make the timeout tests
deterministic and safe from race conditions. With waiting injection points we
can wait as long as needed in the current environment.

Any advices are welcome.


Best regards, Andrey Borodin.


[0] 
https://buildfarm.postgresql.org/cgi-bin/show_log.pl?nm=tamandua=2024-02-16%2020%3A06%3A51
[1] 
https://buildfarm.postgresql.org/cgi-bin/show_log.pl?nm=kestrel=2024-02-16%2001%3A45%3A10
[2] 
https://buildfarm.postgresql.org/cgi-bin/show_log.pl?nm=fairywren=2024-02-17%2001%3A55%3A45
[3] 
https://www.postgresql.org/message-id/0925f9a9-4d53-4b27-a87e-3d83a757b...@yandex-team.ru



Re: Running the fdw test from the terminal crashes into the core-dump

2024-02-18 Thread Tom Lane
Alena Rybakina  writes:
> After starting the server (initdb + pg_ctl start) I ran a regress test,
> create_misc.sql ('\i src/test/regress/sql/create_misc.sql'), and after
> that I ran the fdw test ('\i contrib/postgres_fdw/sql/postgres_fdw.sql')
> in psql, and it crashed with a core dump due to a failed assertion.
> To be honest, the failure occurred only if the fdw extension had not
> been installed earlier.

Thanks for the report!  This can be reproduced more simply with

z=# create table test (a int, b text) partition by list(a);
CREATE TABLE
z=# merge into test using (select 1, 'foo') as source on (true) when matched 
then do nothing;
server closed the connection unexpectedly

The MERGE produces a query tree with

   :rtable (
  {RANGETBLENTRY 
  :alias <> 
  :eref 
 {ALIAS 
 :aliasname test 
 :colnames ("a" "b")
 }
  :rtekind 0 
  :relid 49152 
  :relkind p 
  :rellockmode 3 
  :tablesample <> 
  :perminfoindex 1 
  :lateral false 
  :inh true 
  :inFromCl false 
  :securityQuals <>
  }
  ...
   )
   :rteperminfos (
  {RTEPERMISSIONINFO 
  :relid 49152 
  :inh true 
  :requiredPerms 0 
  :checkAsUser 0 
  :selectedCols (b)
  :insertedCols (b)
  :updatedCols (b)
  }
   )

and that zero for requiredPerms is what leads to the assertion
failure later.  So I'd blame this on faulty handling of the
zero-partitions case in the RTEPermissionInfo refactoring.
(I didn't bisect to prove that a61b1f748 is exactly where it
broke, but I do see that the query successfully does nothing
in v15.)

regards, tom lane




Re: Patch: Add parse_type Function

2024-02-18 Thread Pavel Stehule
Hi

ne 18. 2. 2024 v 19:50 odesílatel Erik Wienhold  napsal:

> On 2024-02-12 19:20 +0100, Tom Lane wrote:
> > I wrote:
> > > It strikes me that this is basically to_regtype() with the additional
> > > option to return the typmod.  That leads to some questions:
> >
> > BTW, another way that this problem could be approached is to use
> > to_regtype() as-is, with a separate function to obtain the typmod:
> >
> > select format_type(to_regtype('timestamp(4)'),
> to_regtypmod('timestamp(4)'));
> >
> > This is intellectually ugly, since it implies parsing the same
> > typename string twice.  But on the other hand it avoids the notational
> > pain and runtime overhead involved in using a record-returning
> > function.  So I think it might be roughly a wash for performance.
> > Question to think about is which way is easier to use.  I don't
> > have an opinion particularly; just throwing the idea out there.
>
> Out of curiosity, I benchmarked this with the attached to_regtypmod()
> patch based on David's v5 applied to a6c21887a9.  The script running
> pgbench and its output are included at the end.
>
> Just calling parse_type() vs to_regtype()/to_regtypmod() is a wash for
> performance as you thought.  But format_type() performs better with
> to_regtypmod() than with parse_type().  Accessing the record fields
> returned by parse_type() adds some overhead.
>
> to_regtypmod() is better for our use case in pgTAP which relies on
> format_type() to normalize the type name.  The implementation of
> to_regtypmod() is also simpler than parse_type().  Usage-wise, both are
> clunky IMO.
>
> Benchmark script:
>
> #!/usr/bin/env bash
>
> set -eu
>
> cat <<'SQL' > parse_type.sql
> SELECT parse_type('interval second(0)');
> SQL
>
> cat <<'SQL' > parse_type_and_format.sql
> SELECT format_type(p.typid, p.typmod) FROM parse_type('interval
> second(0)') p;
> SQL
>
> cat <<'SQL' > to_regtypmod.sql
> SELECT to_regtype('interval second(0)'), to_regtypmod('interval
> second(0)');
> SQL
>
> cat <<'SQL' > to_regtypmod_and_format.sql
> SELECT format_type(to_regtype('interval second(0)'),
> to_regtypmod('interval second(0)'));
> SQL
>
> for f in \
> parse_type.sql \
> parse_type_and_format.sql \
> to_regtypmod.sql \
> to_regtypmod_and_format.sql
> do
> pgbench -n -f "$f" -T10 postgres
> echo
> done
>
> pgbench output:
>
> pgbench (17devel)
> transaction type: parse_type.sql
> scaling factor: 1
> query mode: simple
> number of clients: 1
> number of threads: 1
> maximum number of tries: 1
> duration: 10 s
> number of transactions actually processed: 277017
> number of failed transactions: 0 (0.000%)
> latency average = 0.036 ms
> initial connection time = 1.623 ms
> tps = 27706.005513 (without initial connection time)
>
> pgbench (17devel)
> transaction type: parse_type_and_format.sql
> scaling factor: 1
> query mode: simple
> number of clients: 1
> number of threads: 1
> maximum number of tries: 1
> duration: 10 s
> number of transactions actually processed: 222487
> number of failed transactions: 0 (0.000%)
> latency average = 0.045 ms
> initial connection time = 1.603 ms
> tps = 22252.095670 (without initial connection time)
>
> pgbench (17devel)
> transaction type: to_regtypmod.sql
> scaling factor: 1
> query mode: simple
> number of clients: 1
> number of threads: 1
> maximum number of tries: 1
> duration: 10 s
> number of transactions actually processed: 276134
> number of failed transactions: 0 (0.000%)
> latency average = 0.036 ms
> initial connection time = 1.570 ms
> tps = 27617.628259 (without initial connection time)
>
> pgbench (17devel)
> transaction type: to_regtypmod_and_format.sql
> scaling factor: 1
> query mode: simple
> number of clients: 1
> number of threads: 1
> maximum number of tries: 1
> duration: 10 s
> number of transactions actually processed: 270820
> number of failed transactions: 0 (0.000%)
> latency average = 0.037 ms
> initial connection time = 1.631 ms
> tps = 27086.331104 (without initial connection time)
>

The overhead of parse_type_and_format can be related to higher planning
time. PL/pgSQL can assign composite without usage FROM clause.

Regards

Pavel


> --
> Erik
>


Re: Patch: Add parse_type Function

2024-02-18 Thread Erik Wienhold
On 2024-02-12 19:20 +0100, Tom Lane wrote:
> I wrote:
> > It strikes me that this is basically to_regtype() with the additional
> > option to return the typmod.  That leads to some questions:
> 
> BTW, another way that this problem could be approached is to use
> to_regtype() as-is, with a separate function to obtain the typmod:
> 
> select format_type(to_regtype('timestamp(4)'), to_regtypmod('timestamp(4)'));
> 
> This is intellectually ugly, since it implies parsing the same
> typename string twice.  But on the other hand it avoids the notational
> pain and runtime overhead involved in using a record-returning
> function.  So I think it might be roughly a wash for performance.
> Question to think about is which way is easier to use.  I don't
> have an opinion particularly; just throwing the idea out there.

Out of curiosity, I benchmarked this with the attached to_regtypmod()
patch based on David's v5 applied to a6c21887a9.  The script running
pgbench and its output are included at the end.

Just calling parse_type() vs to_regtype()/to_regtypmod() is a wash for
performance as you thought.  But format_type() performs better with
to_regtypmod() than with parse_type().  Accessing the record fields
returned by parse_type() adds some overhead.

to_regtypmod() is better for our use case in pgTAP which relies on
format_type() to normalize the type name.  The implementation of
to_regtypmod() is also simpler than parse_type().  Usage-wise, both are
clunky IMO.

Benchmark script:

#!/usr/bin/env bash

set -eu

cat <<'SQL' > parse_type.sql
SELECT parse_type('interval second(0)');
SQL

cat <<'SQL' > parse_type_and_format.sql
SELECT format_type(p.typid, p.typmod) FROM parse_type('interval 
second(0)') p;
SQL

cat <<'SQL' > to_regtypmod.sql
SELECT to_regtype('interval second(0)'), to_regtypmod('interval 
second(0)');
SQL

cat <<'SQL' > to_regtypmod_and_format.sql
SELECT format_type(to_regtype('interval second(0)'), 
to_regtypmod('interval second(0)'));
SQL

for f in \
parse_type.sql \
parse_type_and_format.sql \
to_regtypmod.sql \
to_regtypmod_and_format.sql
do
pgbench -n -f "$f" -T10 postgres
echo
done

pgbench output:

pgbench (17devel)
transaction type: parse_type.sql
scaling factor: 1
query mode: simple
number of clients: 1
number of threads: 1
maximum number of tries: 1
duration: 10 s
number of transactions actually processed: 277017
number of failed transactions: 0 (0.000%)
latency average = 0.036 ms
initial connection time = 1.623 ms
tps = 27706.005513 (without initial connection time)

pgbench (17devel)
transaction type: parse_type_and_format.sql
scaling factor: 1
query mode: simple
number of clients: 1
number of threads: 1
maximum number of tries: 1
duration: 10 s
number of transactions actually processed: 222487
number of failed transactions: 0 (0.000%)
latency average = 0.045 ms
initial connection time = 1.603 ms
tps = 22252.095670 (without initial connection time)

pgbench (17devel)
transaction type: to_regtypmod.sql
scaling factor: 1
query mode: simple
number of clients: 1
number of threads: 1
maximum number of tries: 1
duration: 10 s
number of transactions actually processed: 276134
number of failed transactions: 0 (0.000%)
latency average = 0.036 ms
initial connection time = 1.570 ms
tps = 27617.628259 (without initial connection time)

pgbench (17devel)
transaction type: to_regtypmod_and_format.sql
scaling factor: 1
query mode: simple
number of clients: 1
number of threads: 1
maximum number of tries: 1
duration: 10 s
number of transactions actually processed: 270820
number of failed transactions: 0 (0.000%)
latency average = 0.037 ms
initial connection time = 1.631 ms
tps = 27086.331104 (without initial connection time)

-- 
Erik
>From 0b60432a84d63a7fccaae0fe123a0aa2ae67493b Mon Sep 17 00:00:00 2001
From: Erik Wienhold 
Date: Sun, 18 Feb 2024 17:33:35 +0100
Subject: [PATCH] Add to_regtypmod() for benchmarking against parse_type()

---
 src/backend/utils/adt/regproc.c   | 18 ++
 src/include/catalog/pg_proc.dat   |  3 ++
 src/test/regress/expected/regproc.out | 51 +++
 src/test/regress/sql/regproc.sql  | 26 ++
 4 files changed, 98 insertions(+)

diff --git a/src/backend/utils/adt/regproc.c b/src/backend/utils/adt/regproc.c
index 4fee27a139..6285dc7192 100644
--- 

Running the fdw test from the terminal crashes into the core-dump

2024-02-18 Thread Alena Rybakina

Hi, hackers!

After starting the server (initdb + pg_ctl start) I ran a regress test,
create_misc.sql ('\i src/test/regress/sql/create_misc.sql'), and after
that I ran the fdw test ('\i contrib/postgres_fdw/sql/postgres_fdw.sql')
in psql, and it crashed with a core dump due to a failed assertion.


To be honest, the failure occurred only if the fdw extension had not
been installed earlier.



script to reproduce the error:

./configure CFLAGS='-g -ggdb -O0' --enable-debug --enable-cassert 
--prefix=`pwd`/tmp_install && make -j8 -s install


export CDIR=$(pwd)
export PGDATA=$CDIR/postgres_data
rm -r $PGDATA
mkdir $PGDATA
${CDIR}/tmp_install/bin/initdb -D $PGDATA >> log.txt
${CDIR}/tmp_install/bin/pg_ctl -D $PGDATA -l logfile start

${CDIR}/tmp_install/bin/psql -d postgres -f 
src/test/regress/sql/create_misc.sql &&
${CDIR}/tmp_install/bin/psql -d postgres -f 
contrib/postgres_fdw/sql/postgres_fdw.sql



The query, where the problem is:

-- MERGE ought to fail cleanly
merge into itrtest using (select 1, 'foo') as source on (true)
  when matched then do nothing;

Coredump:

#5  0x555d1451f483 in ExceptionalCondition 
(conditionName=0x555d146bba13 "requiredPerms != 0", 
fileName=0x555d146bb7b0 "execMain.c",

    lineNumber=654) at assert.c:66
#6  0x555d14064ebe in ExecCheckOneRelPerms (perminfo=0x555d1565ef90) 
at execMain.c:654
#7  0x555d14064d94 in ExecCheckPermissions 
(rangeTable=0x555d1565eef0, rteperminfos=0x555d1565efe0, 
ereport_on_violation=true) at execMain.c:623
#8  0x555d140652ca in InitPlan (queryDesc=0x555d156cde50, eflags=0) 
at execMain.c:850
#9  0x555d140644a8 in standard_ExecutorStart 
(queryDesc=0x555d156cde50, eflags=0) at execMain.c:266
#10 0x555d140641ec in ExecutorStart (queryDesc=0x555d156cde50, 
eflags=0) at execMain.c:145

#11 0x555d1432c025 in ProcessQuery (plan=0x555d1565f3e0,
    sourceText=0x555d1551b020 "merge into itrtest using (select 1, 
'foo') as source on (true)\n  when matched then do nothing;", params=0x0,

    queryEnv=0x0, dest=0x555d1565f540, qc=0x7fffc9454080) at pquery.c:155
#12 0x555d1432dbd8 in PortalRunMulti (portal=0x555d15597ef0, 
isTopLevel=true, setHoldSnapshot=false, dest=0x555d1565f540, 
altdest=0x555d1565f540,

    qc=0x7fffc9454080) at pquery.c:1277
#13 0x555d1432d0cf in PortalRun (portal=0x555d15597ef0, 
count=9223372036854775807, isTopLevel=true, run_once=true, 
dest=0x555d1565f540,

    altdest=0x555d1565f540, qc=0x7fffc9454080) at pquery.c:791
#14 0x555d14325ec8 in exec_simple_query (
    query_string=0x555d1551b020 "merge into itrtest using (select 1, 
'foo') as source on (true)\n  when matched then do nothing;") at 
postgres.c:1273
#15 0x555d1432ae4c in PostgresMain (dbname=0x555d1ab0 "aaa", 
username=0x555d1a98 "alena") at postgres.c:4645
#16 0x555d14244a5d in BackendRun (port=0x555d1554b3b0) at 
postmaster.c:4440
#17 0x555d14244072 in BackendStartup (port=0x555d1554b3b0) at 
postmaster.c:4116

#18 0x555d142405c5 in ServerLoop () at postmaster.c:1768
#19 0x555d1423fec2 in PostmasterMain (argc=3, argv=0x555d1547fcf0) 
at postmaster.c:1467

#20 0x555d140f3122 in main (argc=3, argv=0x555d1547fcf0) at main.c:198

This error is consistently reproducible.

I wasn't able to fully figure out the cause, but it seems that this 
operation should not be allowed on partitioned tables at all?


alena@postgres=# SELECT relname, relkind FROM pg_class where 
relname='itrtest';

 relname | relkind
-+-
 itrtest | p
(1 row)

--
Regards,
Alena Rybakina
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company





Re: Removing unneeded self joins

2024-02-18 Thread Alexander Lakhin

18.02.2024 19:18, Alexander Korotkov wrote:

Attached is a draft patch fixing this query. Could you, please, recheck?


Yes, this patch fixes the behavior for that query (I've also tried several
similar queries). Though I don't know the code well enough to judge the
code change.

Best regards,
Alexander





Re: Memory consumed by child SpecialJoinInfo in partitionwise join planning

2024-02-18 Thread Tomas Vondra
Hi,

I took a quick look at this patch today. I certainly agree with the
intent to reduce the amount of memory during planning, assuming it's not
overly disruptive. And I think this patch is fairly localized and looks
sensible.

That being said, I'm not a big fan of using a local variable on the stack
and filling it. I'd probably go with the usual palloc/pfree, because that
makes it much easier to use - the callers would not be responsible for
allocating the SpecialJoinInfo struct. Sure, it's a little bit of
overhead, but with the AllocSet caching I doubt it's measurable.

I did put this through check-world on amd64/arm64, with valgrind,
without any issue. I also tried the scripts shared by Ashutosh in his
initial message (with some minor fixes, adding MEMORY to explain etc).

The results with the 20240130 patches are like this:

   tables    master    patched
  -----------------------------
   2           40.8       39.9
   3          151.7      142.6
   4          464.0      418.5
   5         1663.9     1419.5

That's certainly a nice improvement, and it even reduces the amount of
time for planning (the 5-table join goes from 18s to 17s on my laptop).
That's nice, although 17 seconds for planning is not ... great.

That being said, the amount of remaining memory needed by planning is
still pretty high - we save ~240MB for a join of 5 tables, but we still
need ~1.4GB. Yes, this is a bit of an extreme example, and it probably is not
very common to join 5 tables with 1000 partitions each ...

Do we know which other places consume the 1.4GB of memory?
Considering my recent thread about scalability, where malloc() turned
out to be one of the main culprits, I wonder if maybe there's a lot to
gain by reducing the memory usage ... Our attitude to memory usage is
that it doesn't really matter if we keep it allocated for a bit, because
we'll free it shortly. And that may be true for "modest" memory usage,
but with 1.4GB that doesn't seem great, and the malloc overhead can be
pretty bad.

regards

-- 
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company




Re: Thoughts about NUM_BUFFER_PARTITIONS

2024-02-18 Thread Tomas Vondra
On 2/18/24 03:30, Li Japin wrote:
> 
> 
>> On Feb 10, 2024, at 20:15, Tomas Vondra  
>> wrote:
>>
>> On 2/8/24 14:27, wenhui qiu wrote:
>>> Hi Heikki Linnakangas
>>> I think the larger the shared buffers, the higher the probability of
>>> multiple backend processes accessing the same bucket slot's BufMappingLock
>>> simultaneously (InitBufTable(NBuffers + NUM_BUFFER_PARTITIONS)). When I
>>> have free time, I want to do this test. I have seen some tests, but the
>>> result report is in Chinese
>>>
>>
>> I think Heikki is right this is unrelated to the amount of RAM. The
>> partitions are meant to reduce the number of lock collisions when
>> multiple processes try to map a buffer concurrently. But the machines
>> got much larger in this regard too - in 2006 the common CPUs had maybe
>> 2-4 cores, now it's common to have CPUs with ~100 cores, and systems
>> with multiple of them. OTOH the time spent holding the partition lock
>> should be pretty low, IIRC we pretty much just pin the buffer before
>> releasing it, and the backend should do plenty of other expensive stuff. So
>> who knows how many backends end up doing the locking at the same time.
>>
>> OTOH, with 128 partitions it takes just 14 backends to have 50% chance
>> of a conflict, so with enough cores ... But how many partitions would be
>> enough? With 1024 partitions it still takes only 38 backends to get 50%
>> chance of a collision. Better, but considering we now have hundreds of
>> cores, not sure if sufficient.
>>
>> (Obviously, we probably want much lower probability of a collision, I
>> only used 50% to illustrate the changes).
>>
> 
> It seems we need to change MAX_SIMUL_LWLOCKS if we enlarge
> NUM_BUFFER_PARTITIONS, but I didn't find any comments describing the
> relation between MAX_SIMUL_LWLOCKS and NUM_BUFFER_PARTITIONS.
> Am I missing something?

IMHO the relationship is pretty simple - MAX_SIMUL_LWLOCKS needs to be
higher than NUM_BUFFER_PARTITIONS, so that the backend can acquire all
the partition locks if needed.

There are other places that acquire a bunch of locks, and all of them need
to be careful not to exceed MAX_SIMUL_LWLOCKS. For example gist has
GIST_MAX_SPLIT_PAGES.
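
As an aside, the 50% figures quoted above follow from the standard
birthday approximation; a quick standalone check (plain C, purely
illustrative, not PostgreSQL code):

#include <stdio.h>

/* Chance that at least two of n backends hit the same of p partitions,
 * assuming independent, uniformly random partition choices. */
static double
collision_prob(int p, int n)
{
	double		none = 1.0;

	for (int i = 0; i < n; i++)
		none *= (double) (p - i) / p;
	return 1.0 - none;
}

int
main(void)
{
	printf("128 partitions, 14 backends:  %.3f\n", collision_prob(128, 14));
	printf("1024 partitions, 38 backends: %.3f\n", collision_prob(1024, 38));
	return 0;
}

Both come out near 0.5, matching the quoted figures.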


regards

-- 
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company




Re: Removing unneeded self joins

2024-02-18 Thread Alexander Korotkov
On Sun, Feb 18, 2024 at 5:04 PM Alexander Korotkov  wrote:
> On Sun, Feb 18, 2024 at 3:00 PM Alexander Lakhin  wrote:
> > 09.01.2024 01:09, Alexander Korotkov wrote:
> >
> > Fixed in 30b4955a46.
> >
> >
> > Please look at the following query which fails with an error since
> > d3d55ce57:
> >
> > create table t (i int primary key);
> >
> > select t3.i from t t1
> >  join t t2 on t1.i = t2.i,
> >  lateral (select t1.i limit 1) t3;
> >
> > ERROR:  non-LATERAL parameter required by subquery
>
> Thank you for spotting.  I'm looking at this.

Attached is a draft patch fixing this query.  Could you, please, recheck?

--
Regards,
Alexander Korotkov


0001-Replace-relids-in-lateral-subquery-target-list-du-v1.patch
Description: Binary data


Re: Removing unneeded self joins

2024-02-18 Thread Alexander Korotkov
On Sun, Feb 18, 2024 at 3:00 PM Alexander Lakhin  wrote:
> 09.01.2024 01:09, Alexander Korotkov wrote:
>
> Fixed in 30b4955a46.
>
>
> Please look at the following query which fails with an error since
> d3d55ce57:
>
> create table t (i int primary key);
>
> select t3.i from t t1
>  join t t2 on t1.i = t2.i,
>  lateral (select t1.i limit 1) t3;
>
> ERROR:  non-LATERAL parameter required by subquery

Thank you for spotting.  I'm looking at this.

--
Regards,
Alexander Korotkov




RE: Synchronizing slots from primary to standby

2024-02-18 Thread Zhijie Hou (Fujitsu)
On Friday, February 16, 2024 6:41 PM shveta malik  
wrote:
> 
> On Thu, Feb 15, 2024 at 10:48 PM Bertrand Drouvot
>  wrote:
> >
> > Looking at v88_0001, random comments:
> 
> Thanks for the feedback.
> 
> >
> > 1 ===
> >
> > Commit message "Be enabling slot synchronization"
> >
> > Typo? s:Be/By
> 
> Modified.
> 
> > 2 ===
> >
> > +It enables a physical standby to synchronize logical failover slots
> > +from the primary server so that logical subscribers are not blocked
> > +after failover.
> >
> > Not sure "not blocked" is the right wording.
> > "can be resumed from the new primary" maybe? (was discussed in [1])
> 
> Modified.
> 
> > 3 ===
> >
> > +#define SlotSyncWorkerAllowed()\
> > +   (sync_replication_slots && pmState == PM_HOT_STANDBY && \
> > +SlotSyncWorkerCanRestart())
> >
> > Maybe add a comment above the macro explaining the logic?
> 
> Done.
> 
> > 4 ===
> >
> > +#include "replication/walreceiver.h"
> >  #include "replication/slotsync.h"
> >
> > should be reverse order?
> 
> Removed walreceiver.h inclusion as it was not needed.
> 
> > 5 ===
> >
> > +   if (SlotSyncWorker->syncing)
> > {
> > -   SpinLockRelease(>mutex);
> > +   SpinLockRelease(>mutex);
> > ereport(ERROR,
> >
> errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
> > errmsg("cannot synchronize replication
> slots concurrently"));
> > }
> >
> > worth to add a test in 040_standby_failover_slots_sync.pl for it?
> 
> It will be very difficult to stabilize this test as we have to make sure that 
> the
> concurrent users (SQL function(s) and/or worker(s)) are in that target 
> function at
> the same time to hit it.
> 
> >
> > 6 ===
> >
> > +static void
> > +slotsync_reread_config(bool restart)
> > +{
> >
> > worth to add test(s) in 040_standby_failover_slots_sync.pl for it?
> 
> Added test.
> 
> Please find v89  patch set. The other changes are:

Thanks for the patch. Here are few comments:

1.

+static char *
+get_dbname_from_conninfo(const char *conninfo)
+{
+   static char *dbname;
+
+   if (dbname)
+   return dbname;
+   else
+   dbname = walrcv_get_dbname_from_conninfo(conninfo);
+
+   return dbname;
+}


I think it's not necessary to have a static variable here, because the GUC
check doesn't seem performance-sensitive. Additionally, it does not work well
with the slotsync SQL functions, because the dbname will be allocated in a
temporary memory context when reaching here via a SQL function, so it's not
safe to access this static variable in the next function call.
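
A minimal sketch of that simplification (assuming
walrcv_get_dbname_from_conninfo() stays the underlying call):

static char *
get_dbname_from_conninfo(const char *conninfo)
{
	/*
	 * No static cache: the GUC check is not performance-sensitive, and the
	 * result may live in a short-lived memory context when we get here via
	 * a SQL function.
	 */
	return walrcv_get_dbname_from_conninfo(conninfo);
}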

2.

+static bool
+validate_remote_info(WalReceiverConn *wrconn, int elevel)
...
+
+   return (!remote_in_recovery && primary_slot_valid);

primary_slot_valid could be left uninitialized in this function.
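
A trivial fix would be to initialize both flags at declaration, e.g.:

	bool		remote_in_recovery = false;
	bool		primary_slot_valid = false;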

Best Regards,
Hou zj


Re: PGC_SIGHUP shared_buffers?

2024-02-18 Thread Matthias van de Meent
On Sun, 18 Feb 2024 at 02:03, Andres Freund  wrote:
>
> Hi,
>
> On 2024-02-17 23:40:51 +0100, Matthias van de Meent wrote:
> > > 5. Re-map the shared_buffers when needed.
> > >
> > > Between transactions, a backend should not hold any buffer pins. When
> > > there are no pins, you can munmap() the shared_buffers and mmap() it at
> > > a different address.
>
> I hadn't quite realized that we don't seem to rely on shared_buffers having a
> specific address across processes. That does seem to make it more viable to
> remap mappings in backends.
>
>
> However, I don't think this works with mmap(MAP_ANONYMOUS) - as long as we are
> using the process model. To my knowledge there is no way to get the same
> mapping in multiple already existing processes. Even mmap()ing /dev/zero after
> sharing file descriptors across processes doesn't work, if I recall correctly.
>
> We would have to use sysv/posix shared memory or such (or mmap() if files in
> tmpfs) for the shared buffers allocation.
>
>
>
> > This can quite realistically fail to find an unused memory region of
> > sufficient size when the heap is sufficiently fragmented, e.g. through
> > ASLR, which would make it difficult to use this dynamic
> > single-allocation shared_buffers in security-hardened environments.
>
> I haven't seen anywhere close to this bad fragmentation on 64bit machines so
> far - have you?

No.

> Most implementations of ASLR randomize mmap locations across multiple runs of
> the same binary, not within the same binary. There are out-of-tree linux
> patches that make mmap() randomize every single allocation, but I am not sure
> that we ought to care about such things.

After looking into ASLR a bit more, I realise I was under the mistaken
impression that ASLR would imply randomized mmap()s, too.
Apparently, that's wrong; ASLR only randomizes the initialization of the
process memory layout, not the process' subsequent allocations.

> Even if we were to care, on 64bit platforms it doesn't seem likely that we'd
> run out of space that quickly.  AMD64 had 48bits of virtual address space from
> the start, and on recent CPUs that has grown to 57bits [1], that's a lot of
> space.

Yeah, that's a lot of space, but it seems to me it's also easily
consumed; one only needs to allocate one allocation in every 4GB of
address space to make allocations of 8GB impossible; a utilization of
~1 byte/MiB. Applying this to 48 bits of virtual address space, a
process only needs to use ~256MB of memory across the address space to
block out any 8GB allocations; for 57 bits that's still "only" 128GB.
But after looking at ASLR a bit more, it is unrealistic that a normal
OS and process stack would get to allocating memory in such a pattern.
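
For reference, the arithmetic behind those numbers (one minimal 4 KiB
allocation every 4 GiB is enough to block every contiguous 8 GiB window);
a standalone check in C:

#include <stdio.h>

int
main(void)
{
	for (int bits = 48; bits <= 57; bits += 9)
	{
		/* one 4 KiB page per 4 GiB of virtual address space */
		unsigned long long pages = (1ULL << bits) >> 32;

		printf("%d-bit VA space: %llu pages, %llu MiB resident\n",
			   bits, pages, pages * 4096 >> 20);
	}
	return 0;
}

This prints 256 MiB for 48 bits and 131072 MiB (128 GiB) for 57 bits.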

> And if you do run out of VM space, wouldn't that also affect lots of other
> things, like mmap() for malloc?

Yes. But I would usually expect that the main shared memory allocation
would be the single largest uninterrupted allocation, so I'd also
expect it to see more such issues than any current user of memory if
we were to start moving (reallocating) that allocation.

Kind regards,

Matthias van de Meent
Neon (https://neon.tech)




Re: PGC_SIGHUP shared_buffers?

2024-02-18 Thread Konstantin Knizhnik


On 16/02/2024 10:37 pm, Thomas Munro wrote:

On Fri, Feb 16, 2024 at 5:29 PM Robert Haas  wrote:

3. Reserve lots of address space and then only use some of it. I hear
rumors that some forks of PG have implemented something like this. The
idea is that you convince the OS to give you a whole bunch of address
space, but you try to avoid having all of it be backed by physical
memory. If you later want to increase shared_buffers, you then get the
OS to back more of it by physical memory, and if you later want to
decrease shared_buffers, you hopefully have some way of giving the OS
the memory back. As compared with the previous two approaches, this
seems less likely to be noticeable to most PG code. Problems include
(1) you have to somehow figure out how much address space to reserve,
and that forms an upper bound on how big shared_buffers can grow at
runtime and (2) you have to figure out ways to reserve address space
and back more or less of it with physical memory that will work on all
of the platforms that we currently support or might want to support in
the future.

FTR I'm aware of a working experimental prototype along these lines,
that will be presented in Vancouver:

https://www.pgevents.ca/events/pgconfdev2024/sessions/session/31-enhancing-postgresql-plasticity-new-frontiers-in-memory-management/


If you are interested - this is my attempt to implement resizable shared 
buffers based on ballooning:


https://github.com/knizhnik/postgres/pull/2

Unused memory is returned to the OS using `madvise` (so it is not a very 
portable solution); see the sketch below.
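
A simplified sketch of the mechanism (not the actual PR code;
Linux-specific, and note that for shared, tmpfs-backed segments
MADV_REMOVE may be needed instead of MADV_DONTNEED to actually release
memory):

#include <stddef.h>
#include <sys/mman.h>

/* Reserve address space for the maximal shared buffers up front. */
static void *
reserve_buffer_pool(size_t max_bytes)
{
	void	   *base = mmap(NULL, max_bytes, PROT_READ | PROT_WRITE,
							MAP_SHARED | MAP_ANONYMOUS, -1, 0);

	return (base == MAP_FAILED) ? NULL : base;
}

/*
 * Give the pages behind the unused tail back to the kernel.  The address
 * range stays mapped, so it can be faulted back in later when the number
 * of available buffers grows again.
 */
static int
shrink_buffer_pool(void *base, size_t avail_bytes, size_t max_bytes)
{
	return madvise((char *) base + avail_bytes,
				   max_bytes - avail_bytes, MADV_DONTNEED);
}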


Unfortunately, there are many data structures in Postgres whose size
depends on the number of buffers.
In my PR I am using a `GetAvailableBuffers()` function instead of
`NBuffers`, but it doesn't always help because many of these data
structures cannot be reallocated.


Other important limitations of this approach are:

1. It is necessary to specify the maximal number of shared buffers.
2. Only the `BufferBlocks` space is shrunk, but not the buffer descriptors
and buffer hash. The estimated memory footprint for one page is 132 bytes,
so if we want to scale shared buffers from 100Mb to 100Gb, the size of the
used memory will be 1.6Gb, which is quite large.
3. Our CLOCK algorithm becomes very inefficient for a large number of
shared buffers.


Below are the first results I get (pgbench database with scale 100, 
pgbench -c 32 -j 4 -T 100 -P1 -M prepared -S):


| shared_buffers | available_buffers | TPS  |
| -------------- | ----------------- | ---- |
|          128MB |                -1 | 280k |
|            1GB |                -1 | 324k |
|            2GB |                -1 | 358k |
|           32GB |                -1 | 350k |
|            2GB |             128Mb | 130k |
|            2GB |               1Gb | 311k |
|           32GB |             128Mb |  13k |
|           32GB |               1Gb | 140k |
|           32GB |               2Gb | 348k |

`shared_buffers` specifies the maximal shared buffers size and 
`available_buffers` the current limit.


So when shared_buffers >> available_buffers and the dataset doesn't fit in 
them, we get an awful degradation of performance (> 20 times), thanks to 
the CLOCK algorithm.

My first thought was to replace CLOCK with an LRU based on a doubly linked 
list. As there is no lockless doubly-linked-list implementation, it needs 
some global lock, and this lock can become a bottleneck. The standard 
solution is partitioning: use N LRU lists instead of 1, just like the 
partitioned hash table used by the buffer manager to look up buffers. 
Actually, we can use the same partition locks to protect the LRU lists. 
But it is not clear what to do with ring buffers (strategies). So I decided 
not to perform such a revolution in bufmgr, but to optimize CLOCK to skip 
reserved buffers more efficiently: just add a skip_count field to the 
buffer descriptor. And it helps! Now the worst case 
shared_buffers/available_buffers = 32Gb/128Mb shows the same performance, 
280k, as shared_buffers=128Mb without ballooning.




Re: Removing unneeded self joins

2024-02-18 Thread Alexander Lakhin

Hi Alexander,

09.01.2024 01:09, Alexander Korotkov wrote:

Fixed in 30b4955a46.


Please look at the following query which fails with an error since
d3d55ce57:

create table t (i int primary key);

select t3.i from t t1
 join t t2 on t1.i = t2.i,
 lateral (select t1.i limit 1) t3;

ERROR:  non-LATERAL parameter required by subquery

Best regards,
Alexander

Re: PGC_SIGHUP shared_buffers?

2024-02-18 Thread Robert Haas
On Sat, Feb 17, 2024 at 1:54 AM Heikki Linnakangas  wrote:
> A variant of this approach:
>
> 5. Re-map the shared_buffers when needed.
>
> Between transactions, a backend should not hold any buffer pins. When
> there are no pins, you can munmap() the shared_buffers and mmap() it at
> a different address.

I really like this idea, but I think Andres has latched onto the key
issue, which is that it supposes that the underlying shared memory
object upon which shared_buffers is based can be made bigger and
smaller, and that doesn't work for anonymous mappings AFAIK.
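
For illustration, with a file-backed segment (say, from shm_open(), which
sidesteps the anonymous-mapping problem) the remap step itself is small; a
minimal POSIX sketch, assuming the caller holds no buffer pins and
re-derives every cached pointer afterwards:

#include <sys/mman.h>
#include <unistd.h>

static void *
remap_buffer_pool(int shm_fd, void *old_base,
				  size_t old_size, size_t new_size)
{
	/* grow or shrink the underlying segment first */
	if (ftruncate(shm_fd, (off_t) new_size) != 0)
		return NULL;
	if (munmap(old_base, old_size) != 0)
		return NULL;
	/* the kernel picks a new address; nothing may cache the old one */
	return mmap(NULL, new_size, PROT_READ | PROT_WRITE,
				MAP_SHARED, shm_fd, 0);
}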

Maybe that's not really a problem any more, though. If we don't depend
on the address of shared_buffers anywhere, we could move it into a
DSM. Now that the stats collector uses DSM, it's surely already a
requirement that DSM works on every machine that runs PostgreSQL.

We'd still need to do something about the buffer mapping table,
though, and I bet dshash is not a reasonable answer on performance
grounds.

Also, it would be nice if the granularity of resizing could be
something less than a whole transaction, because transactions can run
for a long time. We don't really need to wait for a transaction
boundary, probably -- a time when we hold zero buffer pins will
probably happen a lot sooner, and at least some of those should be
safe points at which to remap.

Then again, somebody can open a cursor, read from it until it holds a
pin, and then either idle the connection or make it do arbitrary
amounts of unrelated work, forcing the remapping to be postponed for
an arbitrarily long time. But some version of this problem will exist
in any approach to this problem, and long-running pins are a nuisance
for other reasons, too. We probably just have to accept this sort of
issue as a limitation of our implementation.

-- 
Robert Haas
EDB: http://www.enterprisedb.com




Re: PGC_SIGHUP shared_buffers?

2024-02-18 Thread Robert Haas
On Sat, Feb 17, 2024 at 12:38 AM Andres Freund  wrote:
> IMO the ability to *shrink* shared_buffers dynamically and cheaply is more
> important than growing it in a way, except that they are related of
> course. Idling hardware is expensive, thus overcommitting hardware is very
> attractive (I count "serverless" as part of that). To be able to overcommit
> effectively, unused long-lived memory has to be released. I.e. shared buffers
> needs to be shrinkable.

I see your point, but people want to scale up, too. Of course, those
people will have to live with what we can practically implement.

> Perhaps worth noting that there are two things limiting the size of shared
> buffers: 1) the available buffer space 2) the available buffer *mapping*
> space. I think making the buffer mapping resizable is considerably harder than
> the buffers themselves. Of course pre-reserving memory for a buffer mapping
> suitable for a huge shared_buffers is more feasible than pre-allocating all
> that memory for the buffers themselves. But it'd still mean you'd have a
> maximum set at server start.

We size the fsync queue based on shared_buffers too. That's a lot less
important, though, and could be worked around in other ways.

> Such a scheme still leaves you with a dependent memory read for a quite
> frequent operation. It could turn out to not matter hugely if the mapping
> array is cache resident, but I don't know if we can realistically bank on
> that.

I don't know, either. I was hoping you did. :-)

But we can rig up a test pretty easily, I think. We can just create a
fake mapping that gives the same answers as the current calculation
and then beat on it. Of course, if testing shows no difference, there
is the small problem of knowing whether the test scenario was right;
and it's also possible that an initial impact could be mitigated by
removing some gratuitously repeated buffer # -> buffer address
mappings. Still, I think it could provide us with a useful baseline.
I'll throw something together when I have time, unless someone beats
me to it.
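
For concreteness, the shape of the comparison might be something like this
(hypothetical names, not the actual patch; the point is the extra
dependent load in the chunked variant):

#include <stddef.h>

#define BLCKSZ 8192
#define CHUNK_SHIFT 17			/* 2^17 buffers per chunk = 1 GB */

static char *chunk_base[1024];	/* hypothetical chunk address table */

/* today's scheme: pure arithmetic on a single base pointer */
static inline char *
buffer_addr_direct(char *base, unsigned buf)
{
	return base + (size_t) buf * BLCKSZ;
}

/* chunked scheme: one extra dependent memory read per lookup */
static inline char *
buffer_addr_mapped(unsigned buf)
{
	return chunk_base[buf >> CHUNK_SHIFT] +
		(size_t) (buf & ((1U << CHUNK_SHIFT) - 1)) * BLCKSZ;
}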

> I'm also somewhat concerned about the coarse granularity being problematic. It
> seems like it'd lead to a desire to make the granule small, causing slowness.

How many people set shared_buffers to something that's not a whole
number of GB these days? I mean I bet it happens, but in practice if
you rounded to the nearest GB, or even the nearest 2GB, I bet almost
nobody would really care. I think it's fine to be opinionated here and
hold the line at a relatively large granule, even though in theory
people could want something else.

Alternatively, maybe there could be a provision for the last granule
to be partial, and if you extend further, you throw away the partial
granule and replace it with a whole one. But I'm not even sure that's
worth doing.

> One big advantage of a scheme like this is that it'd be a step towards a NUMA
> aware buffer mapping and replacement. Practically everything beyond the size
> of a small consumer device these days has NUMA characteristics, even if not
> "officially visible". We could make clock sweeps (or a better victim buffer
> selection algorithm) happen within each "chunk", with some additional
> infrastructure to choose which of the chunks to search a buffer in. Using a
> chunk on the current numa node, except when there is a lot of imbalance
> between buffer usage or replacement rate between chunks.

I also wondered whether this might be a useful step toward allowing
different-sized buffers in the same buffer pool (ducks, runs away
quickly). I don't have any particular use for that myself, but it's a
thing some people probably want for some reason or other.

> > 2. Make a Buffer just a disguised pointer. Imagine something like
> > typedef struct { Page bp; } *buffer. WIth this approach,
> > BufferGetBlock() becomes trivial.
>
> You also additionally need something that allows for efficient iteration over
> all shared buffers. Making buffer replacement and checkpointing more expensive
> isn't great.

True, but I don't really see what the problem with this would be in
this approach.

> > 3. Reserve lots of address space and then only use some of it. I hear
> > rumors that some forks of PG have implemented something like this. The
> > idea is that you convince the OS to give you a whole bunch of address
> > space, but you try to avoid having all of it be backed by physical
> > memory. If you later want to increase shared_buffers, you then get the
> > OS to back more of it by physical memory, and if you later want to
> > decrease shared_buffers, you hopefully have some way of giving the OS
> > the memory back. As compared with the previous two approaches, this
> > seems less likely to be noticeable to most PG code.
>
> Another advantage is that you can shrink shared buffers fairly granularly and
> cheaply with that approach, compared to having to move buffers entirely out of
> a larger mapping to be able to unmap it.

Don't you have to still 

Re: Things I don't like about \du's "Attributes" column

2024-02-18 Thread Pavel Luzanov

On 17.02.2024 00:37, David G. Johnston wrote:
On Mon, Feb 12, 2024 at 2:29 PM Pavel Luzanov 
 wrote:


  regress_du_role1 | no  | Inherit     | no  | 2024-12-31 00:00:00+03(invalid) | 50          | Group role without password but with valid until
  regress_du_role2 | yes | Inherit     | yes |                                 | Not allowed | No connections allowed
  regress_du_role3 | yes |             | yes |                                 | 10          | User without attributes
  regress_du_su    | yes | Superuser  +| yes |                                 | 3(ignored)  | Superuser but with connection limit


Per the recent bug report, we should probably add something like 
(ignored) after the 50 connections for role1 since they are not 
allowed to login so the value is indeed ignored.


Hm, but the same logic applies to "Password?" and "Valid until" for role1 
without the login attribute.
The challenge is how to display it for unprivileged users, since they can't 
see password information.
So, displaying 'Valid until' as '(irrelevant)' for privileged users and the 
real value for others looks bad.

What can be done in this situation:
0. Show different values as described above.
1. Don't show 'Valid until' for unprivileged users at all, with the same 
logic as for 'Password?'.
With a possible exception: a user can see 'Valid until' for himself.
Maybe too complicated?

2. Tom's advice ("Not sure it's worth worrying about"):
Show real values for 'Valid until' and 'Connection limit' without any hints.

3. The best solution, which I can't see now.

--
Pavel Luzanov
Postgres Professional: https://postgrespro.com


Re: Properly pathify the union planner

2024-02-18 Thread Andy Fan


David Rowley  writes:

>>
>> The two if-clauses looks unnecessary, it should be handled by
>> pathkeys_count_contained_in already. The same issue exists in
>> pathkeys_useful_for_ordering as well. Attached patch fix it in master.
>
> I agree.  I'd rather not have those redundant checks in
> pathkeys_useful_for_setop(), and I do want those functions to be as
> similar as possible.  So I think adjusting it in master is a good
> idea.
>
> I've pushed your patch.
>
Thanks for pushing it!


-- 
Best Regards
Andy Fan





Re: Why is pq_begintypsend so slow?

2024-02-18 Thread Sutou Kouhei
Hi,

In <20240218015955.rmw5mcmobt5hb...@awork3.anarazel.de>
  "Re: Why is pq_begintypsend so slow?" on Sat, 17 Feb 2024 17:59:55 -0800,
  Andres Freund  wrote:

> v3-0008-wip-make-in-out-send-recv-calls-in-copy.c-cheaper.patch

It seems that this is an alternative approach to [1].

[1] 
https://www.postgresql.org/message-id/flat/20240215.153421.96888103784986803.kou%40clear-code.com#34df359e6d255795d16814ce138cc995


+typedef struct CopyInAttributeInfo
+{
+   AttrNumber  num;
+   const char *name;
+
+   Oid typioparam;
+   int32   typmod;
+
+   FmgrInfoin_finfo;
+   union
+   {
+   FunctionCallInfoBaseData fcinfo;
+   charfcinfo_data[SizeForFunctionCallInfo(3)];
+   }   in_fcinfo;

Do we need one FunctionCallInfoBaseData for each attribute?
How about sharing one FunctionCallInfoBaseData among all
attributes, as in [1]?
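
For reference, a rough sketch of the shared-fcinfo shape from [1]
(simplified; real code also needs the soft-error handling that
InputFunctionCallSafe() provides, and attr_count, string, and att are
stand-ins for the surrounding NextCopyFrom() locals):

	/* one fcinfo for the whole COPY, re-pointed per attribute */
	FunctionCallInfo fcinfo = palloc0(SizeForFunctionCallInfo(3));

	InitFunctionCallInfoData(*fcinfo, NULL, 3, InvalidOid,
							 (Node *) cstate->escontext, NULL);

	for (int m = 0; m < attr_count; m++)
	{
		fcinfo->flinfo = &in_functions[m];
		fcinfo->args[0].value = CStringGetDatum(string);
		fcinfo->args[0].isnull = false;
		fcinfo->args[1].value = ObjectIdGetDatum(typioparams[m]);
		fcinfo->args[1].isnull = false;
		fcinfo->args[2].value = Int32GetDatum(att->atttypmod);
		fcinfo->args[2].isnull = false;
		fcinfo->isnull = false;
		values[m] = FunctionCallInvoke(fcinfo);
	}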


@@ -956,20 +956,47 @@ NextCopyFrom(CopyFromState cstate, ExprContext *econtext,
 
 			values[m] = ExecEvalExpr(defexprs[m], econtext, &nulls[m]);
}
-
-		/*
-		 * If ON_ERROR is specified with IGNORE, skip rows with soft
-		 * errors
-		 */
-		else if (!InputFunctionCallSafe(&in_functions[m],
-										string,
-										typioparams[m],
-										att->atttypmod,
-										(Node *) cstate->escontext,
-										&values[m]))

Inlining InputFunctionCallSafe() here to use a pre-initialized
fcinfo will decrease maintainability, because we abstract the
InputFunctionCall family in fmgr.c. If we define an
InputFunctionCall variant here, we need to change both fmgr.c and
this code whenever the InputFunctionCall family changes.
How about defining the details in fmgr.c and calling them here
instead, like [1]?

+		fcinfo->args[0].value = CStringGetDatum(string);
+		fcinfo->args[0].isnull = false;
+		fcinfo->args[1].value = ObjectIdGetDatum(attr->typioparam);
+		fcinfo->args[1].isnull = false;
+		fcinfo->args[2].value = Int32GetDatum(attr->typmod);
+		fcinfo->args[2].isnull = false;

I think that "fcinfo->isnull = false;" is also needed like
[1].

@@ -1966,7 +1992,7 @@ CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo *flinfo,
 	if (fld_size == -1)
 	{
 		*isnull = true;
-		return ReceiveFunctionCall(flinfo, NULL, typioparam, typmod);
+		return ReceiveFunctionCall(fcinfo->flinfo, NULL, attr->typioparam, attr->typmod);

Why isn't the pre-initialized fcinfo used here?


Thanks,
-- 
kou




Re: serial not accepted as datatype in ALTER TABLE ... ALTER COLUMN

2024-02-18 Thread Andy Fan

Hi Ashutosh, 

> data_type is described on that page as "Data type of the new column,
> or new data type for an existing column." but CREATE TABLE
> documentation [2] redirects data_type to [3], which mentions serial.
> The impression created by the documentation is the second statement
> above is a valid statement as should not throw an error; instead
> change the data type of the column (and create required sequence).

I didn't find a reason not to support it; if there is one, I think it
would be better to have some explanation in the documentation.

> In code ATPrepAlterColumnType() calls typenameTypeIdAndMod(), whereas
> transformColumnDefinition() (called for ALTER TABLE ... ADD COLUMN and
> CREATE TABLE) handles "serial" data type separately. Looks like we are
> missing a call to transformColumnDefinition() in
> transformAlterTableStmt() under case AT_AlterColumnType.

I tried your idea with the attachment. It is still in a draft state,
but it can be used as a proof of concept and as a basis for further
discussion.  One point worth mentioning: serial implies a "default
value" plus a "not null" constraint, so when we modify a column into
serial, we need to replace only the NULL values with the default value
at the RewriteTable stage.

-- 
Best Regards
Andy Fan

>From 4485a9af33c9c7d493e23c2804bde4c15df0b79a Mon Sep 17 00:00:00 2001
From: "yizhi.fzh" 
Date: Sun, 18 Feb 2024 16:14:29 +0800
Subject: [PATCH v0 1/1] POC: Support ALTER TABLE tableName ALTER COLUMN col
 type serial.

---
 src/backend/commands/tablecmds.c  | 60 +--
 src/backend/parser/gram.y |  1 +
 src/backend/parser/parse_utilcmd.c| 41 -
 src/include/nodes/parsenodes.h|  1 +
 src/include/parser/parse_utilcmd.h|  4 +-
 src/test/regress/expected/alter_table.out | 70 +--
 src/test/regress/sql/alter_table.sql  | 20 +--
 7 files changed, 180 insertions(+), 17 deletions(-)

diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index 46ece07338..7e13b04efa 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -231,6 +231,8 @@ typedef struct NewColumnValue
 	Expr	   *expr;			/* expression to compute */
 	ExprState  *exprstate;		/* execution state */
 	bool		is_generated;	/* is it a GENERATED expression? */
+	bool		new_on_null;	/* set the new value only when the old value
+ * is NULL. */
 } NewColumnValue;
 
 /*
@@ -5594,7 +5596,8 @@ ATParseTransformCmd(List **wqueue, AlteredTableInfo *tab, Relation rel,
 	 atstmt,
 	 context->queryString,
 									 &beforeStmts,
-									 &afterStmts);
+									 &afterStmts,
+									 context);
 
 	/* Execute any statements that should happen before these subcommand(s) */
 	foreach(lc, beforeStmts)
@@ -6230,10 +6233,11 @@ ATRewriteTable(AlteredTableInfo *tab, Oid OIDNewHeap, LOCKMODE lockmode)
 	if (ex->is_generated)
 		continue;
 
-	newslot->tts_values[ex->attnum - 1]
-		= ExecEvalExpr(ex->exprstate,
-	   econtext,
-	   >tts_isnull[ex->attnum - 1]);
+	if (!ex->new_on_null || oldslot->tts_isnull[ex->attnum - 1])
+		newslot->tts_values[ex->attnum - 1]
+			= ExecEvalExpr(ex->exprstate,
+		   econtext,
+		   >tts_isnull[ex->attnum - 1]);
 }
 
 ExecStoreVirtualTuple(newslot);
@@ -13284,6 +13288,7 @@ ATPrepAlterColumnType(List **wqueue,
 		newval->attnum = attnum;
 		newval->expr = (Expr *) transform;
 		newval->is_generated = false;
+		newval->new_on_null = def->from_serial;
 
 		tab->newvals = lappend(tab->newvals, newval);
 		if (ATColumnChangeRequiresRewrite(transform, attnum))
@@ -13495,6 +13500,48 @@ ATExecAlterColumnType(AlteredTableInfo *tab, Relation rel,
 	HeapTuple	depTup;
 	ObjectAddress address;
 
+	if (def->from_serial)
+	{
+		/*
+		 * Store the DEFAULT for the from_serial
+		 */
+		/* XXX: copy from ATExecAddColumn */
+		RawColumnDefault *rawEnt;
+
+		heapTup = SearchSysCacheCopyAttName(RelationGetRelid(rel), colName);
+		attTup = (Form_pg_attribute) GETSTRUCT(heapTup);
+
+		rawEnt = (RawColumnDefault *) palloc(sizeof(RawColumnDefault));
+		rawEnt->attnum = attTup->attnum;
+		rawEnt->raw_default = copyObject(def->raw_default);
+
+		/*
+		 * Attempt to skip a complete table rewrite by storing the specified
+		 * DEFAULT value outside of the heap.  This may be disabled inside
+		 * AddRelationNewConstraints if the optimization cannot be applied.
+		 */
+		rawEnt->missingMode = (!def->generated);
+
+		rawEnt->generated = def->generated;
+
+		/*
+		 * This function is intended for CREATE TABLE, so it processes a
+		 * _list_ of defaults, but we just do one.
+		 */
+		AddRelationNewConstraints(rel, list_make1(rawEnt), NIL,
+  false, true, false, NULL);
+
+		/* Make the additional catalog changes visible */
+		CommandCounterIncrement();
+
+		/*
+		 * Did the request for a missing value work? If not we'll have to do a
+		 * rewrite
+		 */
+		if (!rawEnt->missingMode)
+