Re: Synchronizing slots from primary to standby
Hi, On Sat, Feb 17, 2024 at 10:10:18AM +0530, Amit Kapila wrote: > On Fri, Feb 16, 2024 at 4:10 PM shveta malik wrote: > > > > On Thu, Feb 15, 2024 at 10:48 PM Bertrand Drouvot > > wrote: > > > > > 5 === > > > > > > + if (SlotSyncWorker->syncing) > > > { > > > - SpinLockRelease(>mutex); > > > + SpinLockRelease(>mutex); > > > ereport(ERROR, > > > > > > errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), > > > errmsg("cannot synchronize replication > > > slots concurrently")); > > > } > > > > > > worth to add a test in 040_standby_failover_slots_sync.pl for it? > > > > It will be very difficult to stabilize this test as we have to make > > sure that the concurrent users (SQL function(s) and/or worker(s)) are > > in that target function at the same time to hit it. > > > > Yeah, I also think would be tricky to write a stable test, maybe one > can explore using a new injection point facility but I don't think it > is worth for this error check as this appears straightforward to be > broken in the future by other changes. Yeah, injection point would probably be the way to go. Agree that's probably not worth adding such a test (we can change our mind later on if needed anyway). Regards, -- Bertrand Drouvot PostgreSQL Contributors Team RDS Open Source Databases Amazon Web Services: https://aws.amazon.com
Re: Injection points: some tools to wait and wake
On Mon, Feb 19, 2024 at 03:01:40PM +0900, Michael Paquier wrote: > 0002 is a polished version of the TAP test that makes use of this > facility, providing coverage for the bug fixed by 7863ee4def65 > (reverting this commit causes the test to fail), where a restart point > runs across a promotion request. The trick is to stop the > checkpointer in the middle of a restart point and issue a promotion > in-between. The CF bot has been screaming at this one on Windows because the process started with IPC::Run::start was not properly finished, so attached is an updated version to bring that back to green. -- Michael From ef27d6eba619f5106915de6b05cbeb5294263c30 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Mon, 19 Feb 2024 14:35:55 +0900 Subject: [PATCH v2 1/2] injection_points: Add routines to wait and wake processes This commit is made of two parts: - A new callback that can be attached to a process to make it wait on a condition variable. The condition checked is registered in shared memory by the module injection_points. - A new SQL function to update the shared state and broadcast the update using a condition variable. The shared state used by the module is registered using the DSM registry, and is optional. --- .../injection_points--1.0.sql | 10 ++ .../injection_points/injection_points.c | 104 ++ src/tools/pgindent/typedefs.list | 1 + 3 files changed, 115 insertions(+) diff --git a/src/test/modules/injection_points/injection_points--1.0.sql b/src/test/modules/injection_points/injection_points--1.0.sql index 5944c41716..20479991f2 100644 --- a/src/test/modules/injection_points/injection_points--1.0.sql +++ b/src/test/modules/injection_points/injection_points--1.0.sql @@ -24,6 +24,16 @@ RETURNS void AS 'MODULE_PATHNAME', 'injection_points_run' LANGUAGE C STRICT PARALLEL UNSAFE; +-- +-- injection_points_wake() +-- +-- Wakes a condition variable waited on in an injection point. 
+-- +CREATE FUNCTION injection_points_wake() +RETURNS void +AS 'MODULE_PATHNAME', 'injection_points_wake' +LANGUAGE C STRICT PARALLEL UNSAFE; + -- -- injection_points_detach() -- diff --git a/src/test/modules/injection_points/injection_points.c b/src/test/modules/injection_points/injection_points.c index e843e6594f..3a319b1525 100644 --- a/src/test/modules/injection_points/injection_points.c +++ b/src/test/modules/injection_points/injection_points.c @@ -18,18 +18,67 @@ #include "postgres.h" #include "fmgr.h" +#include "storage/condition_variable.h" #include "storage/lwlock.h" #include "storage/shmem.h" +#include "storage/dsm_registry.h" #include "utils/builtins.h" #include "utils/injection_point.h" #include "utils/wait_event.h" PG_MODULE_MAGIC; +/* Shared state information for injection points. */ +typedef struct InjectionPointSharedState +{ + /* protects accesses to wait_counts */ + slock_t lock; + + /* Counter advancing when injection_points_wake() is called */ + int wait_counts; + + /* + * Condition variable that can be used in an injection point, checking + * upon wait_counts when waiting. + */ + ConditionVariable wait_point; +} InjectionPointSharedState; + +/* Pointer to shared-memory state. 
*/ +static InjectionPointSharedState *inj_state = NULL; + +/* Wait event when waiting on condition variable */ +static uint32 injection_wait_event = 0; + extern PGDLLEXPORT void injection_error(const char *name); extern PGDLLEXPORT void injection_notice(const char *name); +extern PGDLLEXPORT void injection_wait(const char *name); +static void +injection_point_init_state(void *ptr) +{ + InjectionPointSharedState *state = (InjectionPointSharedState *) ptr; + + state->wait_counts = 0; + SpinLockInit(>lock); + ConditionVariableInit(>wait_point); +} + +static void +injection_init_shmem(void) +{ + bool found; + + if (inj_state != NULL) + return; + + inj_state = GetNamedDSMSegment("injection_points", + sizeof(InjectionPointSharedState), + injection_point_init_state, + ); +} + /* Set of callbacks available to be attached to an injection point. */ void injection_error(const char *name) @@ -43,6 +92,38 @@ injection_notice(const char *name) elog(NOTICE, "notice triggered for injection point %s", name); } +/* Wait on a condition variable, awaken by injection_points_wake() */ +void +injection_wait(const char *name) +{ + int old_wait_counts; + + if (inj_state == NULL) + injection_init_shmem(); + if (injection_wait_event == 0) + injection_wait_event = WaitEventExtensionNew("injection_wait"); + + SpinLockAcquire(_state->lock); + old_wait_counts = inj_state->wait_counts; + SpinLockRelease(_state->lock); + + /* And sleep.. */ + ConditionVariablePrepareToSleep(_state->wait_point); + for (;;) + { + int new_wait_counts; + + SpinLockAcquire(_state->lock); + new_wait_counts = inj_state->wait_counts; + SpinLockRelease(_state->lock); + + if (old_wait_counts != new_wait_counts) + break; +
Re: Fix race condition in InvalidatePossiblyObsoleteSlot()
Hi, On Mon, Feb 19, 2024 at 03:14:07PM +0900, Michael Paquier wrote: > On Thu, Feb 15, 2024 at 11:24:59AM +, Bertrand Drouvot wrote: > > Thanks for looking at it! > > > > Agree, using an assertion instead in v3 attached. > > And you did not forget the PG_USED_FOR_ASSERTS_ONLY. Thanks to the "CompilerWarnings" cirrus test ;-) > > > > It seems important to document why we are saving this data here; while > > > we hold the LWLock for repslots, before we perform any termination, > > > and we want to protect conflict reports with incorrect data if the > > > slot data got changed while the lwlock is temporarily released while > > > there's a termination. > > > > Yeah, comments added in v3. > > The contents look rather OK, I may do some word-smithing for both. Thanks! > >> For example just after the kill()? It seems to me > >> that we should stuck the checkpointer, perform a forced upgrade of the > >> xmins, release the checkpointer and see the effects of the change in > >> the second loop. Now, modules/injection_points/ does not allow that, > >> yet, but I've posted a patch among these lines that can stop a process > >> and release it using a condition variable (should be 0006 on [1]). I > >> was planning to start a new thread with a patch posted for the next CF > >> to add this kind of facility with a regression test for an old bug, > >> the patch needs a few hours of love, first. I should be able to post > >> that next week. > > > > Great, that looks like a good idea! > > I've been finally able to spend some time on what I had in mind and > posted it here for the next CF: > https://www.postgresql.org/message-id/zdluxbk5hgpol...@paquier.xyz > > You should be able to reuse that the same way I do in 0002 posted on > the thread, where I'm causing the checkpointer to wait, then wake it > up. Thanks! I'll look at it. > > Are we going to fix this on master and 16 stable first and then later on > > add a > > test on master with the injection points? 
> > Still, the other patch is likely going to take a couple of weeks > before getting into the tree, so I have no objection to fix the bug > first and backpatch, then introduce a test. Things have proved that > failures could show up in the buildfarm in this area, so more time > running this code across two branches is not a bad concept, either. Fully agree. Regards, -- Bertrand Drouvot PostgreSQL Contributors Team RDS Open Source Databases Amazon Web Services: https://aws.amazon.com
Re: Show WAL write and fsync stats in pg_stat_io
Hi, On Thu, 18 Jan 2024 at 04:22, Michael Paquier wrote: > > On Wed, Jan 17, 2024 at 03:20:39PM +0300, Nazir Bilal Yavuz wrote: > > I agree with your points. While the other I/O related work is > > happening we can discuss what we should do in the variable op_byte > > cases. Also, this is happening only for WAL right now but if we try to > > extend pg_stat_io in the future, that problem possibly will rise > > again. So, it could be good to come up with a general solution, not > > only for WAL. > > Okay, I've marked the patch as RwF for this CF. I wanted to inform you that the 73f0a13266 commit changed all WALRead calls to read variable bytes, only the WAL receiver was reading variable bytes before. -- Regards, Nazir Bilal Yavuz Microsoft
Re: Add pg_basetype() function to obtain a DOMAIN base type
On Sun, Feb 18, 2024 at 7:29 AM Tomas Vondra wrote: > > > Also, now that I looked at the v2 patch again, I see it only really > tweaked the pg_proc.dat entry, but the code still does PG_GETARG_OID (so > the "any" bit is not really correct). > PG_GETARG_OID part indeed is wrong. so I change to following: +Datum +pg_basetype(PG_FUNCTION_ARGS) +{ + Oid oid; + + oid = get_fn_expr_argtype(fcinfo->flinfo, 0); + if (!SearchSysCacheExists1(TYPEOID, ObjectIdGetDatum(oid))) + PG_RETURN_NULL(); + + PG_RETURN_OID(getBaseType(oid)); +} I still name the function as pg_basetype, feel free to change it. + + + + pg_basetype + +pg_basetype ( "any" ) +regtype + + + Returns the OID of the base type of a domain or if the argument is a basetype it returns the same type. + If there's a chain of domain dependencies, it will recurse until finding the base type. + compared with pg_typeof's explanation, I feel like pg_basetype's explanation doesn't seem accurate. However, I don't know how to rephrase it. From a06f2de575da6e5fa45919c792f3dab2470f4927 Mon Sep 17 00:00:00 2001 From: jian he Date: Mon, 19 Feb 2024 15:01:19 +0800 Subject: [PATCH v3 1/1] Add pg_basetype("any") function Currently obtaining the base type of a domain involves a complex SQL query, this specially in the case of recursive domain types. To solve this, use the already available getBaseType() function, and expose it as the pg_basetype SQL function. if the argument is not a doamin type, return the type of the argument as is. if the argument is a type of doamin, pg_basetype will recurse the domain dependencies chain and return the real inner base type of the domain. 
discussion: https://postgr.es/m/CAGRrpzZSX8j=MQcbCSEisFA=ic=K3bknVfnFjAv1diVJxFHJvg@mail.gmail.com --- doc/src/sgml/func.sgml | 25 +++ src/backend/utils/adt/misc.c | 16 + src/include/catalog/pg_proc.dat | 3 +++ src/test/regress/expected/domain.out | 36 src/test/regress/sql/domain.sql | 16 + 5 files changed, 96 insertions(+) diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index cf3de803..5af0f4c6 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -24778,6 +24778,31 @@ SELECT pg_type_is_visible('myschema.widget'::regtype); + + + + pg_basetype + +pg_basetype ( "any" ) +regtype + + + Returns the OID of the base type of a domain or if the argument is a basetype it returns the same type. + If there's a chain of domain dependencies, it will recurse until finding the base type. + + +For example: + +CREATE DOMAIN mytext as text; + +SELECT pg_basetype('mytext'::mytext); + pg_basetype +--- + text + + + + diff --git a/src/backend/utils/adt/misc.c b/src/backend/utils/adt/misc.c index 2d7d7806..30f6a2e5 100644 --- a/src/backend/utils/adt/misc.c +++ b/src/backend/utils/adt/misc.c @@ -45,6 +45,7 @@ #include "utils/fmgroids.h" #include "utils/lsyscache.h" #include "utils/ruleutils.h" +#include "utils/syscache.h" #include "utils/timestamp.h" @@ -566,6 +567,21 @@ pg_typeof(PG_FUNCTION_ARGS) PG_RETURN_OID(get_fn_expr_argtype(fcinfo->flinfo, 0)); } +/* + * Return the base type of the argument. + * iff the argument is not a type of domain, return the argument's type as is. 
+ */ +Datum +pg_basetype(PG_FUNCTION_ARGS) +{ + Oid oid; + + oid = get_fn_expr_argtype(fcinfo->flinfo, 0); + if (!SearchSysCacheExists1(TYPEOID, ObjectIdGetDatum(oid))) + PG_RETURN_NULL(); + + PG_RETURN_OID(getBaseType(oid)); +} /* * Implementation of the COLLATE FOR expression; returns the collation diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 9c120fc2..bbd6b411 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -3877,6 +3877,9 @@ { oid => '1619', descr => 'type of the argument', proname => 'pg_typeof', proisstrict => 'f', provolatile => 's', prorettype => 'regtype', proargtypes => 'any', prosrc => 'pg_typeof' }, +{ oid => '6312', descr => 'get the base type of a domain', + proname => 'pg_basetype', proisstrict => 'f', provolatile => 's', + prorettype => 'regtype', proargtypes => 'any', prosrc => 'pg_basetype' }, { oid => '3162', descr => 'collation of the argument; implementation of the COLLATION FOR expression', proname => 'pg_collation_for', proisstrict => 'f', provolatile => 's', diff --git a/src/test/regress/expected/domain.out b/src/test/regress/expected/domain.out index 6d94e844..13bf7877 100644 --- a/src/test/regress/expected/domain.out +++ b/src/test/regress/expected/domain.out @@ -1207,3 +1207,39 @@ create domain testdomain1 as int constraint unsigned check (value > 0); alter domain testdomain1 rename constraint unsigned to
Add an option to skip loading missing publication to avoid logical replication failure
Hi, Currently ALTER SUBSCRIPTION ... SET PUBLICATION will break the logical replication in certain cases. This can happen as the apply worker will get restarted after SET PUBLICATION, the apply worker will use the existing slot and replication origin corresponding to the subscription. Now, it is possible that before restart the origin has not been updated and the WAL start location points to a location prior to where PUBLICATION pub exists which can lead to such an error. Once this error occurs, apply worker will never be able to proceed and will always return the same error. There was discussion on this and Amit had posted a patch to handle this at [2]. Amit's patch does continue using a historic snapshot but ignores publications that are not found for the purpose of computing RelSyncEntry attributes. We won't mark such an entry as valid till all the publications are loaded without anything missing. This means we won't publish operations on tables corresponding to that publication till we found such a publication and that seems okay. I have added an option skip_not_exist_publication to enable this operation only when skip_not_exist_publication is specified as true. There is no change in default behavior when skip_not_exist_publication is specified as false. 
But one thing to note with the patch (with skip_not_exist_publication option) is that replication of few WAL entries will be skipped till the publication is loaded like in the below example: -- Create table in publisher and subscriber create table t1(c1 int); create table t2(c1 int); -- Create publications create publication pub1 for table t1; create publication pub2 for table t2; -- Create subscription create subscription test1 connection 'dbname=postgres host=localhost port=5432' publication pub1, pub2; -- Drop one publication drop publication pub1; -- Insert in the publisher insert into t1 values(11); insert into t2 values(21); -- Select in subscriber postgres=# select * from t1; c1 (0 rows) postgres=# select * from t2; c1 21 (1 row) -- Create the dropped publication in publisher create publication pub1 for table t1; -- Insert in the publisher insert into t1 values(12); postgres=# select * from t1; c1 11 12 (2 rows) -- Select data in subscriber postgres=# select * from t1; -- record with value 11 will be missing in subscriber c1 12 (1 row) Thoughts? [1] - https://www.postgresql.org/message-id/CAA4eK1%2BT-ETXeRM4DHWzGxBpKafLCp__5bPA_QZfFQp7-0wj4Q%40mail.gmail.com Regards, Vignesh From 9b79dee26554ae4af9ff00948ab185482b071ba8 Mon Sep 17 00:00:00 2001 From: Vignesh C Date: Mon, 19 Feb 2024 10:20:02 +0530 Subject: [PATCH v1 1/2] Skip loading the publication if the publication does not exist. Skip loading the publication if the publication does not exist. 
--- src/backend/replication/pgoutput/pgoutput.c | 28 +++-- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/src/backend/replication/pgoutput/pgoutput.c b/src/backend/replication/pgoutput/pgoutput.c index 998f92d671..f7b6d0384d 100644 --- a/src/backend/replication/pgoutput/pgoutput.c +++ b/src/backend/replication/pgoutput/pgoutput.c @@ -82,7 +82,7 @@ static void pgoutput_stream_prepare_txn(LogicalDecodingContext *ctx, static bool publications_valid; -static List *LoadPublications(List *pubnames); +static List *LoadPublications(List *pubnames, bool *skipped); static void publication_invalidation_cb(Datum arg, int cacheid, uint32 hashvalue); static void send_relation_and_attrs(Relation relation, TransactionId xid, @@ -1703,9 +1703,13 @@ pgoutput_shutdown(LogicalDecodingContext *ctx) /* * Load publications from the list of publication names. + * + * Here, we just skip the publications that don't exist yet. 'skipped' + * will be true if we find any publication from the given list that doesn't + * exist. 
*/ static List * -LoadPublications(List *pubnames) +LoadPublications(List *pubnames, bool *skipped) { List *result = NIL; ListCell *lc; @@ -1713,9 +1717,12 @@ LoadPublications(List *pubnames) foreach(lc, pubnames) { char *pubname = (char *) lfirst(lc); - Publication *pub = GetPublicationByName(pubname, false); + Publication *pub = GetPublicationByName(pubname, true); - result = lappend(result, pub); + if (pub) + result = lappend(result, pub); + else + *skipped = true; } return result; @@ -1994,7 +2001,7 @@ get_rel_sync_entry(PGOutputData *data, Relation relation) } /* Validate the entry */ - if (!entry->replicate_valid) + if (!entry->replicate_valid || !publications_valid) { Oid schemaId = get_rel_namespace(relid); List *pubids = GetRelationPublications(relid); @@ -2011,6 +2018,7 @@ get_rel_sync_entry(PGOutputData *data, Relation relation) bool am_partition = get_rel_relispartition(relid); char relkind = get_rel_relkind(relid); List *rel_publications = NIL; + bool skipped_pub = false; /* Reload publications if needed before use.
Re: A new message seems missing a punctuation
On Mon, Feb 19, 2024 at 12:14 PM Amit Kapila wrote: > > On Mon, Feb 19, 2024 at 11:42 AM Robert Haas wrote: > > > > On Mon, Feb 19, 2024 at 11:04 AM Kyotaro Horiguchi > > wrote: > > > Or I thought the values could be moved to DETAILS: line. > > > > Yeah, I think that's likely to be the right approach here. The details > > aren't too clear to me. > > > > Does the primary error message really need to say "could not sync > > slot"? If it will be obvious from context that we were trying to sync > > a slot, then it would be fine to just say "ERROR: remote slot precedes > > local slot". > > > > As this is a LOG message, I feel one may need some more information on > the context but it is not mandatory. > > > But I also don't quite understand what problem this is trying to > > report. Is this slot-syncing code running on the primary or the > > standby? If it's running on the primary, then surely it's expected > > that the remote slot will precede the local one. And if it's running > > on the standby, then the comments in > > update_and_persist_local_synced_slot about waiting for the remote side > > to catch up seem quite confusing, because surely we're chasing the > > primary and not the other way around? > > > > The local's restart_lsn could be ahead of than primary's for the very > first sync when the WAL corresponding to the remote's restart_lsn is > not available on standby (say due to a different wal related settings > the required WAL has been removed when we first time tried to sync the > slot). For more details, you can refer to comments atop slotsync.c > starting from "If the WAL corresponding to the remote's restart_lsn > ..." > Sorry, I gave the wrong reference, the comments I was referring to start with: "If on physical standby, the WAL corresponding ...". -- With Regards, Amit Kapila.
Re: pg_upgrade and logical replication
On Mon, Feb 19, 2024 at 6:54 AM Hayato Kuroda (Fujitsu) wrote: > > Thanks for reviewing! PSA new version. > Pushed this after making minor changes in the comments. -- With Regards, Amit Kapila.
Re: Add last_commit_lsn to pg_stat_database
On Sun, Feb 18, 2024 at 02:28:06AM +0100, Tomas Vondra wrote: > Thanks for the updated patch. I don't have a clear opinion on the > feature and whether this is the way to implement it, but I have two > simple questions. Some users I know of would be really happy to be able to get this information for each database in a single view, so that feels natural to plug the information into pg_stat_database. > 1) Do we really need to modify RecordTransactionCommitPrepared and > XactLogCommitRecord to return the LSN of the commit record? Can't we > just use XactLastRecEnd? It's good enough for advancing > replorigin_session_origin_lsn, why wouldn't it be good enough for this? Or XactLastCommitEnd? The changes in AtEOXact_PgStat() are not really attractive for what's a static variable in all the backends. > 2) Earlier in this thread it was claimed the function returning the > last_commit_lsn is STABLE, but I have to admit it's not clear to me why > that's the case :-( All the pg_stat_get_db_* functions are marked as > stable, so I guess it's thanks to the pgstat system ... The consistency of the shared stats data depends on stats_fetch_consistency. The default of "cache" makes sure that the values don't change across a scan, until the end of a transaction. I have not paid much attention about that until now, but note that it would not be the case of "none" where the data is retrieved each time it is requested. So it seems to me that these functions should be actually volatile, not stable, because they could deliver different values across the same scan as an effect of the design behind pgstat_fetch_entry() and a non-default stats_fetch_consistency. I may be missing something, of course. -- Michael signature.asc Description: PGP signature
Re: A new message seems missing a punctuation
On Mon, Feb 19, 2024 at 11:42 AM Robert Haas wrote: > > On Mon, Feb 19, 2024 at 11:04 AM Kyotaro Horiguchi > wrote: > > Or I thought the values could be moved to DETAILS: line. > > Yeah, I think that's likely to be the right approach here. The details > aren't too clear to me. > > Does the primary error message really need to say "could not sync > slot"? If it will be obvious from context that we were trying to sync > a slot, then it would be fine to just say "ERROR: remote slot precedes > local slot". > As this is a LOG message, I feel one may need some more information on the context but it is not mandatory. > But I also don't quite understand what problem this is trying to > report. Is this slot-syncing code running on the primary or the > standby? If it's running on the primary, then surely it's expected > that the remote slot will precede the local one. And if it's running > on the standby, then the comments in > update_and_persist_local_synced_slot about waiting for the remote side > to catch up seem quite confusing, because surely we're chasing the > primary and not the other way around? > The local's restart_lsn could be ahead of than primary's for the very first sync when the WAL corresponding to the remote's restart_lsn is not available on standby (say due to a different wal related settings the required WAL has been removed when we first time tried to sync the slot). For more details, you can refer to comments atop slotsync.c starting from "If the WAL corresponding to the remote's restart_lsn ..." > But if we ignore all of that, then we could just do this: > > ERROR: could not sync slot information as remote slot precedes local slot > DETAIL: Remote slot has LSN %X/%X and catalog xmin %u, but remote slot > has LSN %X/%X and catalog xmin %u. > This looks good to me but instead of ERROR here we want to use LOG. -- With Regards, Amit Kapila.
Re: Add system identifier to backup manifest
On Mon, Feb 19, 2024 at 4:22 AM Michael Paquier wrote: > On Thu, Feb 15, 2024 at 05:41:46PM +0530, Robert Haas wrote: > > On Thu, Feb 15, 2024 at 3:05 PM Amul Sul wrote: > > > Kindly have a look at the attached version. > > > > IMHO, 0001 looks fine, except probably the comment could be phrased a > > bit more nicely. > > And the new option should be documented at the top of the init() > routine for perldoc. > Added in the attached version. > > That can be left for whoever commits this to > > wordsmith. Michael, what are your plans? > > Not much, so feel free to not wait for me. I've just read through the > patch because I like the idea/feature :) > Thank you, that helped a lot. > 0002 seems like a reasonable approach, but there's a hunk in the wrong > > patch: 0004 modifies pg_combinebackup's check_control_files to use > > get_dir_controlfile() rather than git_controlfile(), but it looks to > > me like that should be part of 0002. > Fixed in the attached version. > I'm slightly concerned about 0002 that silently changes the meaning of > get_controlfile(). That would impact extension code without people > knowing about it when compiling, just when they run their stuff under > 17~. > Agreed, now they will have an error as _could not read file "": Is a directory_. But, IIUC, that what usually happens with the dev version, and the extension needs to be updated for compatibility with the newer version for the same reason. 
Regards, Amul From 354014538b16579f005bd6f4ce771b1aa22b5e02 Mon Sep 17 00:00:00 2001 From: Amul Sul Date: Thu, 15 Feb 2024 12:10:23 +0530 Subject: [PATCH v8 1/4] Add option to force initdb in PostgreSQL::Test::Cluster:init() --- src/bin/pg_combinebackup/t/005_integrity.pl | 19 +++ src/test/perl/PostgreSQL/Test/Cluster.pm| 15 ++- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/bin/pg_combinebackup/t/005_integrity.pl b/src/bin/pg_combinebackup/t/005_integrity.pl index 3b445d0e30f..5d425209211 100644 --- a/src/bin/pg_combinebackup/t/005_integrity.pl +++ b/src/bin/pg_combinebackup/t/005_integrity.pl @@ -18,18 +18,13 @@ $node1->init(has_archiving => 1, allows_streaming => 1); $node1->append_conf('postgresql.conf', 'summarize_wal = on'); $node1->start; -# Set up another new database instance. We don't want to use the cached -# INITDB_TEMPLATE for this, because we want it to be a separate cluster -# with a different system ID. -my $node2; -{ - local $ENV{'INITDB_TEMPLATE'} = undef; - - $node2 = PostgreSQL::Test::Cluster->new('node2'); - $node2->init(has_archiving => 1, allows_streaming => 1); - $node2->append_conf('postgresql.conf', 'summarize_wal = on'); - $node2->start; -} +# Set up another new database instance with force initdb option. We don't want +# to initializing database system by copying initdb template for this, because +# we want it to be a separate cluster with a different system ID. +my $node2 = PostgreSQL::Test::Cluster->new('node2'); +$node2->init(force_initdb => 1, has_archiving => 1, allows_streaming => 1); +$node2->append_conf('postgresql.conf', 'summarize_wal = on'); +$node2->start; # Take a full backup from node1. my $backup1path = $node1->backup_dir . 
'/backup1'; diff --git a/src/test/perl/PostgreSQL/Test/Cluster.pm b/src/test/perl/PostgreSQL/Test/Cluster.pm index 07da74cf562..2b4f9a48365 100644 --- a/src/test/perl/PostgreSQL/Test/Cluster.pm +++ b/src/test/perl/PostgreSQL/Test/Cluster.pm @@ -495,6 +495,10 @@ a directory that's only accessible to the current user to ensure that. On Windows, we use SSPI authentication to ensure the same (by pg_regress --config-auth). +force_initdb => 1 will force to initialized the cluster by initdb. Otherwise, if +available and if there aren't any parameters, use a previously initdb'd cluster +as a template by copying it. + WAL archiving can be enabled on this node by passing the keyword parameter has_archiving => 1. This is disabled by default. @@ -517,6 +521,7 @@ sub init local %ENV = $self->_get_env(); + $params{force_initdb} = 0 unless defined $params{force_initdb}; $params{allows_streaming} = 0 unless defined $params{allows_streaming}; $params{has_archiving} = 0 unless defined $params{has_archiving}; @@ -529,14 +534,14 @@ sub init mkdir $self->backup_dir; mkdir $self->archive_dir; - # If available and if there aren't any parameters, use a previously - # initdb'd cluster as a template by copying it. For a lot of tests, that's - # substantially cheaper. Do so only if there aren't parameters, it doesn't - # seem worth figuring out whether they affect compatibility. + # For a lot of tests, that's substantially cheaper to copy previously + # initdb'd cluster as a template. Do so only if force_initdb => 0, and there + # aren't parameters, it doesn't seem worth figuring out whether they affect + # compatibility. # # There's very similar code in pg_regress.c, but we can't easily # deduplicate it until we require perl at build time. -
Re: Fix race condition in InvalidatePossiblyObsoleteSlot()
On Thu, Feb 15, 2024 at 11:24:59AM +, Bertrand Drouvot wrote: > Thanks for looking at it! > > Agree, using an assertion instead in v3 attached. And you did not forget the PG_USED_FOR_ASSERTS_ONLY. > > It seems important to document why we are saving this data here; while > > we hold the LWLock for repslots, before we perform any termination, > > and we want to protect conflict reports with incorrect data if the > > slot data got changed while the lwlock is temporarily released while > > there's a termination. > > Yeah, comments added in v3. The contents look rather OK, I may do some word-smithing for both. >> For example just after the kill()? It seems to me >> that we should stuck the checkpointer, perform a forced upgrade of the >> xmins, release the checkpointer and see the effects of the change in >> the second loop. Now, modules/injection_points/ does not allow that, >> yet, but I've posted a patch among these lines that can stop a process >> and release it using a condition variable (should be 0006 on [1]). I >> was planning to start a new thread with a patch posted for the next CF >> to add this kind of facility with a regression test for an old bug, >> the patch needs a few hours of love, first. I should be able to post >> that next week. > > Great, that looks like a good idea! I've been finally able to spend some time on what I had in mind and posted it here for the next CF: https://www.postgresql.org/message-id/zdluxbk5hgpol...@paquier.xyz You should be able to reuse that the same way I do in 0002 posted on the thread, where I'm causing the checkpointer to wait, then wake it up. > Are we going to fix this on master and 16 stable first and then later on add a > test on master with the injection points? Still, the other patch is likely going to take a couple of weeks before getting into the tree, so I have no objection to fix the bug first and backpatch, then introduce a test. 
Things have proved that failures could show up in the buildfarm in this area, so more time running this code across two branches is not a bad concept, either. -- Michael signature.asc Description: PGP signature
Re: A new message seems missing a punctuation
On Mon, Feb 19, 2024 at 11:04 AM Kyotaro Horiguchi wrote: > Or I thought the values could be moved to DETAILS: line. Yeah, I think that's likely to be the right approach here. The details aren't too clear to me. Does the primary error message really need to say "could not sync slot"? If it will be obvious from context that we were trying to sync a slot, then it would be fine to just say "ERROR: remote slot precedes local slot". But I also don't quite understand what problem this is trying to report. Is this slot-syncing code running on the primary or the standby? If it's running on the primary, then surely it's expected that the remote slot will precede the local one. And if it's running on the standby, then the comments in update_and_persist_local_synced_slot about waiting for the remote side to catch up seem quite confusing, because surely we're chasing the primary and not the other way around? But if we ignore all of that, then we could just do this: ERROR: could not sync slot information as remote slot precedes local slot DETAIL: Remote slot has LSN %X/%X and catalog xmin %u, but local slot has LSN %X/%X and catalog xmin %u. which would fix the original complaint, and my point about using English rather than just printing a bunch of values. -- Robert Haas EDB: http://www.enterprisedb.com
Injection points: some tools to wait and wake
Hi all, (Ashutosh in CC as he was involved in the discussion last time.) I have proposed on the original thread related to injection points to have more stuff to be able to wait at an arbitrary point and wake at will the process waiting so as it is possible to control the order of actions taken in a test: https://www.postgresql.org/message-id/ZTiV8tn_MIb_H2rE%40paquier.xyz I didn't do that in the other thread out of time, but here is a patch set to complete what I wanted, using a condition variable to wait and wake processes: - State is in shared memory, using a DSM tracked by the registry and an integer counter. - Callback to wait on a condition variable. - SQL function to update the shared state and broadcast the update to the condition variable. - Use a custom wait event to track the wait in pg_stat_activity. 0001 requires no backend changes, only more stuff into the test module injection_points so that could be backpatched assuming that the backend is able to support injection points. This could be expanded into using more variables and/or states, but I don't really see a point in introducing more without a reason to do so, and I have no need for more at the moment. 0002 is a polished version of the TAP test that makes use of this facility, providing coverage for the bug fixed by 7863ee4def65 (reverting this commit causes the test to fail), where a restart point runs across a promotion request. The trick is to stop the checkpointer in the middle of a restart point and issue a promotion in-between. Thoughts and comments are welcome. -- Michael From ef27d6eba619f5106915de6b05cbeb5294263c30 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Mon, 19 Feb 2024 14:35:55 +0900 Subject: [PATCH 1/2] injection_points: Add routines to wait and wake processes This commit is made of two parts: - A new callback that can be attached to a process to make it wait on a condition variable. The condition checked is registered in shared memory by the module injection_points. 
- A new SQL function to update the shared state and broadcast the update using a condition variable. The shared state used by the module is registered using the DSM registry, and is optional. --- .../injection_points--1.0.sql | 10 ++ .../injection_points/injection_points.c | 104 ++ src/tools/pgindent/typedefs.list | 1 + 3 files changed, 115 insertions(+) diff --git a/src/test/modules/injection_points/injection_points--1.0.sql b/src/test/modules/injection_points/injection_points--1.0.sql index 5944c41716..20479991f2 100644 --- a/src/test/modules/injection_points/injection_points--1.0.sql +++ b/src/test/modules/injection_points/injection_points--1.0.sql @@ -24,6 +24,16 @@ RETURNS void AS 'MODULE_PATHNAME', 'injection_points_run' LANGUAGE C STRICT PARALLEL UNSAFE; +-- +-- injection_points_wake() +-- +-- Wakes a condition variable waited on in an injection point. +-- +CREATE FUNCTION injection_points_wake() +RETURNS void +AS 'MODULE_PATHNAME', 'injection_points_wake' +LANGUAGE C STRICT PARALLEL UNSAFE; + -- -- injection_points_detach() -- diff --git a/src/test/modules/injection_points/injection_points.c b/src/test/modules/injection_points/injection_points.c index e843e6594f..3a319b1525 100644 --- a/src/test/modules/injection_points/injection_points.c +++ b/src/test/modules/injection_points/injection_points.c @@ -18,18 +18,67 @@ #include "postgres.h" #include "fmgr.h" +#include "storage/condition_variable.h" #include "storage/lwlock.h" #include "storage/shmem.h" +#include "storage/dsm_registry.h" #include "utils/builtins.h" #include "utils/injection_point.h" #include "utils/wait_event.h" PG_MODULE_MAGIC; +/* Shared state information for injection points. */ +typedef struct InjectionPointSharedState +{ + /* protects accesses to wait_counts */ + slock_t lock; + + /* Counter advancing when injection_points_wake() is called */ + int wait_counts; + + /* + * Condition variable that can be used in an injection point, checking + * upon wait_counts when waiting. 
+ */ + ConditionVariable wait_point; +} InjectionPointSharedState; + +/* Pointer to shared-memory state. */ +static InjectionPointSharedState *inj_state = NULL; + +/* Wait event when waiting on condition variable */ +static uint32 injection_wait_event = 0; + extern PGDLLEXPORT void injection_error(const char *name); extern PGDLLEXPORT void injection_notice(const char *name); +extern PGDLLEXPORT void injection_wait(const char *name); +static void +injection_point_init_state(void *ptr) +{ + InjectionPointSharedState *state = (InjectionPointSharedState *) ptr; + + state->wait_counts = 0; + SpinLockInit(>lock); + ConditionVariableInit(>wait_point); +} + +static void +injection_init_shmem(void) +{ + bool found; + + if (inj_state != NULL) + return; + + inj_state = GetNamedDSMSegment("injection_points", + sizeof(InjectionPointSharedState), + injection_point_init_state, + ); +} +
Re: PGC_SIGHUP shared_buffers?
On Mon, Feb 19, 2024 at 2:05 AM Andres Freund wrote: > We probably should address that independently of making shared_buffers > PGC_SIGHUP. The queue gets absurdly large once s_b hits a few GB. It's not > that much memory compared to the buffer blocks themselves, but a sync queue of > many millions of entries just doesn't make sense. And a few hundred MB for > that isn't nothing either, even if it's just a fraction of the space for the > buffers. It makes checkpointer more susceptible to OOM as well, because > AbsorbSyncRequests() allocates an array to copy all requests into local > memory. Sure, that could just be capped, if it makes sense. Although given the thrust of this discussion, it might be even better to couple it to something other than the size of shared_buffers. > I'd say the vast majority of postgres instances in production run with less > than 1GB of s_b. Just because numbers wise the majority of instances are > running on small VMs and/or many PG instances are running on one larger > machine. There are a lot of instances where the total available memory is > less than 2GB. Whoa. That is not my experience at all. If I've ever seen such a small system since working at EDB (since 2010!) it was just one where the initdb-time default was never changed. I can't help wondering if we should have some kind of memory_model GUC, measured in T-shirt sizes or something. We've coupled a bunch of things to shared_buffers mostly as a way of distinguishing small systems from large ones. But if we want to make shared_buffers dynamically changeable and we don't want to make all that other stuff dynamically changeable, decoupling those calculations might be an important thing to do. On a really small system, do we even need the ability to dynamically change shared_buffers at all? If we do, then I suspect the granule needs to be small. But does someone want to take a system with <1GB of shared_buffers and then scale it way, way up? 
I suppose it would be nice to have the option. But you might have to make some choices, like pick either a 16MB granule or a 128MB granule or a 1GB granule at startup time and then stick with it? I don't know, I'm just spitballing here, because I don't know what the real design is going to look like yet. > > Don't you have to still move buffers entirely out of the region you > > want to unmap? > > Sure. But you can unmap at the granularity of a hardware page (there is some > fragmentation cost on the OS / hardware page table level > though). Theoretically you could unmap individual 8kB pages. I thought there were problems, at least on some operating systems, if the address space mappings became too fragmented. At least, I wouldn't expect that you could use huge pages for shared_buffers and still unmap little tiny bits. How would that even work? -- Robert Haas EDB: http://www.enterprisedb.com
Re: About a recently-added message
On Mon, Feb 19, 2024 at 11:10 AM Amit Kapila wrote: > > On Thu, Feb 15, 2024 at 11:49 AM Kyotaro Horiguchi > wrote: > > > > At Thu, 15 Feb 2024 09:22:23 +0530, shveta malik > > wrote in > > > > > > +1 on changing the msg(s) suggested way. Please find the patch for the > > > same. It also removes double quotes around the variable names > > > > Thanks for the discussion. > > > > With a translator hat on, I would be happy if I could determine > > whether a word requires translation with minimal background > > information. In this case, a translator needs to know which values > > wal_level can take. It's relatively easy in this case, but I'm not > > sure if this is always the case. Therefore, I would be slightly > > happier if "logical" were double-quoted. > > > > I see that we use "logical" in double quotes in various error > messages. For example: "wal_level must be set to \"replica\" or > \"logical\" at server start". So following that we can use the double > quotes here as well. Okay, now since we will have double quotes for logical. So do you prefer the existing way of giving error msg or the changed one. Existing: errmsg("bad configuration for slot synchronization"), errhint("wal_level must be >= logical.")); errmsg("bad configuration for slot synchronization"), errhint("%s must be defined.", "primary_conninfo")); The changed one: errmsg("slot synchronization requires wal_level >= logical")); errmsg("slot synchronization requires %s to be defined", "primary_conninfo")); thanks Shveta
Re: Do away with zero-padding assumption before WALRead()
At Mon, 19 Feb 2024 11:02:39 +0530, Bharath Rupireddy wrote in > > After all, the patch looks good to me. > > Thanks. It was committed - > https://git.postgresql.org/gitweb/?p=postgresql.git;a=commitdiff;h=73f0a1326608ac3a7d390706fdeec59fe4dc42c0. Yeah. I realized that after I had already sent the mail.. No harm done:p regards. -- Kyotaro Horiguchi NTT Open Source Software Center
Re: About a recently-added message
On Thu, Feb 15, 2024 at 11:49 AM Kyotaro Horiguchi wrote: > > At Thu, 15 Feb 2024 09:22:23 +0530, shveta malik > wrote in > > > > +1 on changing the msg(s) suggested way. Please find the patch for the > > same. It also removes double quotes around the variable names > > Thanks for the discussion. > > With a translator hat on, I would be happy if I could determine > whether a word requires translation with minimal background > information. In this case, a translator needs to know which values > wal_level can take. It's relatively easy in this case, but I'm not > sure if this is always the case. Therefore, I would be slightly > happier if "logical" were double-quoted. > I see that we use "logical" in double quotes in various error messages. For example: "wal_level must be set to \"replica\" or \"logical\" at server start". So following that we can use the double quotes here as well. -- With Regards, Amit Kapila.
Re: A new message seems missing a punctuation
At Mon, 19 Feb 2024 10:31:33 +0530, Robert Haas wrote in > On Mon, Feb 19, 2024 at 10:10 AM Kyotaro Horiguchi > wrote: > > A recent commit (7a424ece48) added the following message: > > > > > could not sync slot information as remote slot precedes local slot: > > > remote slot "%s": LSN (%X/%X), catalog xmin (%u) local slot: LSN > > > (%X/%X), catalog xmin (%u) > > > > Since it is a bit overloaded but doesn't have a separator between > > "catalog xmin (%u)" and "local slot:", it is somewhat cluttered. Don't > > we need something, for example a semicolon there to improve > > readability and reduce clutter? > > I think maybe we should even revise the message even more. In most > places we do not just print out a whole bunch of values, but use a > sentence to connect them. Mmm. Something like this?: "could not sync slot information: local slot LSN (%X/%X) or xmin(%u) behind remote (%X/%X, %u)" Or I thought the values could be moved to DETAILS: line. By the way, the code around the message is as follows. > /* > * The remote slot didn't catch up to locally reserved position. > * > * We do not drop the slot because the restart_lsn can be ahead of the > * current location when recreating the slot in the next cycle. It may > * take more time to create such a slot. Therefore, we keep this slot > * and attempt the synchronization in the next cycle. > * > * XXX should this be changed to elog(DEBUG1) perhaps? > */ > ereport(LOG, > errmsg("could not sync slot information as remote slot precedes > local slot:" > " remote slot \"%s\": LSN (%X/%X), > catalog xmin (%u) local slot: LSN (%X/%X), catalog xmin (%u)", If we change this message to DEBUG1, a timeout feature to show a WARNING message might be needed for DBAs to notice that something bad is ongoing. However, it doesn't seem appropriate as a LOG message to me. Perhaps, a LOG message should be more like: > "LOG: waiting for local slot to catch up to remote" > "DETAIL: remote slot LSN (%X/%X); " regards. 
-- Kyotaro Horiguchi NTT Open Source Software Center
Re: Do away with zero-padding assumption before WALRead()
On Mon, Feb 19, 2024 at 8:26 AM Kyotaro Horiguchi wrote: > > On > the flip side, SimpleXLogPageRead always reads a whole page and > returns XLOG_BLCKSZ. However, as you know, the returned buffer doesn't > contain random garbage bytes. Is this assumption true when wal_init_zero is off? I think when wal_init_zero is off, the last few bytes of the last page from the WAL file may contain garbage bytes i.e. not zero bytes, no? > Therefore, it's safe as long as the > caller doesn't access beyond the returned count. As a result, the > description you pointed out seems to be enough. Right. > After all, the patch looks good to me. Thanks. It was committed - https://git.postgresql.org/gitweb/?p=postgresql.git;a=commitdiff;h=73f0a1326608ac3a7d390706fdeec59fe4dc42c0. -- Bharath Rupireddy PostgreSQL Contributors Team RDS Open Source Databases Amazon Web Services: https://aws.amazon.com
Re: A new message seems missing a punctuation
On Mon, Feb 19, 2024 at 10:31 AM Robert Haas wrote: > > On Mon, Feb 19, 2024 at 10:10 AM Kyotaro Horiguchi > wrote: > > A recent commit (7a424ece48) added the following message: > > > > > could not sync slot information as remote slot precedes local slot: > > > remote slot "%s": LSN (%X/%X), catalog xmin (%u) local slot: LSN > > > (%X/%X), catalog xmin (%u) > > > > Since it is a bit overloaded but doesn't have a separator between > > "catalog xmin (%u)" and "local slot:", it is somewhat cluttered. Don't > > we need something, for example a semicolon there to improve > > readability and reduce clutter? > > I think maybe we should even revise the message even more. In most > places we do not just print out a whole bunch of values, but use a > sentence to connect them. I have tried to reword the msg, please have a look. thanks Shveta v1-0001-Reword-LOG-msg-for-slot-sync.patch Description: Binary data
Re: Removing unneeded self joins
On 18/2/2024 23:18, Alexander Korotkov wrote: On Sun, Feb 18, 2024 at 5:04 PM Alexander Korotkov wrote: On Sun, Feb 18, 2024 at 3:00 PM Alexander Lakhin wrote: 09.01.2024 01:09, Alexander Korotkov wrote: Fixed in 30b4955a46. Please look at the following query which fails with an error since d3d55ce57: create table t (i int primary key); select t3.i from t t1 join t t2 on t1.i = t2.i, lateral (select t1.i limit 1) t3; ERROR: non-LATERAL parameter required by subquery Thank you for spotting. I'm looking at this. Attached is a draft patch fixing this query. Could you, please, recheck? I reviewed this patch. Why do you check only the target list? I guess these links can be everywhere. See the patch in the attachment with the elaborated test and slightly changed code. -- regards, Andrei Lepikhov Postgres Professional From 7f94a3c96fd410522b87e570240cdb96b300dd31 Mon Sep 17 00:00:00 2001 From: "Andrey V. Lepikhov" Date: Mon, 19 Feb 2024 12:17:55 +0700 Subject: [PATCH] Replace relids in lateral subquery target list during SJE --- src/backend/optimizer/plan/analyzejoins.c | 29 ++- src/test/regress/expected/join.out| 44 +++ src/test/regress/sql/join.sql | 12 +++ 3 files changed, 84 insertions(+), 1 deletion(-) diff --git a/src/backend/optimizer/plan/analyzejoins.c b/src/backend/optimizer/plan/analyzejoins.c index e494acd51a..072298f66c 100644 --- a/src/backend/optimizer/plan/analyzejoins.c +++ b/src/backend/optimizer/plan/analyzejoins.c @@ -395,7 +395,34 @@ remove_rel_from_query(PlannerInfo *root, RelOptInfo *rel, } /* Update lateral references. 
*/ - replace_varno((Node *) otherrel->lateral_vars, relid, subst); + if (root->hasLateralRTEs) + { + RangeTblEntry *rte = root->simple_rte_array[rti]; + ReplaceVarnoContext ctx = {.from = relid,.to = subst}; + + if (rte->lateral) + { + replace_varno((Node *) otherrel->lateral_vars, relid, subst); + + /* +* Although we pass root->parse through cleanup procedure, +* but parse->rtable and rte contains refs to different copies +* of the subquery. +*/ + if (otherrel->rtekind == RTE_SUBQUERY) + query_tree_walker(rte->subquery, replace_varno_walker, , + QTW_EXAMINE_SORTGROUP); +#ifdef USE_ASSERT_CHECKING + /* Just check possibly hidden non-replaced relids */ + Assert(!bms_is_member(relid, pull_varnos(root, (Node *) rte->tablesample))); + Assert(!bms_is_member(relid, pull_varnos(root, (Node *) rte->functions))); + Assert(!bms_is_member(relid, pull_varnos(root, (Node *) rte->tablefunc))); + Assert(!bms_is_member(relid, pull_varnos(root, (Node *) rte->values_lists))); +#endif + } + } + + } /* diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out index 0c2cba8921..d560a4a6b9 100644 --- a/src/test/regress/expected/join.out +++ b/src/test/regress/expected/join.out @@ -6349,6 +6349,50 @@ on true; -> Seq Scan on int8_tbl y (7 rows) +-- Test processing target lists in lateral subqueries +explain (verbose, costs off) +SELECT t3.a FROM sj t1, sj t2, +LATERAL (SELECT t1.a WHERE t1.a <> 1 +GROUP BY (t1.a) HAVING t1.a > 0 ORDER BY t1.a LIMIT 1) t3, +LATERAL (SELECT t1.a,t3.a WHERE t1.a <> t3.a+t2.a +GROUP BY (t3.a) HAVING t1.a > t3.a*t3.a+t2.a/t1.a LIMIT 2) t4, +LATERAL (SELECT * FROM sj TABLESAMPLE bernoulli(t1.a/t2.a) +REPEATABLE (t1.a+t2.a)) t5, +LATERAL generate_series(1, t1.a + t2.a) AS t6 +WHERE t1.a = t2.a; + QUERY PLAN +--- + Nested Loop + Output: (t2.a) + -> Nested Loop + Output: t2.a, (t2.a) + -> Nested Loop + Output: t2.a, (t2.a) + -> Nested Loop + Output: t2.a, (t2.a) + -> Seq Scan on public.sj t2 + Output: t2.a, t2.b, t2.c + Filter: (t2.a IS 
NOT NULL) + -> Limit + Output: (t2.a) + -> Group + Output: t2.a +
Re: A new message seems missing a punctuation
On Mon, Feb 19, 2024 at 10:10 AM Kyotaro Horiguchi wrote: > A recent commit (7a424ece48) added the following message: > > > could not sync slot information as remote slot precedes local slot: > > remote slot "%s": LSN (%X/%X), catalog xmin (%u) local slot: LSN > > (%X/%X), catalog xmin (%u) > > Since it is a bit overloaded but doesn't have a separator between > "catalog xmin (%u)" and "local slot:", it is somewhat cluttered. Don't > we need something, for example a semicolon there to improve > readability and reduce clutter? I think maybe we should even revise the message even more. In most places we do not just print out a whole bunch of values, but use a sentence to connect them. -- Robert Haas EDB: http://www.enterprisedb.com
A new message seems missing a punctuation
A recent commit (7a424ece48) added the following message: > could not sync slot information as remote slot precedes local slot: > remote slot "%s": LSN (%X/%X), catalog xmin (%u) local slot: LSN > (%X/%X), catalog xmin (%u) Since it is a bit overloaded but doesn't have a separator between "catalog xmin (%u)" and "local slot:", it is somewhat cluttered. Don't we need something, for example a semicolon there to improve readability and reduce clutter? regards. -- Kyotaro Horiguchi NTT Open Source Software Center
RE: Synchronizing slots from primary to standby
On Monday, February 19, 2024 11:39 AM shveta malik wrote: > > On Sun, Feb 18, 2024 at 7:40 PM Zhijie Hou (Fujitsu) > wrote: > > > > On Friday, February 16, 2024 6:41 PM shveta malik > wrote: > > Thanks for the patch. Here are few comments: > > Thanks for the comments. > > > > > 2. > > > > +static bool > > +validate_remote_info(WalReceiverConn *wrconn, int elevel) > > ... > > + > > + return (!remote_in_recovery && primary_slot_valid); > > > > The primary_slot_valid could be uninitialized in this function. > > return (!remote_in_recovery && primary_slot_valid); > > Here if remote_in_recovery is true, it will not even read primary_slot_valid. > It > will read primary_slot_valid only if remote_in_recovery is false and in such a > case primary_slot_valid will always be initialized in the else block above, > let me > know if you still feel we shall initialize this to some default? I understand that it will not be used, but some compiler could report WARNING for the un-initialized variable. The cfbot[1] seems complain about this as well. [1] https://cirrus-ci.com/task/5416851522453504 Best Regards, Hou zj
Re: pg_upgrade and logical replication
On Mon, 19 Feb 2024 at 06:54, Hayato Kuroda (Fujitsu) wrote: > > Dear Vignesh, > > Thanks for reviewing! PSA new version. > > > > > Thanks for the updated patch, Few suggestions: > > 1) This can be moved to keep it similar to other tests: > > +# Setup a disabled subscription. The upcoming test will check the > > +# pg_createsubscriber won't work, so it is sufficient. > > +$publisher->safe_psql('postgres', "CREATE PUBLICATION regress_pub1"); > > +$old_sub->safe_psql('postgres', > > + "CREATE SUBSCRIPTION regress_sub1 CONNECTION '$connstr' > > PUBLICATION regress_pub1 WITH (enabled = false)" > > +); > > + > > +$old_sub->stop; > > + > > +# -- > > +# Check that pg_upgrade fails when max_replication_slots configured in the > > new > > +# cluster is less than the number of subscriptions in the old cluster. > > +# -- > > +$new_sub->append_conf('postgresql.conf', "max_replication_slots = 0"); > > + > > +# pg_upgrade will fail because the new cluster has insufficient > > +# max_replication_slots. > > +command_checks_all( > > + [ > > + 'pg_upgrade', '--no-sync', '-d', $old_sub->data_dir, > > + '-D', $new_sub->data_dir, '-b', $oldbindir, > > + '-B', $newbindir, '-s', $new_sub->host, > > + '-p', $old_sub->port, '-P', $new_sub->port, > > + $mode, '--check', > > + ], > > > > like below and the extra comment can be removed: > > +# -- > > +# Check that pg_upgrade fails when max_replication_slots configured in the > > new > > +# cluster is less than the number of subscriptions in the old cluster. > > +# -- > > +# Create a disabled subscription. 
> > +$publisher->safe_psql('postgres', "CREATE PUBLICATION regress_pub1"); > > +$old_sub->safe_psql('postgres', > > + "CREATE SUBSCRIPTION regress_sub1 CONNECTION '$connstr' > > PUBLICATION regress_pub1 WITH (enabled = false)" > > +); > > + > > +$old_sub->stop; > > + > > +$new_sub->append_conf('postgresql.conf', "max_replication_slots = 0"); > > + > > +# pg_upgrade will fail because the new cluster has insufficient > > +# max_replication_slots. > > +command_checks_all( > > + [ > > + 'pg_upgrade', '--no-sync', '-d', $old_sub->data_dir, > > + '-D', $new_sub->data_dir, '-b', $oldbindir, > > + '-B', $newbindir, '-s', $new_sub->host, > > + '-p', $old_sub->port, '-P', $new_sub->port, > > + $mode, '--check', > > + ], > > Partially fixed. I moved the creation part to below but comments were kept. > > > 2) This comment can be slightly changed: > > +# Change configuration as well not to start the initial sync automatically > > +$new_sub->append_conf('postgresql.conf', > > + "max_logical_replication_workers = 0"); > > > > to: > > Change configuration so that initial table sync sync does not get > > started automatically > > Fixed. > > > 3) The old comments were slightly better: > > # Resume the initial sync and wait until all tables of subscription > > # 'regress_sub5' are synchronized > > $new_sub->append_conf('postgresql.conf', > > "max_logical_replication_workers = 10"); > > $new_sub->restart; > > $new_sub->safe_psql('postgres', "ALTER SUBSCRIPTION regress_sub5 > > ENABLE"); > > $new_sub->wait_for_subscription_sync($publisher, 'regress_sub5'); > > > > Like: > > # Enable the subscription > > $new_sub->safe_psql('postgres', "ALTER SUBSCRIPTION regress_sub5 > > ENABLE"); > > > > # Wait until all tables of subscription 'regress_sub5' are synchronized > > $new_sub->wait_for_subscription_sync($publisher, 'regress_sub5'); > > Per comments from Amit [1], I did not change. Thanks for the updated patch, I don't have any more comments. Regards, Vignesh
Re: Emitting JSON to file using COPY TO
On Fri, Jan 19, 2024 at 4:10 PM Masahiko Sawada wrote: > > if (opts_out->json_mode && is_from) > ereport(ERROR, ...); > > if (!opts_out->json_mode && opts_out->force_array) > ereport(ERROR, ...); > > Also these checks can be moved close to other checks at the end of > ProcessCopyOptions(). > > --- > @@ -3395,6 +3395,10 @@ copy_opt_item: > { > $$ = makeDefElem("format", (Node *) makeString("csv"), > @1); > } > + | JSON > + { > + $$ = makeDefElem("format", (Node *) makeString("json"), > @1); > + } > | HEADER_P > { > $$ = makeDefElem("header", (Node *) makeBoolean(true), > @1); > @@ -3427,6 +3431,10 @@ copy_opt_item: > { > $$ = makeDefElem("encoding", (Node *) makeString($2), @1); > } > + | FORCE ARRAY > + { > + $$ = makeDefElem("force_array", (Node *) > makeBoolean(true), @1); > + } > ; > > I believe we don't need to support new options in old-style syntax. > you are right about the force_array case. we don't need to add force_array related changes in gram.y. On Wed, Jan 31, 2024 at 9:26 PM Alvaro Herrera wrote: > > On 2024-Jan-23, jian he wrote: > > > > + | FORMAT_LA copy_generic_opt_arg > > > + { > > > + $$ = makeDefElem("format", $2, @1); > > > + } > > > ; > > > > > > I think it's not necessary. "format" option is already handled in > > > copy_generic_opt_elem. > > > > test it, I found out this part is necessary. > > because a query with WITH like `copy (select 1) to stdout with > > (format json, force_array false); ` will fail. > > Right, because "FORMAT JSON" is turned into FORMAT_LA JSON by parser.c > (see base_yylex there). I'm not really sure but I think it might be > better to make it "| FORMAT_LA JSON" instead of invoking the whole > copy_generic_opt_arg syntax. Not because of performance, but just > because it's much clearer what's going on. > I am not sure what alternative you are referring to. I've rebased the patch, made some cosmetic changes. Now I think it's pretty neat. 
you can, based on it, make your change, then I may understand the alternative you are referring to. From b3d3d6023f96aa7971a0663d8c0bd6de50e877a5 Mon Sep 17 00:00:00 2001 From: jian he Date: Mon, 19 Feb 2024 10:37:18 +0800 Subject: [PATCH v9 1/2] Add another COPY fomrat: json this format is only allowed in COPY TO operation. discussion: https://postgr.es/m/CALvfUkBxTYy5uWPFVwpk_7ii2zgT07t3d-yR_cy4sfrrLU%3Dkcg%40mail.gmail.com discussion: https://postgr.es/m/6a04628d-0d53-41d9-9e35-5a8dc302c34c@joeconway.com --- doc/src/sgml/ref/copy.sgml | 5 ++ src/backend/commands/copy.c| 13 +++ src/backend/commands/copyto.c | 125 - src/backend/parser/gram.y | 8 ++ src/backend/utils/adt/json.c | 5 +- src/include/commands/copy.h| 1 + src/include/utils/json.h | 2 + src/test/regress/expected/copy.out | 54 + src/test/regress/sql/copy.sql | 38 + 9 files changed, 208 insertions(+), 43 deletions(-) diff --git a/doc/src/sgml/ref/copy.sgml b/doc/src/sgml/ref/copy.sgml index 55764fc1..ef9e4729 100644 --- a/doc/src/sgml/ref/copy.sgml +++ b/doc/src/sgml/ref/copy.sgml @@ -214,9 +214,14 @@ COPY { table_name [ ( text, csv (Comma Separated Values), + json (JavaScript Object Notation), or binary. The default is text. + + The json option is allowed only in + COPY TO. 
+ diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index cc0786c6..5d5b733d 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -480,6 +480,8 @@ ProcessCopyOptions(ParseState *pstate, /* default format */ ; else if (strcmp(fmt, "csv") == 0) opts_out->csv_mode = true; + else if (strcmp(fmt, "json") == 0) +opts_out->json_mode = true; else if (strcmp(fmt, "binary") == 0) opts_out->binary = true; else @@ -716,6 +718,11 @@ ProcessCopyOptions(ParseState *pstate, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot specify HEADER in BINARY mode"))); + if (opts_out->json_mode && opts_out->header_line) + ereport(ERROR, +(errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot specify HEADER in JSON mode"))); + /* Check quote */ if (!opts_out->csv_mode && opts_out->quote != NULL) ereport(ERROR, @@ -793,6 +800,12 @@ ProcessCopyOptions(ParseState *pstate, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("COPY FREEZE cannot be used with COPY TO"))); + /* Check json format */ + if (opts_out->json_mode && is_from) + ereport(ERROR, +(errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot use JSON mode in COPY FROM"))); + if
Re: Synchronizing slots from primary to standby
On Sun, Feb 18, 2024 at 7:40 PM Zhijie Hou (Fujitsu) wrote: > > On Friday, February 16, 2024 6:41 PM shveta malik > wrote: > Thanks for the patch. Here are few comments: Thanks for the comments. > > 2. > > +static bool > +validate_remote_info(WalReceiverConn *wrconn, int elevel) > ... > + > + return (!remote_in_recovery && primary_slot_valid); > > The primary_slot_valid could be uninitialized in this function. return (!remote_in_recovery && primary_slot_valid); Here if remote_in_recovery is true, it will not even read primary_slot_valid. It will read primary_slot_valid only if remote_in_recovery is false and in such a case primary_slot_valid will always be initialized in the else block above, let me know if you still feel we shall initialize this to some default? thanks Shveta
Re: Speeding up COPY TO for uuids and arrays
On Sat, Feb 17, 2024 at 12:24:33PM -0800, Andres Freund wrote: > I wonder if we should move the core part for converting to hex to numutils.c, > we already have code the for the inverse. There does seem to be further > optimization potential in the conversion, and that seems better done somewhere > central rather than one type's output function. OTOH, it might not be worth > it, given the need to add the dashes. I'd tend to live with the current location of the code, but I'm OK if people feel differently on this one, so I'm OK with what Laurenz is proposing. >> - Patch 0003 speeds up array_out a bit by avoiding some zero >> byte writes. The measured speed gain is under 2%. > > Makes sense. +1. -- Michael signature.asc Description: PGP signature
Re: Returning non-terminated string in ECPG Informix-compatible function
On Thu, Feb 15, 2024 at 05:17:17PM +0700, Oleg Tselebrovskiy wrote: > Thanks for review! dt_common.c is quite amazing, the APIs that we have in it rely on strcpy() but we have no idea of the length of the buffer string given in input to store the result. This would require breaking the existing APIs or inventing new ones to be able to plug some safer strlcpy() calls. Not sure if it's really worth bothering. For now, I've applied the OOM checks on HEAD and the fix with the null termination on all stable branches. -- Michael signature.asc Description: PGP signature
Re: Do away with zero-padding assumption before WALRead()
At Mon, 19 Feb 2024 11:56:22 +0900 (JST), Kyotaro Horiguchi wrote in > Yeah, perhaps I was overly concerned. The removed comment made me > think that someone could add code relying on the incorrect assumption > that the remaining bytes beyond the returned count are cleared out. On > the flip side, SimpleXLogPageRead always reads a whole page and > returns XLOG_BLCKSZ. However, as you know, the returned buffer doesn't > contain random garbage bytes. Therefore, it's safe as long as the Forgot to mention that there is a case involving non-initialized pages, but it doesn't affect the correctness of the description you pointed out. regards. -- Kyotaro Horiguchi NTT Open Source Software Center
Re: Do away with zero-padding assumption before WALRead()
At Fri, 16 Feb 2024 19:50:00 +0530, Bharath Rupireddy wrote in > On Fri, Feb 16, 2024 at 7:10 AM Kyotaro Horiguchi > wrote: > > 1. It's useless to copy the whole page regardless of the 'count'. It's > > enough to copy only up to the 'count'. The patch looks good in this > > regard. > > Yes, it's not needed to copy the whole page. Per my understanding > about page transfers between disk and OS page cache - upon OS page > cache miss, the whole page (of disk block size) gets fetched from disk > even if we just read 'count' bytes (< disk block size). Right, but with a possibly-different block size. Anyway that behavior doesn't affect the result of this change. (Could affect performance hereafter if it were not the case, though..) > > 2. Maybe we need a comment that states the page_read callback > > functions leave garbage bytes beyond the returned count, due to > > partial copying without clearing the unused portion. > > Isn't the comment around page_read callback at > https://git.postgresql.org/gitweb/?p=postgresql.git;a=blob;f=src/include/access/xlogreader.h;h=2e9e5f43eb2de1ca9ba81afe76d21357065c61aa;hb=d57b7cc3338e9d9aa1d7c5da1b25a17c5a72dcce#l78 > enough? > > "The callback shall return the number of bytes read (never more than > XLOG_BLCKSZ), or -1 on failure." Yeah, perhaps I was overly concerned. The removed comment made me think that someone could add code relying on the incorrect assumption that the remaining bytes beyond the returned count are cleared out. On the flip side, SimpleXLogPageRead always reads a whole page and returns XLOG_BLCKSZ. However, as you know, the returned buffer doesn't contain random garbage bytes. Therefore, it's safe as long as the caller doesn't access beyond the returned count. As a result, the description you pointed out seems to be enough. After all, the patch looks good to me. regards. -- Kyotaro Horiguchi NTT Open Source Software Center
Re: Thoughts about NUM_BUFFER_PARTITIONS
On Mon, 19 Feb 2024 at 00:56, Tomas Vondra wrote: > On 2/18/24 03:30, Li Japin wrote: >> >> I find it seems need to change MAX_SIMUL_LWLOCKS if we enlarge the >> NUM_BUFFER_PARTITIONS, >> I didn’t find any comments to describe the relation between >> MAX_SIMUL_LWLOCKS and >> NUM_BUFFER_PARTITIONS, am I missing something? > > IMHO the relationship is pretty simple - MAX_SIMUL_LWLOCKS needs to be > higher than NUM_BUFFER_PARTITIONS, so that the backend can acquire all > the partition locks if needed. > Thanks for the explanation! Got it. > There's other places that acquire a bunch of locks, and all of them need > to be careful not to exceed MAX_SIMUL_LWLOCKS. For example gist has > GIST_MAX_SPLIT_PAGES. > > > regards
Re: [PoC] Improve dead tuple storage for lazy vacuum
On Fri, Feb 16, 2024 at 12:41 PM John Naylor wrote: > > On Fri, Feb 16, 2024 at 10:05 AM Masahiko Sawada > wrote: > > > v61-0007: Runtime-embeddable tids -- Optional for v17, but should > > > reduce memory regressions, so should be considered. Up to 3 tids can > > > be stored in the last level child pointer. It's not polished, but I'll > > > only proceed with that if we think we need this. "flags" iis called > > > that because it could hold tidbitmap.c booleans (recheck, lossy) in > > > the future, in addition to reserving space for the pointer tag. Note: > > > I hacked the tests to only have 2 offsets per block to demo, but of > > > course both paths should be tested. > > > > Interesting. I've run the same benchmark tests we did[1][2] (the > > median of 3 runs): > > > > monotonically ordered int column index: > > > > master: system usage: CPU: user: 14.91 s, system: 0.80 s, elapsed: 15.73 s > > v-59: system usage: CPU: user: 9.67 s, system: 0.81 s, elapsed: 10.50 s > > v-62: system usage: CPU: user: 1.94 s, system: 0.69 s, elapsed: 2.64 s > > Hmm, that's strange -- this test is intended to delete all records > from the last 20% of the blocks, so I wouldn't expect any improvement > here, only in the sparse case. Maybe something is wrong. All the more > reason to put it off... Okay, let's dig it deeper later. > > > I'm happy to see a huge improvement. While it's really fascinating to > > me, I'm concerned about the time left until the feature freeze. We > > need to polish both tidstore and vacuum integration patches in 5 > > weeks. Personally I'd like to have it as a separate patch for now, and > > focus on completing the main three patches since we might face some > > issues after pushing these patches. I think with 0007 patch it's a big > > win but it's still a win even without 0007 patch. > > Agreed to not consider it for initial commit. I'll hold on to it for > some future time. > > > > 2. Management of memory contexts. It's pretty verbose and messy. 
I > > > think the abstraction could be better: > > > A: tidstore currently passes CurrentMemoryContext to RT_CREATE, so we > > > can't destroy or reset it. That means we have to do a lot of manual > > > work. > > > B: Passing "max_bytes" to the radix tree was my idea, I believe, but > > > it seems the wrong responsibility. Not all uses will have a > > > work_mem-type limit, I'm guessing. We only use it for limiting the max > > > block size, and aset's default 8MB is already plenty small for > > > vacuum's large limit anyway. tidbitmap.c's limit is work_mem, so > > > smaller, and there it makes sense to limit the max blocksize this way. > > > C: The context for values has complex #ifdefs based on the value > > > length/varlen, but it's both too much and not enough. If we get a bump > > > context, how would we shoehorn that in for values for vacuum but not > > > for tidbitmap? > > > > > > Here's an idea: Have vacuum (or tidbitmap etc.) pass a context to > > > TidStoreCreate(), and then to RT_CREATE. That context will contain the > > > values (for local mem), and the node slabs will be children of the > > > value context. That way, measuring memory usage and free-ing can just > > > call with this parent context, and let recursion handle the rest. > > > Perhaps the passed context can also hold the radix-tree struct, but > > > I'm not sure since I haven't tried it. What do you think? > > > > If I understand your idea correctly, RT_CREATE() creates the context > > for values as a child of the passed context and the node slabs as > > children of the value context. That way, measuring memory usage can > > just call with the value context. It sounds like a good idea. But it > > was not clear to me how to address point B and C. > > For B & C, vacuum would create a context to pass to TidStoreCreate, > and it wouldn't need to bother changing max block size. RT_CREATE > would use that directly for leaves (if any), and would only create > child slab contexts under it. 
It would not need to know about > max_bytes. Modifyng your diagram a bit, something like: > > - caller-supplied radix tree memory context (the 3 structs -- and > leaves, if any) (aset (or future bump?)) > - node slab contexts > > This might only be workable with aset, if we need to individually free > the structs. (I haven't studied this, it was a recent idea) > It's simpler, because with small fixed length values, we don't need to > detect that and avoid creating a leaf context. All leaves would live > in the same context as the structs. Thank you for the explanation. I think that vacuum and tidbitmap (and future users) would end up having the same max block size calculation. And it seems slightly odd layering to me that max-block-size-specified context is created on vacuum (or tidbitmap) layer, a varlen-value radix tree is created by tidstore layer, and the passed context is used for leaves (if varlen-value is used) on radix tree layer. Another idea is to create a max-block-size-specified
Re: Reducing memory consumed by RestrictInfo list translations in partitionwise join planning
On 19/2/2024 06:05, Tomas Vondra wrote: However, this is a somewhat extreme example - it's joining 5 tables, each with 1000 partitions, using a partition-wise join. It reduces the amount of memory, but the planning time is still quite high (and essentially the same as without the patch). So it's not like it'd make them significantly more practical ... do we have any other ideas/plans how to improve that? The planner principle of cleaning up all allocated structures after the optimization stage simplifies development and code. But, if we want to achieve horizontal scalability on many partitions, we should introduce per-partition memory context and reset it in between. GEQO already has a short-lived memory context, making designing extensions a bit more challenging but nothing too painful. -- regards, Andrei Lepikhov Postgres Professional
RE: pg_upgrade and logical replication
Dear Vignesh, Thanks for reviewing! PSA new version. > > Thanks for the updated patch, Few suggestions: > 1) This can be moved to keep it similar to other tests: > +# Setup a disabled subscription. The upcoming test will check the > +# pg_createsubscriber won't work, so it is sufficient. > +$publisher->safe_psql('postgres', "CREATE PUBLICATION regress_pub1"); > +$old_sub->safe_psql('postgres', > + "CREATE SUBSCRIPTION regress_sub1 CONNECTION '$connstr' > PUBLICATION regress_pub1 WITH (enabled = false)" > +); > + > +$old_sub->stop; > + > +# -- > +# Check that pg_upgrade fails when max_replication_slots configured in the > new > +# cluster is less than the number of subscriptions in the old cluster. > +# -- > +$new_sub->append_conf('postgresql.conf', "max_replication_slots = 0"); > + > +# pg_upgrade will fail because the new cluster has insufficient > +# max_replication_slots. > +command_checks_all( > + [ > + 'pg_upgrade', '--no-sync', '-d', $old_sub->data_dir, > + '-D', $new_sub->data_dir, '-b', $oldbindir, > + '-B', $newbindir, '-s', $new_sub->host, > + '-p', $old_sub->port, '-P', $new_sub->port, > + $mode, '--check', > + ], > > like below and the extra comment can be removed: > +# -- > +# Check that pg_upgrade fails when max_replication_slots configured in the > new > +# cluster is less than the number of subscriptions in the old cluster. > +# -- > +# Create a disabled subscription. > +$publisher->safe_psql('postgres', "CREATE PUBLICATION regress_pub1"); > +$old_sub->safe_psql('postgres', > + "CREATE SUBSCRIPTION regress_sub1 CONNECTION '$connstr' > PUBLICATION regress_pub1 WITH (enabled = false)" > +); > + > +$old_sub->stop; > + > +$new_sub->append_conf('postgresql.conf', "max_replication_slots = 0"); > + > +# pg_upgrade will fail because the new cluster has insufficient > +# max_replication_slots. 
> +command_checks_all( > + [ > + 'pg_upgrade', '--no-sync', '-d', $old_sub->data_dir, > + '-D', $new_sub->data_dir, '-b', $oldbindir, > + '-B', $newbindir, '-s', $new_sub->host, > + '-p', $old_sub->port, '-P', $new_sub->port, > + $mode, '--check', > + ], Partially fixed. I moved the creation part to below but comments were kept. > 2) This comment can be slightly changed: > +# Change configuration as well not to start the initial sync automatically > +$new_sub->append_conf('postgresql.conf', > + "max_logical_replication_workers = 0"); > > to: > Change configuration so that initial table sync sync does not get > started automatically Fixed. > 3) The old comments were slightly better: > # Resume the initial sync and wait until all tables of subscription > # 'regress_sub5' are synchronized > $new_sub->append_conf('postgresql.conf', > "max_logical_replication_workers = 10"); > $new_sub->restart; > $new_sub->safe_psql('postgres', "ALTER SUBSCRIPTION regress_sub5 > ENABLE"); > $new_sub->wait_for_subscription_sync($publisher, 'regress_sub5'); > > Like: > # Enable the subscription > $new_sub->safe_psql('postgres', "ALTER SUBSCRIPTION regress_sub5 > ENABLE"); > > # Wait until all tables of subscription 'regress_sub5' are synchronized > $new_sub->wait_for_subscription_sync($publisher, 'regress_sub5'); Per comments from Amit [1], I did not change. [1]: https://www.postgresql.org/message-id/CAA4eK1Ls%2BRmJtTvOgaRXd%2BeHSY3x-KUE%3DsfEGQoU-JF_UzA62A%40mail.gmail.com Best Regards, Hayato Kuroda FUJITSU LIMITED https://www.fujitsu.com/ v5-0001-Fix-testcase.patch Description: v5-0001-Fix-testcase.patch
Re: Why is pq_begintypsend so slow?
Hi, In <20240218200906.zvihkrs46yl2j...@awork3.anarazel.de> "Re: Why is pq_begintypsend so slow?" on Sun, 18 Feb 2024 12:09:06 -0800, Andres Freund wrote: >> [1] >> https://www.postgresql.org/message-id/flat/20240215.153421.96888103784986803.kou%40clear-code.com#34df359e6d255795d16814ce138cc995 >> Do we need one FunctionCallInfoBaseData for each attribute? >> How about sharing one FunctionCallInfoBaseData by all >> attributes like [1]? > > That makes no sense to me. You're throwing away most of the possible gains by > having to update the FunctionCallInfo fields on every call. You're saving > neglegible amounts of memory at a substantial runtime cost. The number of updated fields of your approach and [1] are same: Your approach: 6 (I think that "fcinfo->isnull = false" is needed though.) + fcinfo->args[0].value = CStringGetDatum(string); + fcinfo->args[0].isnull = false; + fcinfo->args[1].value = ObjectIdGetDatum(attr->typioparam); + fcinfo->args[1].isnull = false; + fcinfo->args[2].value = Int32GetDatum(attr->typmod); + fcinfo->args[2].isnull = false; [1]: 6 (including "fcinfo->isnull = false") + fcinfo->flinfo = flinfo; + fcinfo->context = escontext; + fcinfo->isnull = false; + fcinfo->args[0].value = CStringGetDatum(str); + fcinfo->args[1].value = ObjectIdGetDatum(typioparam); + fcinfo->args[2].value = Int32GetDatum(typmod); >> Inlining InputFuncallCallSafe() here to use pre-initialized >> fcinfo will decrease maintainability. Because we abstract >> InputFunctionCall family in fmgr.c. If we define a >> InputFunctionCall variant here, we need to change both of >> fmgr.c and here when InputFunctionCall family is changed. >> How about defining details in fmgr.c and call it here >> instead like [1]? > > I'm not sure I buy that that is a problem. It's not like my approach was > actually bypassing fmgr abstractions alltogether - instead it just used the > lower level APIs, because it's a performance sensitive area. 
[1] provides some optimized abstractions, which are implemented with lower level APIs, without breaking the abstractions. Note that I don't have a strong opinion how to implement this optimization. If other developers think this approach makes sense for this optimization, I don't object it. >> @@ -1966,7 +1992,7 @@ CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo >> *flinfo, >> if (fld_size == -1) >> { >> *isnull = true; >> -return ReceiveFunctionCall(flinfo, NULL, typioparam, typmod); >> +return ReceiveFunctionCall(fcinfo->flinfo, NULL, >> attr->typioparam, attr->typmod); >> >> Why pre-initialized fcinfo isn't used here? > > Because it's a prototype and because I don't think it's a common path. How about adding a comment why we don't need to optimize this case? I don't have a strong opinion how to implement this optimization as I said above. It seems that you like your approach. So I withdraw [1]. Could you complete this optimization? Can we proceed making COPY format extensible without this optimization? It seems that it'll take a little time to complete this optimization because your patch is still WIP. And it seems that you can work on it after making COPY format extensible. Thanks, -- kou
Re: What about Perl autodie?
On 2024-02-14 We 11:52, Peter Eisentraut wrote: On 08.02.24 16:53, Tom Lane wrote: Daniel Gustafsson writes: On 8 Feb 2024, at 08:01, Peter Eisentraut wrote: I suppose we could start using it for completely new scripts. +1, it would be nice to eventually be able to move to it everywhere so starting now with new scripts may make the eventual transition smoother. I'm still concerned about people carelessly using autodie-reliant code in places where they shouldn't. I offer two safer ways forward: 1. Wait till v16 is the oldest supported branch, and then migrate both HEAD and back branches to using autodie. 2. Don't wait, migrate them all now. This would mean requiring Perl 5.10.1 or later to run the TAP tests, even in back branches. I think #2 might not be all that radical. We have nothing older than 5.14.0 in the buildfarm, so we don't really have much grounds for claiming that 5.8.3 will work today. And 5.10.1 came out in 2009, so how likely is it that anyone cares anymore? A gentler way might be to start using some perlcritic policies like InputOutput::RequireCheckedOpen or the more general InputOutput::RequireCheckedSyscalls and add explicit error checking at the sites it points out. And then if we start using autodie in the future, any inappropriate backpatching of calls lacking error checks would be caught. Yeah, that should work. cheers andrew -- Andrew Dunstan EDB: https://www.enterprisedb.com
Re: Running the fdw test from the terminal crashes into the core-dump
On Mon, Feb 19, 2024 at 4:44 AM Alvaro Herrera wrote: > On 2024-Feb-18, Tom Lane wrote: > > > So I'd blame this on faulty handling of the zero-partitions case in > > the RTEPermissionInfo refactoring. (I didn't bisect to prove that > > a61b1f748 is exactly where it broke, but I do see that the query > > successfully does nothing in v15.) > > You're right, this is the commit that broke it. It's unclear to me if > Amit is available to look at it, so I'll give this a look tomorrow if he > isn't. I'll look at this today. -- Thanks, Amit Langote
Re: Reducing memory consumed by RestrictInfo list translations in partitionwise join planning
Hi, After taking a look at the patch optimizing SpecialJoinInfo allocations, I decided to take a quick look at this one too. I don't have any specific comments on the code yet, but it seems quite a bit more complex than the other patch ... it's introducing a HTAB into the optimizer, surely that has costs too. I started by doing the same test as with the other patch, comparing master to the two patches (applied independently and both). And I got about this (in MB):

tables  master  sjinfo  rinfo  both
------  ------  ------  -----  ----
2           37      36     34    33
3          138     129    122   113
4          421     376    363   318
5         1495    1254   1172   931

Unlike the SpecialJoinInfo patch, I haven't seen any reduction in planning time for this one. The reduction in allocated memory is nice. I wonder what's allocating the remaining memory, and what we'd have to do to reduce it further. However, this is a somewhat extreme example - it's joining 5 tables, each with 1000 partitions, using a partition-wise join. It reduces the amount of memory, but the planning time is still quite high (and essentially the same as without the patch). So it's not like it'd make them significantly more practical ... do we have any other ideas/plans how to improve that? AFAIK we don't expect this to improve "regular" cases with modest number of tables / partitions, etc. But could it make them slower in some case? regards -- Tomas Vondra EnterpriseDB: http://www.enterprisedb.com The Enterprise PostgreSQL Company
Re: Add system identifier to backup manifest
On Thu, Feb 15, 2024 at 05:41:46PM +0530, Robert Haas wrote: > On Thu, Feb 15, 2024 at 3:05 PM Amul Sul wrote: > > Kindly have a look at the attached version. > > IMHO, 0001 looks fine, except probably the comment could be phrased a > bit more nicely. And the new option should be documented at the top of the init() routine for perldoc. > That can be left for whoever commits this to > wordsmith. Michael, what are your plans? Not much, so feel free to not wait for me. I've just read through the patch because I like the idea/feature :) > 0002 seems like a reasonable approach, but there's a hunk in the wrong > patch: 0004 modifies pg_combinebackup's check_control_files to use > get_dir_controlfile() rather than get_controlfile(), but it looks to > me like that should be part of 0002. I'm slightly concerned about 0002 that silently changes the meaning of get_controlfile(). That would impact extension code without people knowing about it when compiling, just when they run their stuff under 17~. -- Michael signature.asc Description: PGP signature
Re: Why is pq_begintypsend so slow?
On Sun, Feb 18, 2024 at 12:09:06PM -0800, Andres Freund wrote: > On 2024-02-18 17:38:09 +0900, Sutou Kouhei wrote: >> @@ -1966,7 +1992,7 @@ CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo >> *flinfo, >> if (fld_size == -1) >> { >> *isnull = true; >> -return ReceiveFunctionCall(flinfo, NULL, typioparam, typmod); >> +return ReceiveFunctionCall(fcinfo->flinfo, NULL, >> attr->typioparam, attr->typmod); >> >> Why pre-initialized fcinfo isn't used here? > > Because it's a prototype and because I don't think it's a common path. 0008 and 0010 (a bit) are the only patches of the set that touch some of the areas that would be impacted by the refactoring to use callbacks in the COPY code, still I don't see anything that could not be changed in what's updated here, the one-row callback in COPY FROM being the most touched. So I don't quite see why each effort could not happen on their own? Or Andres, do you think that any improvements you've been proposing in this area should happen before we consider refactoring the COPY code to plug in the callbacks? I'm a bit confused by the situation, TBH. -- Michael signature.asc Description: PGP signature
Re: Patch: Add parse_type Function
On 2024-02-18 20:00 +0100, Pavel Stehule wrote: > The overhead of parse_type_and_format can be related to higher planning > time. PL/pgSQL can assign composite without usage FROM clause. Thanks, didn't know that this makes a difference. In that case both variants are on par. BEGIN; CREATE FUNCTION format_with_parse_type(text) RETURNS text LANGUAGE plpgsql STABLE STRICT AS $$ DECLARE p record := parse_type($1); BEGIN RETURN format_type(p.typid, p.typmod); END $$; CREATE FUNCTION format_with_to_regtypmod(text) RETURNS text LANGUAGE plpgsql STABLE STRICT AS $$ BEGIN RETURN format_type(to_regtype($1), to_regtypmod($1)); END $$; COMMIT; Results: SELECT format_with_parse_type('interval second(0)'); pgbench (17devel) transaction type: format_with_parse_type.sql scaling factor: 1 query mode: simple number of clients: 1 number of threads: 1 maximum number of tries: 1 duration: 10 s number of transactions actually processed: 253530 number of failed transactions: 0 (0.000%) latency average = 0.039 ms initial connection time = 1.846 ms tps = 25357.551681 (without initial connection time) SELECT format_with_to_regtypmod('interval second(0)'); pgbench (17devel) transaction type: format_with_to_regtypmod.sql scaling factor: 1 query mode: simple number of clients: 1 number of threads: 1 maximum number of tries: 1 duration: 10 s number of transactions actually processed: 257942 number of failed transactions: 0 (0.000%) latency average = 0.039 ms initial connection time = 1.544 ms tps = 25798.015526 (without initial connection time) -- Erik
Re: PGC_SIGHUP shared_buffers?
Hi, On 2024-02-18 17:06:09 +0530, Robert Haas wrote: > On Sat, Feb 17, 2024 at 12:38 AM Andres Freund wrote: > > IMO the ability to *shrink* shared_buffers dynamically and cheaply is more > > important than growing it in a way, except that they are related of > > course. Idling hardware is expensive, thus overcommitting hardware is very > > attractive (I count "serverless" as part of that). To be able to overcommit > > effectively, unused long-lived memory has to be released. I.e. shared > > buffers > > needs to be shrinkable. > > I see your point, but people want to scale up, too. Of course, those > people will have to live with what we can practically implement. Sure, I didn't intend to say that scaling up isn't useful. > > Perhaps worth noting that there are two things limiting the size of shared > > buffers: 1) the available buffer space 2) the available buffer *mapping* > > space. I think making the buffer mapping resizable is considerably harder > > than > > the buffers themselves. Of course pre-reserving memory for a buffer mapping > > suitable for a huge shared_buffers is more feasible than pre-allocating all > > that memory for the buffers themselves. But it'd still mean you'd have a > > maximum > > set at server start. > > We size the fsync queue based on shared_buffers too. That's a lot less > important, though, and could be worked around in other ways. We probably should address that independently of making shared_buffers PGC_SIGHUP. The queue gets absurdly large once s_b hits a few GB. It's not that much memory compared to the buffer blocks themselves, but a sync queue of many millions of entries just doesn't make sense. And a few hundred MB for that isn't nothing either, even if it's just a fraction of the space for the buffers. It makes checkpointer more susceptible to OOM as well, because AbsorbSyncRequests() allocates an array to copy all requests into local memory.
> > Such a scheme still leaves you with a dependent memory read for a quite > > frequent operation. It could turn out to not matter hugely if the mapping > > array is cache resident, but I don't know if we can realistically bank on > > that. > > I don't know, either. I was hoping you did. :-) > > But we can rig up a test pretty easily, I think. We can just create a > fake mapping that gives the same answers as the current calculation > and then beat on it. Of course, if testing shows no difference, there > is the small problem of knowing whether the test scenario was right; > and it's also possible that an initial impact could be mitigated by > removing some gratuitously repeated buffer # -> buffer address > mappings. Still, I think it could provide us with a useful baseline. > I'll throw something together when I have time, unless someone beats > me to it. I think such a test would be useful, although I also don't know how confident we would be if we saw positive results. Probably depends a bit on the generated code and how plausible it is to not see regressions. > > I'm also somewhat concerned about the coarse granularity being problematic. > > It > > seems like it'd lead to a desire to make the granule small, causing > > slowness. > > How many people set shared_buffers to something that's not a whole > number of GB these days? I'd say the vast majority of postgres instances in production run with less than 1GB of s_b. Just because numbers wise the majority of instances are running on small VMs and/or many PG instances are running on one larger machine. There are a lot of instances where the total available memory is less than 2GB. > I mean I bet it happens, but in practice if you rounded to the nearest GB, > or even the nearest 2GB, I bet almost nobody would really care. I think it's > fine to be opinionated here and hold the line at a relatively large granule, > even though in theory people could want something else. I don't believe that at all unfortunately.
> > One big advantage of a scheme like this is that it'd be a step towards a > > NUMA > > aware buffer mapping and replacement. Practically everything beyond the size > > of a small consumer device these days has NUMA characteristics, even if not > > "officially visible". We could make clock sweeps (or a better victim buffer > > selection algorithm) happen within each "chunk", with some additional > > infrastructure to choose which of the chunks to search a buffer in. Using a > > chunk on the current numa node, except when there is a lot of imbalance > > between buffer usage or replacement rate between chunks. > > I also wondered whether this might be a useful step toward allowing > different-sized buffers in the same buffer pool (ducks, runs away > quickly). I don't have any particular use for that myself, but it's a > thing some people probably want for some reason or other. I still think that that's something that will just cause a significant cost in complexity, and secondarily also runtime overhead, at a comparatively marginal gain.
Re: Why is pq_begintypsend so slow?
Hi, On 2024-02-18 17:38:09 +0900, Sutou Kouhei wrote: > In <20240218015955.rmw5mcmobt5hb...@awork3.anarazel.de> > "Re: Why is pq_begintypsend so slow?" on Sat, 17 Feb 2024 17:59:55 -0800, > Andres Freund wrote: > > > v3-0008-wip-make-in-out-send-recv-calls-in-copy.c-cheaper.patch > > It seems that this is an alternative approach of [1]. Note that what I posted was a very lightly polished rebase of a ~4 year old patchset. > [1] > https://www.postgresql.org/message-id/flat/20240215.153421.96888103784986803.kou%40clear-code.com#34df359e6d255795d16814ce138cc995 > > > +typedef struct CopyInAttributeInfo > +{ > + AttrNumber num; > + const char *name; > + > + Oid typioparam; > + int32 typmod; > + > + FmgrInfoin_finfo; > + union > + { > + FunctionCallInfoBaseData fcinfo; > + charfcinfo_data[SizeForFunctionCallInfo(3)]; > + } in_fcinfo; > > Do we need one FunctionCallInfoBaseData for each attribute? > How about sharing one FunctionCallInfoBaseData by all > attributes like [1]? That makes no sense to me. You're throwing away most of the possible gains by having to update the FunctionCallInfo fields on every call. You're saving neglegible amounts of memory at a substantial runtime cost. > @@ -956,20 +956,47 @@ NextCopyFrom(CopyFromState cstate, ExprContext > *econtext, > > values[m] = ExecEvalExpr(defexprs[m], econtext, > [m]); > } > - > - /* > - * If ON_ERROR is specified with IGNORE, skip rows with > soft > - * errors > - */ > - else if (!InputFunctionCallSafe(_functions[m], > - > string, > - > typioparams[m], > - > att->atttypmod, > - > (Node *) cstate->escontext, > - > [m])) > > Inlining InputFuncallCallSafe() here to use pre-initialized > fcinfo will decrease maintainability. Because we abstract > InputFunctionCall family in fmgr.c. If we define a > InputFunctionCall variant here, we need to change both of > fmgr.c and here when InputFunctionCall family is changed. > How about defining details in fmgr.c and call it here > instead like [1]? 
I'm not sure I buy that that is a problem. It's not like my approach was actually bypassing fmgr abstractions altogether - instead it just used the lower level APIs, because it's a performance sensitive area. > @@ -1966,7 +1992,7 @@ CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo > *flinfo, > if (fld_size == -1) > { > *isnull = true; > - return ReceiveFunctionCall(flinfo, NULL, typioparam, typmod); > + return ReceiveFunctionCall(fcinfo->flinfo, NULL, > attr->typioparam, attr->typmod); > > Why pre-initialized fcinfo isn't used here? Because it's a prototype and because I don't think it's a common path. Greetings, Andres Freund
Re: Running the fdw test from the terminal crashes into the core-dump
On 2024-Feb-18, Tom Lane wrote: > So I'd blame this on faulty handling of the zero-partitions case in > the RTEPermissionInfo refactoring. (I didn't bisect to prove that > a61b1f748 is exactly where it broke, but I do see that the query > successfully does nothing in v15.) You're right, this is the commit that broke it. It's unclear to me if Amit is available to look at it, so I'll give this a look tomorrow if he isn't. -- Álvaro Herrera 48°01'N 7°57'E — https://www.EnterpriseDB.com/ "Porque francamente, si para saber manejarse a uno mismo hubiera que rendir examen... ¿Quién es el machito que tendría carnet?" (Mafalda)
Re: Transaction timeout
Alexander, thanks for pushing this! This is a small but very awaited feature. > On 16 Feb 2024, at 02:08, Andres Freund wrote: > > Isn't this test going to be very fragile on busy / slow machines? What if the > pg_sleep() takes one second, because there were other tasks to schedule? I'd > be surprised if this didn't fail under valgrind, for example. Even more robust tests that were bullet-proof in CI previously exhibited some failures on buildfarm. Currently there are 5 failures through this weekend. Failing tests are testing interaction of idle_in_transaction_session_timeout vs transaction_timeout(5), and rescheduling transaction_timeout(6). Symptoms: [0] transaction timeout occurs when it is being scheduled. Seems like SET was running too long. step s6_begin: BEGIN ISOLATION LEVEL READ COMMITTED; step s6_tt: SET statement_timeout = '1s'; SET transaction_timeout = '10ms'; +s6: FATAL: terminating connection due to transaction timeout step checker_sleep: SELECT pg_sleep(0.1); [1] transaction timeout 10ms is not detected after 1s step s6_check: SELECT count(*) FROM pg_stat_activity WHERE application_name = 'isolation/timeouts/s6'; count - -0 +1 [2] transaction timeout is not detected in both session 5 and session 6. So far no single animal reported failures twice, so it's hard to say anything about frequency. But it seems to be a significant source of failures. So far I have these ideas: 1. Remove test sessions 5 and 6. But it seems a little strange that session 3 did not fail at all (it is testing interaction of statement_timeout and transaction_timeout). This test is very similar to test session 5... 2. Increase wait times. step checker_sleep { SELECT pg_sleep(0.1); } Seems not enough to observe backend timed out from pg_stat_activity. But this won't help from [0]. 3. Reuse waiting INJECTION_POINT from [3] to make timeout tests deterministic and safe from race conditions. With waiting injection points we can wait as much as needed in current environment.
Any advice is welcome. Best regards, Andrey Borodin. [0] https://buildfarm.postgresql.org/cgi-bin/show_log.pl?nm=tamandua=2024-02-16%2020%3A06%3A51 [1] https://buildfarm.postgresql.org/cgi-bin/show_log.pl?nm=kestrel=2024-02-16%2001%3A45%3A10 [2] https://buildfarm.postgresql.org/cgi-bin/show_log.pl?nm=fairywren=2024-02-17%2001%3A55%3A45 [3] https://www.postgresql.org/message-id/0925f9a9-4d53-4b27-a87e-3d83a757b...@yandex-team.ru
Re: Running the fdw test from the terminal crashes into the core-dump
Alena Rybakina writes: > After starting the server (initdb + pg_ctl start) I ran a regress test > create_misc.sql ('\i src/test/regress/sql/create_misc.sql') and, after > that, > I ran the fdw test ('\i contrib/postgres_fdw/sql/postgres_fdw.sql') in > the psql, and it failed in the core-dump due to the worked assert. > To be honest, such a failure occurred only if the fdw extension was not > installed earlier. Thanks for the report! This can be reproduced more simply with z=# create table test (a int, b text) partition by list(a); CREATE TABLE z=# merge into test using (select 1, 'foo') as source on (true) when matched then do nothing; server closed the connection unexpectedly The MERGE produces a query tree with :rtable ( {RANGETBLENTRY :alias <> :eref {ALIAS :aliasname test :colnames ("a" "b") } :rtekind 0 :relid 49152 :relkind p :rellockmode 3 :tablesample <> :perminfoindex 1 :lateral false :inh true :inFromCl false :securityQuals <> } ... ) :rteperminfos ( {RTEPERMISSIONINFO :relid 49152 :inh true :requiredPerms 0 :checkAsUser 0 :selectedCols (b) :insertedCols (b) :updatedCols (b) } ) and that zero for requiredPerms is what leads to the assertion failure later. So I'd blame this on faulty handling of the zero-partitions case in the RTEPermissionInfo refactoring. (I didn't bisect to prove that a61b1f748 is exactly where it broke, but I do see that the query successfully does nothing in v15.) regards, tom lane
Re: Patch: Add parse_type Function
Hi ne 18. 2. 2024 v 19:50 odesílatel Erik Wienhold napsal: > On 2024-02-12 19:20 +0100, Tom Lane wrote: > > I wrote: > > > It strikes me that this is basically to_regtype() with the additional > > > option to return the typmod. That leads to some questions: > > > > BTW, another way that this problem could be approached is to use > > to_regtype() as-is, with a separate function to obtain the typmod: > > > > select format_type(to_regtype('timestamp(4)'), > to_regtypmod('timestamp(4)')); > > > > This is intellectually ugly, since it implies parsing the same > > typename string twice. But on the other hand it avoids the notational > > pain and runtime overhead involved in using a record-returning > > function. So I think it might be roughly a wash for performance. > > Question to think about is which way is easier to use. I don't > > have an opinion particularly; just throwing the idea out there. > > Out of curiosity, I benchmarked this with the attached to_regtypmod() > patch based on David's v5 applied to a6c21887a9. The script running > pgbench and its output are included at the end. > > Just calling parse_type() vs to_regtype()/to_regtypmod() is a wash for > performance as you thought. But format_type() performs better with > to_regtypmod() than with parse_type(). Accessing the record fields > returned by parse_type() adds some overhead. > > to_regtypmod() is better for our use case in pgTAP which relies on > format_type() to normalize the type name. The implementation of > to_regtypmod() is also simpler than parse_type(). Usage-wise, both are > clunky IMO. 
> > Benchmark script: > > #!/usr/bin/env bash > > set -eu > > cat <<'SQL' > parse_type.sql > SELECT parse_type('interval second(0)'); > SQL > > cat <<'SQL' > parse_type_and_format.sql > SELECT format_type(p.typid, p.typmod) FROM parse_type('interval > second(0)') p; > SQL > > cat <<'SQL' > to_regtypmod.sql > SELECT to_regtype('interval second(0)'), to_regtypmod('interval > second(0)'); > SQL > > cat <<'SQL' > to_regtypmod_and_format.sql > SELECT format_type(to_regtype('interval second(0)'), > to_regtypmod('interval second(0)')); > SQL > > for f in \ > parse_type.sql \ > parse_type_and_format.sql \ > to_regtypmod.sql \ > to_regtypmod_and_format.sql > do > pgbench -n -f "$f" -T10 postgres > echo > done > > pgbench output: > > pgbench (17devel) > transaction type: parse_type.sql > scaling factor: 1 > query mode: simple > number of clients: 1 > number of threads: 1 > maximum number of tries: 1 > duration: 10 s > number of transactions actually processed: 277017 > number of failed transactions: 0 (0.000%) > latency average = 0.036 ms > initial connection time = 1.623 ms > tps = 27706.005513 (without initial connection time) > > pgbench (17devel) > transaction type: parse_type_and_format.sql > scaling factor: 1 > query mode: simple > number of clients: 1 > number of threads: 1 > maximum number of tries: 1 > duration: 10 s > number of transactions actually processed: 222487 > number of failed transactions: 0 (0.000%) > latency average = 0.045 ms > initial connection time = 1.603 ms > tps = 22252.095670 (without initial connection time) > > pgbench (17devel) > transaction type: to_regtypmod.sql > scaling factor: 1 > query mode: simple > number of clients: 1 > number of threads: 1 > maximum number of tries: 1 > duration: 10 s > number of transactions actually processed: 276134 > number of failed transactions: 0 (0.000%) > latency average = 0.036 ms > initial connection time = 1.570 ms > tps = 27617.628259 (without initial connection time) > > pgbench (17devel) > transaction 
type: to_regtypmod_and_format.sql > scaling factor: 1 > query mode: simple > number of clients: 1 > number of threads: 1 > maximum number of tries: 1 > duration: 10 s > number of transactions actually processed: 270820 > number of failed transactions: 0 (0.000%) > latency average = 0.037 ms > initial connection time = 1.631 ms > tps = 27086.331104 (without initial connection time) > The overhead of parse_type_and_format can be related to higher planning time. PL/pgSQL can assign composite without usage FROM clause. Regards Pavel > -- > Erik >
Re: Patch: Add parse_type Function
On 2024-02-12 19:20 +0100, Tom Lane wrote: > I wrote: > > It strikes me that this is basically to_regtype() with the additional > > option to return the typmod. That leads to some questions: > > BTW, another way that this problem could be approached is to use > to_regtype() as-is, with a separate function to obtain the typmod: > > select format_type(to_regtype('timestamp(4)'), to_regtypmod('timestamp(4)')); > > This is intellectually ugly, since it implies parsing the same > typename string twice. But on the other hand it avoids the notational > pain and runtime overhead involved in using a record-returning > function. So I think it might be roughly a wash for performance. > Question to think about is which way is easier to use. I don't > have an opinion particularly; just throwing the idea out there. Out of curiosity, I benchmarked this with the attached to_regtypmod() patch based on David's v5 applied to a6c21887a9. The script running pgbench and its output are included at the end. Just calling parse_type() vs to_regtype()/to_regtypmod() is a wash for performance as you thought. But format_type() performs better with to_regtypmod() than with parse_type(). Accessing the record fields returned by parse_type() adds some overhead. to_regtypmod() is better for our use case in pgTAP which relies on format_type() to normalize the type name. The implementation of to_regtypmod() is also simpler than parse_type(). Usage-wise, both are clunky IMO. 
Benchmark script: #!/usr/bin/env bash set -eu cat <<'SQL' > parse_type.sql SELECT parse_type('interval second(0)'); SQL cat <<'SQL' > parse_type_and_format.sql SELECT format_type(p.typid, p.typmod) FROM parse_type('interval second(0)') p; SQL cat <<'SQL' > to_regtypmod.sql SELECT to_regtype('interval second(0)'), to_regtypmod('interval second(0)'); SQL cat <<'SQL' > to_regtypmod_and_format.sql SELECT format_type(to_regtype('interval second(0)'), to_regtypmod('interval second(0)')); SQL for f in \ parse_type.sql \ parse_type_and_format.sql \ to_regtypmod.sql \ to_regtypmod_and_format.sql do pgbench -n -f "$f" -T10 postgres echo done pgbench output: pgbench (17devel) transaction type: parse_type.sql scaling factor: 1 query mode: simple number of clients: 1 number of threads: 1 maximum number of tries: 1 duration: 10 s number of transactions actually processed: 277017 number of failed transactions: 0 (0.000%) latency average = 0.036 ms initial connection time = 1.623 ms tps = 27706.005513 (without initial connection time) pgbench (17devel) transaction type: parse_type_and_format.sql scaling factor: 1 query mode: simple number of clients: 1 number of threads: 1 maximum number of tries: 1 duration: 10 s number of transactions actually processed: 222487 number of failed transactions: 0 (0.000%) latency average = 0.045 ms initial connection time = 1.603 ms tps = 22252.095670 (without initial connection time) pgbench (17devel) transaction type: to_regtypmod.sql scaling factor: 1 query mode: simple number of clients: 1 number of threads: 1 maximum number of tries: 1 duration: 10 s number of transactions actually processed: 276134 number of failed transactions: 0 (0.000%) latency average = 0.036 ms initial connection time = 1.570 ms tps = 27617.628259 (without initial connection time) pgbench (17devel) transaction type: to_regtypmod_and_format.sql scaling factor: 1 query mode: simple number of clients: 1 number of threads: 1 maximum number of tries: 1 duration: 10 s number 
of transactions actually processed: 270820 number of failed transactions: 0 (0.000%) latency average = 0.037 ms initial connection time = 1.631 ms tps = 27086.331104 (without initial connection time) -- Erik >From 0b60432a84d63a7fccaae0fe123a0aa2ae67493b Mon Sep 17 00:00:00 2001 From: Erik Wienhold Date: Sun, 18 Feb 2024 17:33:35 +0100 Subject: [PATCH] Add to_regtypmod() for benchmarking against parse_type() --- src/backend/utils/adt/regproc.c | 18 ++ src/include/catalog/pg_proc.dat | 3 ++ src/test/regress/expected/regproc.out | 51 +++ src/test/regress/sql/regproc.sql | 26 ++ 4 files changed, 98 insertions(+) diff --git a/src/backend/utils/adt/regproc.c b/src/backend/utils/adt/regproc.c index 4fee27a139..6285dc7192 100644 ---
Running the fdw test from the terminal crashes into the core-dump
Hi, hackers! After starting the server (initdb + pg_ctl start) I ran a regress test create_misc.sql ('\i src/test/regress/sql/create_misc.sql') and, after that, I ran the fdw test ('\i contrib/postgres_fdw/sql/postgres_fdw.sql') in the psql, and it failed in the core-dump due to the worked assert. To be honest, such a failure occurred only if the fdw extension was not installed earlier. script to reproduce the error: ./configure CFLAGS='-g -ggdb -O0' --enable-debug --enable-cassert --prefix=`pwd`/tmp_install && make -j8 -s install export CDIR=$(pwd) export PGDATA=$CDIR/postgres_data rm -r $PGDATA mkdir $PGDATA ${CDIR}/tmp_install/bin/initdb -D $PGDATA >> log.txt ${CDIR}/tmp_install/bin/pg_ctl -D $PGDATA -l logfile start ${CDIR}/tmp_install/bin/psql -d postgres -f src/test/regress/sql/create_misc.sql && ${CDIR}/tmp_install/bin/psql -d postgres -f contrib/postgres_fdw/sql/postgres_fdw.sql The query, where the problem is: -- MERGE ought to fail cleanly merge into itrtest using (select 1, 'foo') as source on (true) when matched then do nothing; Coredump: #5 0x555d1451f483 in ExceptionalCondition (conditionName=0x555d146bba13 "requiredPerms != 0", fileName=0x555d146bb7b0 "execMain.c", lineNumber=654) at assert.c:66 #6 0x555d14064ebe in ExecCheckOneRelPerms (perminfo=0x555d1565ef90) at execMain.c:654 #7 0x555d14064d94 in ExecCheckPermissions (rangeTable=0x555d1565eef0, rteperminfos=0x555d1565efe0, ereport_on_violation=true) at execMain.c:623 #8 0x555d140652ca in InitPlan (queryDesc=0x555d156cde50, eflags=0) at execMain.c:850 #9 0x555d140644a8 in standard_ExecutorStart (queryDesc=0x555d156cde50, eflags=0) at execMain.c:266 #10 0x555d140641ec in ExecutorStart (queryDesc=0x555d156cde50, eflags=0) at execMain.c:145 #11 0x555d1432c025 in ProcessQuery (plan=0x555d1565f3e0, sourceText=0x555d1551b020 "merge into itrtest using (select 1, 'foo') as source on (true)\n when matched then do nothing;", params=0x0, queryEnv=0x0, dest=0x555d1565f540, qc=0x7fffc9454080) at pquery.c:155 
#12 0x555d1432dbd8 in PortalRunMulti (portal=0x555d15597ef0, isTopLevel=true, setHoldSnapshot=false, dest=0x555d1565f540, altdest=0x555d1565f540, qc=0x7fffc9454080) at pquery.c:1277 #13 0x555d1432d0cf in PortalRun (portal=0x555d15597ef0, count=9223372036854775807, isTopLevel=true, run_once=true, dest=0x555d1565f540, altdest=0x555d1565f540, qc=0x7fffc9454080) at pquery.c:791 #14 0x555d14325ec8 in exec_simple_query ( --Type for more, q to quit, c to continue without paging-- query_string=0x555d1551b020 "merge into itrtest using (select 1, 'foo') as source on (true)\n when matched then do nothing;") at postgres.c:1273 #15 0x555d1432ae4c in PostgresMain (dbname=0x555d1ab0 "aaa", username=0x555d1a98 "alena") at postgres.c:4645 #16 0x555d14244a5d in BackendRun (port=0x555d1554b3b0) at postmaster.c:4440 #17 0x555d14244072 in BackendStartup (port=0x555d1554b3b0) at postmaster.c:4116 #18 0x555d142405c5 in ServerLoop () at postmaster.c:1768 #19 0x555d1423fec2 in PostmasterMain (argc=3, argv=0x555d1547fcf0) at postmaster.c:1467 #20 0x555d140f3122 in main (argc=3, argv=0x555d1547fcf0) at main.c:198 This error is consistently reproduced. To be honest, I wasn't able to fully figure out the reason for this, but it seems that this operation on partitions should not be available at all? alena@postgres=# SELECT relname, relkind FROM pg_class where relname='itrtest'; relname | relkind -+- itrtest | p (1 row) -- Regards, Alena Rybakina Postgres Professional: http://www.postgrespro.com The Russian Postgres Company
Re: Removing unneeded self joins
18.02.2024 19:18, Alexander Korotkov wrote: Attached is a draft patch fixing this query. Could you, please, recheck? Yes, this patch fixes the behavior for that query (I've also tried several similar queries). Though I don't know the code well enough to judge the code change. Best regards, Alexander
Re: Memory consumed by child SpecialJoinInfo in partitionwise join planning
Hi, I took a quick look at this patch today. I certainly agree with the intent to reduce the amount of memory during planning, assuming it's not overly disruptive. And I think this patch is fairly localized and looks sensible. That being said, I'm not a big fan of using a local variable on stack and filling it. I'd probably go with the usual palloc/pfree, because that makes it much easier to use - the callers would not be responsible for allocating the SpecialJoinInfo struct. Sure, it's a little bit of overhead, but with the AllocSet caching I doubt it's measurable. I did put this through check-world on amd64/arm64, with valgrind, without any issue. I also tried the scripts shared by Ashutosh in his initial message (with some minor fixes, adding MEMORY to explain etc). The results with the 20240130 patches are like this:

  tables | master | patched
  -------|--------|--------
       2 |   40.8 |    39.9
       3 |  151.7 |   142.6
       4 |  464.0 |   418.5
       5 | 1663.9 |  1419.5

That's certainly a nice improvement, and it even reduces the amount of time for planning (the 5-table join goes from 18s to 17s on my laptop). That's nice, although 17 seconds for planning is not ... great. That being said, the amount of remaining memory needed by planning is still pretty high - we save ~240MB for a join of 5 tables, but we still need ~1.4GB. Yes, this is a bit of an extreme example, and it probably is not very common to join 5 tables with 1000 partitions each ... Do we know what are the other places consuming the 1.4GB of memory? Considering my recent thread about scalability, where malloc() turned out to be one of the main culprits, I wonder if maybe there's a lot to gain by reducing the memory usage ... Our attitude to memory usage is that it doesn't really matter if we keep it allocated for a bit, because we'll free it shortly. And that may be true for "modest" memory usage, but with 1.4GB that doesn't seem great, and the malloc overhead can be pretty bad. regards -- Tomas Vondra EnterpriseDB: http://www.enterprisedb.com The Enterprise PostgreSQL Company
Re: Thoughts about NUM_BUFFER_PARTITIONS
On 2/18/24 03:30, Li Japin wrote: > > >> On Feb 10, 2024, at 20:15, Tomas Vondra >> wrote: >> >> On 2/8/24 14:27, wenhui qiu wrote: >>> Hi Heikki Linnakangas >>>I think the larger shared buffer higher the probability of multiple >>> backend processes accessing the same bucket slot BufMappingLock >>> simultaneously, ( InitBufTable(NBuffers + NUM_BUFFER_PARTITIONS); When I >>> have free time, I want to do this test. I have seen some tests, but the >>> result report is in Chinese >>> >> >> I think Heikki is right this is unrelated to the amount of RAM. The >> partitions are meant to reduce the number of lock collisions when >> multiple processes try to map a buffer concurrently. But the machines >> got much larger in this regard too - in 2006 the common CPUs had maybe >> 2-4 cores, now it's common to have CPUs with ~100 cores, and systems >> with multiple of them. OTOH the time spent holing the partition lock >> should be pretty low, IIRC we pretty much just pin the buffer before >> releasing it, and the backend should do plenty other expensive stuff. So >> who knows how many backends end up doing the locking at the same time. >> >> OTOH, with 128 partitions it takes just 14 backends to have 50% chance >> of a conflict, so with enough cores ... But how many partitions would be >> enough? With 1024 partitions it still takes only 38 backends to get 50% >> chance of a collision. Better, but considering we now have hundreds of >> cores, not sure if sufficient. >> >> (Obviously, we probably want much lower probability of a collision, I >> only used 50% to illustrate the changes). >> > > I find it seems need to change MAX_SIMUL_LWLOCKS if we enlarge the > NUM_BUFFER_PARTITIONS, > I didn’t find any comments to describe the relation between MAX_SIMUL_LWLOCKS > and > NUM_BUFFER_PARTITIONS, am I missing someghing? 
IMHO the relationship is pretty simple - MAX_SIMUL_LWLOCKS needs to be higher than NUM_BUFFER_PARTITIONS, so that the backend can acquire all the partition locks if needed. There's other places that acquire a bunch of locks, and all of them need to be careful not to exceed MAX_SIMUL_LWLOCKS. For example gist has GIST_MAX_SPLIT_PAGES. regards -- Tomas Vondra EnterpriseDB: http://www.enterprisedb.com The Enterprise PostgreSQL Company
Re: Removing unneeded self joins
On Sun, Feb 18, 2024 at 5:04 PM Alexander Korotkov wrote: > On Sun, Feb 18, 2024 at 3:00 PM Alexander Lakhin wrote: > > 09.01.2024 01:09, Alexander Korotkov wrote: > > > > Fixed in 30b4955a46. > > > > > > Please look at the following query which fails with an error since > > d3d55ce57: > > > > create table t (i int primary key); > > > > select t3.i from t t1 > > join t t2 on t1.i = t2.i, > > lateral (select t1.i limit 1) t3; > > > > ERROR: non-LATERAL parameter required by subquery > > Thank you for spotting. I'm looking at this. Attached is a draft patch fixing this query. Could you, please, recheck? -- Regards, Alexander Korotkov 0001-Replace-relids-in-lateral-subquery-target-list-du-v1.patch Description: Binary data
Re: Removing unneeded self joins
On Sun, Feb 18, 2024 at 3:00 PM Alexander Lakhin wrote: > 09.01.2024 01:09, Alexander Korotkov wrote: > > Fixed in 30b4955a46. > > > Please look at the following query which fails with an error since > d3d55ce57: > > create table t (i int primary key); > > select t3.i from t t1 > join t t2 on t1.i = t2.i, > lateral (select t1.i limit 1) t3; > > ERROR: non-LATERAL parameter required by subquery Thank you for spotting. I'm looking at this. -- Regards, Alexander Korotkov
RE: Synchronizing slots from primary to standby
On Friday, February 16, 2024 6:41 PM shveta malik wrote: > > On Thu, Feb 15, 2024 at 10:48 PM Bertrand Drouvot > wrote: > > > > Looking at v88_0001, random comments: > > Thanks for the feedback. > > > > > 1 === > > > > Commit message "Be enabling slot synchronization" > > > > Typo? s:Be/By > > Modified. > > > 2 === > > > > +It enables a physical standby to synchronize logical failover slots > > +from the primary server so that logical subscribers are not blocked > > +after failover. > > > > Not sure "not blocked" is the right wording. > > "can be resumed from the new primary" maybe? (was discussed in [1]) > > Modified. > > > 3 === > > > > +#define SlotSyncWorkerAllowed()\ > > + (sync_replication_slots && pmState == PM_HOT_STANDBY && \ > > +SlotSyncWorkerCanRestart()) > > > > Maybe add a comment above the macro explaining the logic? > > Done. > > > 4 === > > > > +#include "replication/walreceiver.h" > > #include "replication/slotsync.h" > > > > should be reverse order? > > Removed walreceiver.h inclusion as it was not needed. > > > 5 === > > > > + if (SlotSyncWorker->syncing) > > { > > - SpinLockRelease(>mutex); > > + SpinLockRelease(>mutex); > > ereport(ERROR, > > > errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), > > errmsg("cannot synchronize replication > slots concurrently")); > > } > > > > worth to add a test in 040_standby_failover_slots_sync.pl for it? > > It will be very difficult to stabilize this test as we have to make sure that > the > concurrent users (SQL function(s) and/or worker(s)) are in that target > function at > the same time to hit it. > > > > > 6 === > > > > +static void > > +slotsync_reread_config(bool restart) > > +{ > > > > worth to add test(s) in 040_standby_failover_slots_sync.pl for it? > > Added test. > > Please find v89 patch set. The other changes are: Thanks for the patch. Here are few comments: 1. 
+static char * +get_dbname_from_conninfo(const char *conninfo) +{ + static char *dbname; + + if (dbname) + return dbname; + else + dbname = walrcv_get_dbname_from_conninfo(conninfo); + + return dbname; +} I think it's not necessary to have a static variable here, because the guc check doesn't seem performance sensitive. Additionally, it does not work well with slotsync SQL functions, because the dbname will be allocated in temp memory context when reaching here via SQL function, so it's not safe to access this static variable in the next function call. 2. +static bool +validate_remote_info(WalReceiverConn *wrconn, int elevel) ... + + return (!remote_in_recovery && primary_slot_valid); The primary_slot_valid could be uninitialized in this function. Best Regards, Hou zj
Re: PGC_SIGHUP shared_buffers?
On Sun, 18 Feb 2024 at 02:03, Andres Freund wrote: > > Hi, > > On 2024-02-17 23:40:51 +0100, Matthias van de Meent wrote: > > > 5. Re-map the shared_buffers when needed. > > > > > > Between transactions, a backend should not hold any buffer pins. When > > > there are no pins, you can munmap() the shared_buffers and mmap() it at > > > a different address. > > I hadn't quite realized that we don't seem to rely on shared_buffers having a > specific address across processes. That does seem to make it a more viable to > remap mappings in backends. > > > However, I don't think this works with mmap(MAP_ANONYMOUS) - as long as we are > using the process model. To my knowledge there is no way to get the same > mapping in multiple already existing processes. Even mmap()ing /dev/zero after > sharing file descriptors across processes doesn't work, if I recall correctly. > > We would have to use sysv/posix shared memory or such (or mmap() if files in > tmpfs) for the shared buffers allocation. > > > > > This can quite realistically fail to find an unused memory region of > > sufficient size when the heap is sufficiently fragmented, e.g. through > > ASLR, which would make it difficult to use this dynamic > > single-allocation shared_buffers in security-hardened environments. > > I haven't seen anywhere close to this bad fragmentation on 64bit machines so > far - have you? No. > Most implementations of ASLR randomize mmap locations across multiple runs of > the same binary, not within the same binary. There are out-of-tree linux > patches that make mmap() randomize every single allocation, but I am not sure > that we ought to care about such things. After looking into ASLR a bit more, I realise I was under the mistaken impression that ASLR would implicate randomized mmaps(), too. Apparently, that's wrong; ASLR only does some randomization for the initialization of the process memory layout, and not the process' allocations. 
> Even if we were to care, on 64bit platforms it doesn't seem likely that we'd > run out of space that quickly. AMD64 had 48bits of virtual address space from > the start, and on recent CPUs that has grown to 57bits [1], that's a lot of > space. Yeah, that's a lot of space, but it seems to me it's also easily consumed; one only needs to allocate one allocation in every 4GB of address space to make allocations of 8GB impossible; a utilization of ~1 byte/MiB. Applying this to 48 bits of virtual address space, a process only needs to use ~256MB of memory across the address space to block out any 8GB allocations; for 57 bits that's still "only" 128GB. But after looking at ASLR a bit more, it is unrealistic that a normal OS and process stack would get to allocating memory in such a pattern. > And if you do run out of VM space, wouldn't that also affect lots of other > things, like mmap() for malloc? Yes. But I would usually expect that the main shared memory allocation would be the single largest uninterrupted allocation, so I'd also expect it to see more such issues than any current user of memory if we were to start moving (reallocating) that allocation. Kind regards, Matthias van de Meent Neon (https://neon.tech)
Re: PGC_SIGHUP shared_buffers?
On 16/02/2024 10:37 pm, Thomas Munro wrote: On Fri, Feb 16, 2024 at 5:29 PM Robert Haas wrote: 3. Reserve lots of address space and then only use some of it. I hear rumors that some forks of PG have implemented something like this. The idea is that you convince the OS to give you a whole bunch of address space, but you try to avoid having all of it be backed by physical memory. If you later want to increase shared_buffers, you then get the OS to back more of it by physical memory, and if you later want to decrease shared_buffers, you hopefully have some way of giving the OS the memory back. As compared with the previous two approaches, this seems less likely to be noticeable to most PG code. Problems include (1) you have to somehow figure out how much address space to reserve, and that forms an upper bound on how big shared_buffers can grow at runtime and (2) you have to figure out ways to reserve address space and back more or less of it with physical memory that will work on all of the platforms that we currently support or might want to support in the future. FTR I'm aware of a working experimental prototype along these lines, that will be presented in Vancouver: https://www.pgevents.ca/events/pgconfdev2024/sessions/session/31-enhancing-postgresql-plasticity-new-frontiers-in-memory-management/ If you are interested - this is my attempt to implement resizable shared buffers based on ballooning: https://github.com/knizhnik/postgres/pull/2 Unused memory is returned to OS using `madvise` (so it is not a very portable solution). Unfortunately there are really many data structures in Postgres whose size depends on the number of buffers. In my PR I am using the `GetAvailableBuffers()` function instead of `NBuffers`. But it doesn't always help because many of these data structures cannot be reallocated. Other important limitations of this approach are: 1. It is necessary to specify the maximal number of shared buffers. 2. 
Only the `BufferBlocks` space is shrunk, but not the buffer descriptors and buffer hash. The estimated memory footprint for one page is 132 bytes. If we want to scale shared buffers from 100Mb to 100Gb, the size of used memory will be 1.6Gb. And it is quite large. 3. Our CLOCK algorithm becomes very inefficient for a large number of shared buffers. Below are the first results (pgbench database with scale 100, pgbench -c 32 -j 4 -T 100 -P1 -M prepared -S) I get:

  | shared_buffers | available_buffers | TPS  |
  | -------------- | ----------------- | ---- |
  | 128MB          | -1                | 280k |
  | 1GB            | -1                | 324k |
  | 2GB            | -1                | 358k |
  | 32GB           | -1                | 350k |
  | 2GB            | 128Mb             | 130k |
  | 2GB            | 1Gb               | 311k |
  | 32GB           | 128Mb             | 13k  |
  | 32GB           | 1Gb               | 140k |
  | 32GB           | 2Gb               | 348k |

`shared_buffers` specifies the maximal shared buffers size and `available_buffers` - the current limit. So when shared_buffers >> available_buffers and the dataset doesn't fit in them, we get an awful degradation of performance (> 20 times). Thanks to the CLOCK algorithm. My first thought is to replace clock with an LRU based on a double-linked list. As far as there is no lockless double-list implementation, it needs some global lock. This lock can become a bottleneck. The standard solution is partitioning: use N LRU lists instead of 1. Just as the partitioned hash table used by the buffer manager to look up buffers. Actually we can use the same partition locks to protect the LRU list. But it is not clear what to do with ring buffers (strategies). So I decided not to perform such a revolution in bufmgr, but to optimize clock to more efficiently split reserved buffers. Just add a `skip_count` field to the buffer descriptor. And it helps! Now the worst case shared_buffers/available_buffers = 32Gb/128Mb shows the same performance, 280k, as shared_buffers=128Mb without ballooning.
Re: Removing unneeded self joins
Hi Alexander, 09.01.2024 01:09, Alexander Korotkov wrote: Fixed in 30b4955a46. Please look at the following query which fails with an error since d3d55ce57: create table t (i int primary key); select t3.i from t t1 join t t2 on t1.i = t2.i, lateral (select t1.i limit 1) t3; ERROR: non-LATERAL parameter required by subquery Best regards, Alexander
Re: PGC_SIGHUP shared_buffers?
On Sat, Feb 17, 2024 at 1:54 AM Heikki Linnakangas wrote: > A variant of this approach: > > 5. Re-map the shared_buffers when needed. > > Between transactions, a backend should not hold any buffer pins. When > there are no pins, you can munmap() the shared_buffers and mmap() it at > a different address. I really like this idea, but I think Andres has latched onto the key issue, which is that it supposes that the underlying shared memory object upon which shared_buffers is based can be made bigger and smaller, and that doesn't work for anonymous mappings AFAIK. Maybe that's not really a problem any more, though. If we don't depend on the address of shared_buffers anywhere, we could move it into a DSM. Now that the stats collector uses DSM, it's surely already a requirement that DSM works on every machine that runs PostgreSQL. We'd still need to do something about the buffer mapping table, though, and I bet dshash is not a reasonable answer on performance grounds. Also, it would be nice if the granularity of resizing could be something less than a whole transaction, because transactions can run for a long time. We don't really need to wait for a transaction boundary, probably -- a time when we hold zero buffer pins will probably happen a lot sooner, and at least some of those should be safe points at which to remap. Then again, somebody can open a cursor, read from it until it holds a pin, and then either idle the connection or make it do arbitrary amounts of unrelated work, forcing the remapping to be postponed for an arbitrarily long time. But some version of this problem will exist in any approach to this problem, and long-running pins are a nuisance for other reasons, too. We probably just have to accept this sort of issue as a limitation of our implementation. -- Robert Haas EDB: http://www.enterprisedb.com
Re: PGC_SIGHUP shared_buffers?
On Sat, Feb 17, 2024 at 12:38 AM Andres Freund wrote: > IMO the ability to *shrink* shared_buffers dynamically and cheaply is more > important than growing it in a way, except that they are related of > course. Idling hardware is expensive, thus overcommitting hardware is very > attractive (I count "serverless" as part of that). To be able to overcommit > effectively, unused long-lived memory has to be released. I.e. shared buffers > needs to be shrinkable. I see your point, but people want to scale up, too. Of course, those people will have to live with what we can practically implement. > Perhaps worth noting that there are two things limiting the size of shared > buffers: 1) the available buffer space 2) the available buffer *mapping* > space. I think making the buffer mapping resizable is considerably harder than > the buffers themselves. Of course pre-reserving memory for a buffer mapping > suitable for a huge shared_buffers is more feasible than pre-allocating all > that memory for the buffers themselves. But it' still mean youd have a maximum > set at server start. We size the fsync queue based on shared_buffers too. That's a lot less important, though, and could be worked around in other ways. > Such a scheme still leaves you with a dependend memory read for a quite > frequent operation. It could turn out to nto matter hugely if the mapping > array is cache resident, but I don't know if we can realistically bank on > that. I don't know, either. I was hoping you did. :-) But we can rig up a test pretty easily, I think. We can just create a fake mapping that gives the same answers as the current calculation and then beat on it. Of course, if testing shows no difference, there is the small problem of knowing whether the test scenario was right; and it's also possible that an initial impact could be mitigated by removing some gratuitously repeated buffer # -> buffer address mappings. Still, I think it could provide us with a useful baseline. 
I'll throw something together when I have time, unless someone beats me to it. > I'm also somewhat concerned about the coarse granularity being problematic. It > seems like it'd lead to a desire to make the granule small, causing slowness. How many people set shared_buffers to something that's not a whole number of GB these days? I mean I bet it happens, but in practice if you rounded to the nearest GB, or even the nearest 2GB, I bet almost nobody would really care. I think it's fine to be opinionated here and hold the line at a relatively large granule, even though in theory people could want something else. Alternatively, maybe there could be a provision for the last granule to be partial, and if you extend further, you throw away the partial granule and replace it with a whole one. But I'm not even sure that's worth doing. > One big advantage of a scheme like this is that it'd be a step towards a NUMA > aware buffer mapping and replacement. Practically everything beyond the size > of a small consumer device these days has NUMA characteristics, even if not > "officially visible". We could make clock sweeps (or a better victim buffer > selection algorithm) happen within each "chunk", with some additional > infrastructure to choose which of the chunks to search a buffer in. Using a > chunk on the current numa node, except when there is a lot of imbalance > between buffer usage or replacement rate between chunks. I also wondered whether this might be a useful step toward allowing different-sized buffers in the same buffer pool (ducks, runs away quickly). I don't have any particular use for that myself, but it's a thing some people probably want for some reason or other. > > 2. Make a Buffer just a disguised pointer. Imagine something like > > typedef struct { Page bp; } *buffer. WIth this approach, > > BufferGetBlock() becomes trivial. > > You also additionally need something that allows for efficient iteration over > all shared buffers. 
Making buffer replacement and checkpointing more expensive > isn't great. True, but I don't really see what the problem with this would be in this approach. > > 3. Reserve lots of address space and then only use some of it. I hear > > rumors that some forks of PG have implemented something like this. The > > idea is that you convince the OS to give you a whole bunch of address > > space, but you try to avoid having all of it be backed by physical > > memory. If you later want to increase shared_buffers, you then get the > > OS to back more of it by physical memory, and if you later want to > > decrease shared_buffers, you hopefully have some way of giving the OS > > the memory back. As compared with the previous two approaches, this > > seems less likely to be noticeable to most PG code. > > Another advantage is that you can shrink shared buffers fairly granularly and > cheaply with that approach, compared to having to move buffers entirely out of > a larger mapping to be able to unmap it. Don't you have to still
Re: Things I don't like about \du's "Attributes" column
On 17.02.2024 00:37, David G. Johnston wrote: On Mon, Feb 12, 2024 at 2:29 PM Pavel Luzanov wrote: regress_du_role1 | no| Inherit | no| 2024-12-31 00:00:00+03(invalid) | 50 | Group role without password but with valid until regress_du_role2 | yes | Inherit | yes | | Not allowed | No connections allowed regress_du_role3 | yes | | yes | | 10 | User without attributes regress_du_su| yes | Superuser +| yes | | 3(ignored) | Superuser but with connection limit Per the recent bug report, we should probably add something like (ignored) after the 50 connections for role1 since they are not allowed to login so the value is indeed ignored. Hm, but the same logic applies to "Password?" and "Valid until" for role1 without login attribute. The challenge is how to display it for unprivileged users. But they can't see password information. So, displaying 'Valid until' as '(irrelevant)' for privileged users and the real value for others looks bad. What can be done in this situation? 0. Show different values as described above. 1. Don't show 'Valid until' for unprivileged users at all. The same logic as for 'Password?'. With possible exception: user can see 'Valid until' for himself. May be too complicated? 2. Tom's advice: Not sure it's worth worrying about Show real values for 'Valid until' and 'Connection limit' without any hints. 3. The best solution, which I can't see now. -- Pavel Luzanov Postgres Professional: https://postgrespro.com
Re: Properly pathify the union planner
David Rowley writes: >> >> The two if-clauses look unnecessary, it should be handled by >> pathkeys_count_contained_in already. The same issue exists in >> pathkeys_useful_for_ordering as well. Attached patch fixes it in master. > > I agree. I'd rather not have those redundant checks in > pathkeys_useful_for_setop(), and I do want those functions to be as > similar as possible. So I think adjusting it in master is a good > idea. > > I've pushed your patch. > Thanks for pushing it! -- Best Regards Andy Fan
Re: Why is pq_begintypsend so slow?
Hi, In <20240218015955.rmw5mcmobt5hb...@awork3.anarazel.de> "Re: Why is pq_begintypsend so slow?" on Sat, 17 Feb 2024 17:59:55 -0800, Andres Freund wrote: > v3-0008-wip-make-in-out-send-recv-calls-in-copy.c-cheaper.patch It seems that this is an alternative approach of [1]. [1] https://www.postgresql.org/message-id/flat/20240215.153421.96888103784986803.kou%40clear-code.com#34df359e6d255795d16814ce138cc995 +typedef struct CopyInAttributeInfo +{ + AttrNumber num; + const char *name; + + Oid typioparam; + int32 typmod; + + FmgrInfoin_finfo; + union + { + FunctionCallInfoBaseData fcinfo; + charfcinfo_data[SizeForFunctionCallInfo(3)]; + } in_fcinfo; Do we need one FunctionCallInfoBaseData for each attribute? How about sharing one FunctionCallInfoBaseData by all attributes like [1]? @@ -956,20 +956,47 @@ NextCopyFrom(CopyFromState cstate, ExprContext *econtext, values[m] = ExecEvalExpr(defexprs[m], econtext, [m]); } - - /* -* If ON_ERROR is specified with IGNORE, skip rows with soft -* errors -*/ - else if (!InputFunctionCallSafe(_functions[m], - string, - typioparams[m], - att->atttypmod, - (Node *) cstate->escontext, - [m])) Inlining InputFuncallCallSafe() here to use pre-initialized fcinfo will decrease maintainability. Because we abstract InputFunctionCall family in fmgr.c. If we define a InputFunctionCall variant here, we need to change both of fmgr.c and here when InputFunctionCall family is changed. How about defining details in fmgr.c and call it here instead like [1]? + fcinfo->args[0].value = CStringGetDatum(string); + fcinfo->args[0].isnull = false; + fcinfo->args[1].value = ObjectIdGetDatum(attr->typioparam); + fcinfo->args[1].isnull = false; + fcinfo->args[2].value = Int32GetDatum(attr->typmod); + fcinfo->args[2].isnull = false; I think that "fcinfo->isnull = false;" is also needed like [1]. 
@@ -1966,7 +1992,7 @@ CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo *flinfo, if (fld_size == -1) { *isnull = true; - return ReceiveFunctionCall(flinfo, NULL, typioparam, typmod); + return ReceiveFunctionCall(fcinfo->flinfo, NULL, attr->typioparam, attr->typmod); Why pre-initialized fcinfo isn't used here? Thanks, -- kou
Re: serial not accepted as datatype in ALTER TABLE ... ALTER COLUMN
Hi Ashutosh, > data_type is described on that page as "Data type of the new column, > or new data type for an existing column." but CREATE TABLE > documentation [2] redirects data_type to [3], which mentions serial. > The impression created by the documentation is the second statement > above is a valid statement as should not throw an error; instead > change the data type of the column (and create required sequence). I didn't find a reason not to support it; if there is any reason, I think it is better to have some explanation in the documentation. > In code ATPrepAlterColumnType() calls typenameTypeIdAndMod(), whereas > transformColumnDefinition() (called for ALTER TABLE ... ADD COLUMN and > CREATE TABLE) handles "serial" data type separately. Looks like we are > missing a call to transformColumnDefinition() in > transformAlterTableStmt() under case AT_AlterColumnType. I tried your idea with the attachment; it is still in a draft state, but it can be used as a proof-of-concept and as a basis for further discussion. One point worth mentioning is that serial implies a "default value" + "not null" constraint. So when we modify a column into serial, we need to set only the NULL values to the default value at the RewriteTable stage. -- Best Regards Andy Fan >From 4485a9af33c9c7d493e23c2804bde4c15df0b79a Mon Sep 17 00:00:00 2001 From: "yizhi.fzh" Date: Sun, 18 Feb 2024 16:14:29 +0800 Subject: [PATCH v0 1/1] POC: Support ALTER TABLE tableName ALERT COLUMN col type serial. 
--- src/backend/commands/tablecmds.c | 60 +-- src/backend/parser/gram.y | 1 + src/backend/parser/parse_utilcmd.c| 41 - src/include/nodes/parsenodes.h| 1 + src/include/parser/parse_utilcmd.h| 4 +- src/test/regress/expected/alter_table.out | 70 +-- src/test/regress/sql/alter_table.sql | 20 +-- 7 files changed, 180 insertions(+), 17 deletions(-) diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 46ece07338..7e13b04efa 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -231,6 +231,8 @@ typedef struct NewColumnValue Expr *expr; /* expression to compute */ ExprState *exprstate; /* execution state */ bool is_generated; /* is it a GENERATED expression? */ + bool new_on_null; /* set the new value only when the old value + * is NULL. */ } NewColumnValue; /* @@ -5594,7 +5596,8 @@ ATParseTransformCmd(List **wqueue, AlteredTableInfo *tab, Relation rel, atstmt, context->queryString, , - ); + , + context); /* Execute any statements that should happen before these subcommand(s) */ foreach(lc, beforeStmts) @@ -6230,10 +6233,11 @@ ATRewriteTable(AlteredTableInfo *tab, Oid OIDNewHeap, LOCKMODE lockmode) if (ex->is_generated) continue; - newslot->tts_values[ex->attnum - 1] - = ExecEvalExpr(ex->exprstate, - econtext, - >tts_isnull[ex->attnum - 1]); + if (!ex->new_on_null || oldslot->tts_isnull[ex->attnum - 1]) + newslot->tts_values[ex->attnum - 1] + = ExecEvalExpr(ex->exprstate, + econtext, + >tts_isnull[ex->attnum - 1]); } ExecStoreVirtualTuple(newslot); @@ -13284,6 +13288,7 @@ ATPrepAlterColumnType(List **wqueue, newval->attnum = attnum; newval->expr = (Expr *) transform; newval->is_generated = false; + newval->new_on_null = def->from_serial; tab->newvals = lappend(tab->newvals, newval); if (ATColumnChangeRequiresRewrite(transform, attnum)) @@ -13495,6 +13500,48 @@ ATExecAlterColumnType(AlteredTableInfo *tab, Relation rel, HeapTuple depTup; ObjectAddress address; + if (def->from_serial) + { + /* + * Store the 
DEFAULT for the from_serial + */ + /* XXX: copy from ATExecAddColumn */ + RawColumnDefault *rawEnt; + + heapTup = SearchSysCacheCopyAttName(RelationGetRelid(rel), colName); + attTup = (Form_pg_attribute) GETSTRUCT(heapTup); + + rawEnt = (RawColumnDefault *) palloc(sizeof(RawColumnDefault)); + rawEnt->attnum = attTup->attnum; + rawEnt->raw_default = copyObject(def->raw_default); + + /* + * Attempt to skip a complete table rewrite by storing the specified + * DEFAULT value outside of the heap. This may be disabled inside + * AddRelationNewConstraints if the optimization cannot be applied. + */ + rawEnt->missingMode = (!def->generated); + + rawEnt->generated = def->generated; + + /* + * This function is intended for CREATE TABLE, so it processes a + * _list_ of defaults, but we just do one. + */ + AddRelationNewConstraints(rel, list_make1(rawEnt), NIL, + false, true, false, NULL); + + /* Make the additional catalog changes visible */ + CommandCounterIncrement(); + + /* + * Did the request for a missing value work? If not we'll have to do a + * rewrite + */ + if (!rawEnt->missingMode) +