Re: [BUG] pg_dump blocked

2022-11-18 Thread Gilles Darold

Le 17/11/2022 à 17:59, Tom Lane a écrit :

Gilles Darold  writes:

I have an incorrect behavior with pg_dump prior PG version 15. With
PostgreSQL 15, thanks to commit e3fcbbd623b9ccc16cdbda374654d91a4727d173
the problem is gone but for older versions it persists with locks on
partitioned tables.

I didn't want to back-patch e3fcbbd62 at the time, but it's probably aged
long enough now to be safe to back-patch.  If we do anything here,
it should be to back-patch the whole thing, else we've only partially
fixed the issue.



Here are the different patches following the PostgreSQL version from 11 
to 14, they should apply on the corresponding stable branches. The 
patches only concern the move of the unsafe functions, 
pg_get_partkeydef() and pg_get_expr(). They should all apply without 
problem on their respective branch, pg_dump tap regression test passed 
on all versions.


Regards,

--
Gilles Darold
diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c
index e110257395..cbe7c1e01d 100644
--- a/src/bin/pg_dump/pg_dump.c
+++ b/src/bin/pg_dump/pg_dump.c
@@ -6322,14 +6322,15 @@ getTables(Archive *fout, int *numTables)
 	int			i_foreignserver;
 	int			i_is_identity_sequence;
 	int			i_changed_acl;
-	int			i_partkeydef;
 	int			i_ispartition;
-	int			i_partbound;
 	int			i_amname;
 
 	/*
 	 * Find all the tables and table-like objects.
 	 *
+	 * We must fetch all tables in this phase because otherwise we cannot
+	 * correctly identify inherited columns, owned sequences, etc.
+	 *
 	 * We include system catalogs, so that we can work if a user table is
 	 * defined to inherit from a system catalog (pretty weird, but...)
 	 *
@@ -6343,8 +6344,10 @@ getTables(Archive *fout, int *numTables)
 	 *
 	 * Note: in this phase we should collect only a minimal amount of
 	 * information about each table, basically just enough to decide if it is
-	 * interesting. We must fetch all tables in this phase because otherwise
-	 * we cannot correctly identify inherited columns, owned sequences, etc.
+	 * interesting.  In particular, since we do not yet have lock on any user
+	 * table, we MUST NOT invoke any server-side data collection functions
+	 * (for instance, pg_get_partkeydef()).  Those are likely to fail or give
+	 * wrong answers if any concurrent DDL is happening.
 	 *
 	 * We purposefully ignore toast OIDs for partitioned tables; the reason is
 	 * that versions 10 and 11 have them, but 12 does not, so emitting them
@@ -6353,9 +6356,7 @@ getTables(Archive *fout, int *numTables)
 
 	if (fout->remoteVersion >= 90600)
 	{
-		char	   *partkeydef = "NULL";
 		char	   *ispartition = "false";
-		char	   *partbound = "NULL";
 		char	   *relhasoids = "c.relhasoids";
 
 		PQExpBuffer acl_subquery = createPQExpBuffer();
@@ -6374,11 +6375,7 @@ getTables(Archive *fout, int *numTables)
 		 */
 
 		if (fout->remoteVersion >= 10)
-		{
-			partkeydef = "pg_get_partkeydef(c.oid)";
 			ispartition = "c.relispartition";
-			partbound = "pg_get_expr(c.relpartbound, c.oid)";
-		}
 
 		/* In PG12 upwards WITH OIDS does not exist anymore. */
 		if (fout->remoteVersion >= 12)
@@ -6419,7 +6416,7 @@ getTables(Archive *fout, int *numTables)
 		  "CASE WHEN c.relkind = 'f' THEN "
 		  "(SELECT ftserver FROM pg_catalog.pg_foreign_table WHERE ftrelid = c.oid) "
 		  "ELSE 0 END AS foreignserver, "
-		  "CASE WHEN c.reloftype <> 0 THEN c.reloftype::pg_catalog.regtype ELSE NULL END AS reloftype, "
+		  "c.reloftype, "
 		  "d.refobjid AS owning_tab, "
 		  "d.refobjsubid AS owning_col, "
 		  "(SELECT spcname FROM pg_tablespace t WHERE t.oid = c.reltablespace) AS reltablespace, "
@@ -6439,9 +6436,7 @@ getTables(Archive *fout, int *numTables)
 		  "OR %s IS NOT NULL"
 		  "))"
 		  "AS changed_acl, "
-		  "%s AS partkeydef, "
-		  "%s AS ispartition, "
-		  "%s AS partbound "
+		  "%s AS ispartition "
 		  "FROM pg_class c "
 		  "LEFT JOIN pg_depend d ON "
 		  "(c.relkind = '%c' AND "
@@ -6467,9 +6462,7 @@ getTables(Archive *fout, int *numTables)
 		  attracl_subquery->data,
 		  attinitacl_subquery->data,
 		  attinitracl_subquery->data,
-		  partkeydef,
 		  ispartition,
-		  partbound,
 		  RELKIND_SEQUENCE,
 		  RELKIND_PARTITIONED_TABLE,
 		  RELKIND_RELATION, RELKIND_SEQUENCE,
@@ -6512,7 +6505,7 @@ getTables(Archive *fout, int *numTables)
 		  "CASE WHEN c.relkind = 'f' THEN "
 		  "(SELECT ftserver FROM pg_catalog.pg_foreign_table WHERE ftrelid = c.oid) "
 		  "ELSE 0 END AS foreignserver, "
-		  "CASE WHEN c.reloftype <> 0 THEN c.reloftype::pg_catalog.regtype ELSE NULL END AS reloftype, "
+		  "c.reloftype, "
 		  "d.refobjid AS owning_tab, "
 		  "d.refobjsubid AS owning_col, "
 		  "(SELECT spcname FROM pg_tablespace t WHERE t.oid = c.reltablespace) AS reltablespace, "
@@ -6521,9 +6514,7 @@ getTables(Archive *fout, int *numTables)
 		  "WHEN 'check_option=cascaded' = ANY (c.relop

Re: [PoC] Improve dead tuple storage for lazy vacuum

2022-11-18 Thread John Naylor
On Fri, Nov 18, 2022 at 8:20 PM Masahiko Sawada 
wrote:
>
> FYI I've not tested the patch you shared today but here are the
> benchmark results I did with the v9 patch in my environment (I used
> the second filter). I split the 0004 patch into two patches: a patch
> for pure refactoring patch to introduce rt_node_ptr and a patch to do
> pointer tagging.
>
> v9 0003 patch: 1113 1114 1114
> introduce rt_node_ptr: 1127 1128 1128
> pointer tagging  : 1085 1087 1086 (equivalent to 0004 patch)
>
> In my environment, rt_node_ptr seemed to lead some overhead but
> pointer tagging had performance benefits. I'm not sure the reason why
> the results are different from yours. The radix tree stats shows the
> same as your tests.

There is less than 2% difference from the median set of results, so it's
hard to distinguish from noise. I did a fresh rebuild and retested with the
same results: about 15% slowdown in v9 0004. That's strange.

On Wed, Nov 16, 2022 at 10:24 PM Masahiko Sawada 
wrote:

> > filter = (((uint64) 1<<32) | (0xFF<<24));
> > LOG:  num_keys = 944, height = 7, n4 = 47515559, n32 = 6209, n128 =
62632, n256 = 3161
> >
> > 1) Any idea why the tree height would be reported as 7 here? I didn't
expect that.
>
> In my environment, (0xFF<<24) is 0xFFFFFFFFFF000000, not 0xFF000000.
> It seems the filter should be (((uint64) 1<<32) | ((uint64)
> 0xFF<<24)).

Ugh, sign extension, brain fade on my part. Thanks, I'm glad there was a
straightforward explanation.

--
John Naylor
EDB: http://www.enterprisedb.com


Re: allowing for control over SET ROLE

2022-11-18 Thread Michael Paquier
On Fri, Nov 18, 2022 at 04:19:15PM -0500, Robert Haas wrote:
> Fixed that, and the other mistake Álvaro spotted, and also bumped
> catversion because I forgot that earlier.

I was looking at this code yesterday, to see today that psql's
completion should be completed with this new clause, similary to ADMIN
and INHERIT.
--
Michael
diff --git a/src/bin/psql/tab-complete.c b/src/bin/psql/tab-complete.c
index 13014f074f..d7f90b657a 100644
--- a/src/bin/psql/tab-complete.c
+++ b/src/bin/psql/tab-complete.c
@@ -3773,7 +3773,7 @@ psql_completion(const char *text, int start, int end)
  */
 	/* Complete GRANT/REVOKE with a list of roles and privileges */
 	else if (TailMatches("GRANT|REVOKE") ||
-			 TailMatches("REVOKE", "ADMIN|GRANT|INHERIT", "OPTION", "FOR"))
+			 TailMatches("REVOKE", "ADMIN|GRANT|INHERIT|SET", "OPTION", "FOR"))
 	{
 		/*
 		 * With ALTER DEFAULT PRIVILEGES, restrict completion to grantable
@@ -3791,10 +3791,11 @@ psql_completion(const char *text, int start, int end)
 	 Privilege_options_of_grant_and_revoke,
 	 "GRANT OPTION FOR",
 	 "ADMIN OPTION FOR",
-	 "INHERIT OPTION FOR");
+	 "INHERIT OPTION FOR",
+	 "SET OPTION FOR");
 		else if (TailMatches("REVOKE", "GRANT", "OPTION", "FOR"))
 			COMPLETE_WITH(Privilege_options_of_grant_and_revoke);
-		else if (TailMatches("REVOKE", "ADMIN|INHERIT", "OPTION", "FOR"))
+		else if (TailMatches("REVOKE", "ADMIN|INHERIT|SET", "OPTION", "FOR"))
 			COMPLETE_WITH_QUERY(Query_for_list_of_roles);
 	}
 
@@ -3802,10 +3803,12 @@ psql_completion(const char *text, int start, int end)
 			 TailMatches("REVOKE", "GRANT", "OPTION", "FOR", "ALTER"))
 		COMPLETE_WITH("SYSTEM");
 
-	else if (TailMatches("GRANT|REVOKE", "SET") ||
-			 TailMatches("REVOKE", "GRANT", "OPTION", "FOR", "SET") ||
+	else if (TailMatches("REVOKE", "SET"))
+		COMPLETE_WITH("ON PARAMETER", "OPTION FOR");
+	else if (TailMatches("GRANT", "SET") ||
 			 TailMatches("GRANT|REVOKE", "ALTER", "SYSTEM") ||
-			 TailMatches("REVOKE", "GRANT", "OPTION", "FOR", "ALTER", "SYSTEM"))
+			 TailMatches("REVOKE", "GRANT", "OPTION", "FOR", "ALTER", "SYSTEM") ||
+			 TailMatches("REVOKE", "GRANT", "OPTION", "FOR", "SET"))
 		COMPLETE_WITH("ON PARAMETER");
 
 	else if (TailMatches("GRANT|REVOKE", MatchAny, "ON", "PARAMETER") ||
@@ -3941,14 +3944,16 @@ psql_completion(const char *text, int start, int end)
 	else if (HeadMatches("GRANT") && TailMatches("TO", MatchAny))
 		COMPLETE_WITH("WITH ADMIN",
 	  "WITH INHERIT",
+	  "WITH SET",
 	  "WITH GRANT OPTION",
 	  "GRANTED BY");
 	else if (HeadMatches("GRANT") && TailMatches("TO", MatchAny, "WITH"))
 		COMPLETE_WITH("ADMIN",
 	  "INHERIT",
+	  "SET",
 	  "GRANT OPTION");
 	else if (HeadMatches("GRANT") &&
-			 (TailMatches("TO", MatchAny, "WITH", "ADMIN|INHERIT")))
+			 (TailMatches("TO", MatchAny, "WITH", "ADMIN|INHERIT|SET")))
 		COMPLETE_WITH("OPTION", "TRUE", "FALSE");
 	else if (HeadMatches("GRANT") && TailMatches("TO", MatchAny, "WITH", MatchAny, "OPTION"))
 		COMPLETE_WITH("GRANTED BY");


signature.asc
Description: PGP signature


Re: allowing for control over SET ROLE

2022-11-18 Thread Erik Rijkers

Op 18-11-2022 om 22:19 schreef Robert Haas:

On Fri, Nov 18, 2022 at 1:50 PM Erik Rijkers  wrote:

In grant.sgml,

'actualy permisions'

looks a bit unorthodox.


Fixed that, and the other mistake Álvaro spotted, and also bumped
catversion because I forgot that earlier.


Sorry to be nagging but

  'permisions'  should be
  'permissions'

as well.


And as I'm nagging anyway: I also wondered whether the word order could 
improve:


- Word order as it stands:
However, the actual permissions conferred depend on the options 
associated with the grant.


-- maybe better:
However, the permissions actually conferred depend on the options 
associated with the grant.


But I'm not sure.


Thanks,

Erik

Thanks,

Erik




More efficient build farm animal wakeup?

2022-11-18 Thread Thomas Munro
Hi,

Is there a way to find out about new git commits that is more
efficient and timely than running N git fetches or whatever every
minute in a cron job?  Maybe some kind of long polling where you send
an HTTP request that says "I think the tips of branches x, y, z are at
111, 222, 333" and the server responds when that ceases to be true?




Re: test/modules/test_oat_hooks vs. debug_discard_caches=1

2022-11-18 Thread Tom Lane
Andres Freund  writes:
> On 2022-11-18 15:55:34 -0500, Tom Lane wrote:
>> We realized today [1] that it's been some time since the buildfarm
>> had any debug_discard_caches (nee CLOBBER_CACHE_ALWAYS) coverage.

> Do we know when it was covered last? I assume it's before the addition of
> test_oat_hooks in 90efa2f5565?

As far as that goes: some digging in the buildfarm DB says that avocet
last did a CCA run on 2021-10-22 and trilobite on 2021-10-24.  They
were then offline completely until 2022-02-10, and when they restarted
the runtimes were way too short to be CCA tests.

Seems like maybe we need a little more redundancy in this bunch of
buildfarm animals.

regards, tom lane




Re: Assertion failure in SnapBuildInitialSnapshot()

2022-11-18 Thread Andres Freund
Hi,

On 2022-11-18 11:20:36 +0530, Amit Kapila wrote:
> Okay, updated the patch accordingly.

Assuming it passes tests etc, this'd work for me.

- Andres




Re: test/modules/test_oat_hooks vs. debug_discard_caches=1

2022-11-18 Thread Tom Lane
Andres Freund  writes:
> On 2022-11-18 15:55:34 -0500, Tom Lane wrote:
>> The test_oat_hooks test is failing --- it's not crashing, but
>> it's emitting more NOTICE lines than the expected output includes,
>> evidently as a result of the hooks getting invoked extra times
>> during cache reloads.  I can reproduce that here.

> Did you already look into where those additional namespace searches are coming
> from? There are case in which it is not unproblematic to have repeated
> namespace searches due to the potential for races it opens up...

I'm not sufficiently interested in that API to dig hard for details,
but in a first look it seemed like the extra reports were coming
from repeated executions of recomputeNamespacePath, which are
forced after a cache invalidation by NamespaceCallback.

regards, tom lane




Re: Allow placeholders in ALTER ROLE w/o superuser

2022-11-18 Thread Alexander Korotkov
On Sat, Nov 19, 2022 at 12:41 AM Tom Lane  wrote:
> ... BTW, re-reading the commit message for a0ffa885e:
>
> One caveat is that PGC_USERSET GUCs are unaffected by the SET privilege
> --- one could wish that those were handled by a revocable grant to
> PUBLIC, but they are not, because we couldn't make it robust enough
> for GUCs defined by extensions.
>
> it suddenly struck me to wonder if the later 13d838815 changed the
> situation enough to allow revisiting that problem, and/or if storing
> the source role's OID in pg_db_role_setting would help.
>
> I don't immediately recall all the problems that led us to leave USERSET
> GUCs out of the feature, so maybe this is nuts; but maybe it isn't.
> It'd be worth considering if we're trying to improve matters here.

I think if we implement the user-visible USERSET flag for ALTER ROLE,
then we might just check permissions for such parameters from the
target role.

--
Regards,
Alexander Korotkov




Re: Allow placeholders in ALTER ROLE w/o superuser

2022-11-18 Thread Alexander Korotkov
On Sat, Nov 19, 2022 at 12:33 AM Tom Lane  wrote:
> Alexander Korotkov  writes:
> > This makes sense.  But do we really need to store the OID of the role?
> >  validate_option_array_item() already checks if the placeholder option
> > passes validation for PGC_SUSET.  So, we can just save a flag
> > indicating that this check was not successful.  If so, then the value
> > stored can be only used for PGC_USERSET.  Do you think this would be
> > correct?
>
> Meh ... doesn't seem like much of an improvement.  You still need
> to store something that's not there now.

Yes, but it wouldn't be needed to track dependencies of pg_role
mentions in pg_db_role_setting.  That seems to be a significant
simplification.

> This also seems to require
> some shaky assumptions about decisions having been made when storing
> still being valid later on.  Given the possibility of granting or
> revoking permissions for SET, I think we don't really want it to act
> that way.

Yes, it might be shaky.  Consider user sets parameter
pg_db_role_setting, and that appears to be capable only for
PGC_USERSET.  Next this user gets the SET permissions.  Then this
parameter needs to be set again in order for the new permission to
take effect.

But consider the other side.  How should we handle stored OID of a
role?  Should the privilege checking be moved from "set time" to "run
time"?  Therefore, revoking SET permission from role may affect
existing parameters in pg_db_role_setting.  It feels like revoke of
SET permission also aborts changes previously made with that
permission.  This is not how we normally do, and that seems confusing.

I think if we implement the flag and make it user-visible, e.g.
implement something like "ALTER ROLE ... SET ... USERSET;", then it
might be the lesser confusing option.

Thoughts?

--
Regards,
Alexander Korotkov




Re: test/modules/test_oat_hooks vs. debug_discard_caches=1

2022-11-18 Thread Andres Freund
Hi,

On 2022-11-18 15:55:34 -0500, Tom Lane wrote:
> We realized today [1] that it's been some time since the buildfarm
> had any debug_discard_caches (nee CLOBBER_CACHE_ALWAYS) coverage.

Do we know when it was covered last? I assume it's before the addition of
test_oat_hooks in 90efa2f5565?


> Sure enough, as soon as Tomas turned that back on, kaboom [2].
> The test_oat_hooks test is failing --- it's not crashing, but
> it's emitting more NOTICE lines than the expected output includes,
> evidently as a result of the hooks getting invoked extra times
> during cache reloads.  I can reproduce that here.

Did you already look into where those additional namespace searches are coming
from? There are cases in which it is not unproblematic to have repeated
namespace searches due to the potential for races it opens up...

Greetings,

Andres Freund




Re: Allow placeholders in ALTER ROLE w/o superuser

2022-11-18 Thread Tom Lane
... BTW, re-reading the commit message for a0ffa885e:

One caveat is that PGC_USERSET GUCs are unaffected by the SET privilege
--- one could wish that those were handled by a revocable grant to
PUBLIC, but they are not, because we couldn't make it robust enough
for GUCs defined by extensions.

it suddenly struck me to wonder if the later 13d838815 changed the
situation enough to allow revisiting that problem, and/or if storing
the source role's OID in pg_db_role_setting would help.

I don't immediately recall all the problems that led us to leave USERSET
GUCs out of the feature, so maybe this is nuts; but maybe it isn't.
It'd be worth considering if we're trying to improve matters here.

regards, tom lane




Re: fix stats_fetch_consistency value in postgresql.conf.sample

2022-11-18 Thread Justin Pryzby
@cfbot: re-rebased again
>From e1f0940e7682052b2ec9d58c361f56630aa1742e Mon Sep 17 00:00:00 2001
From: Justin Pryzby 
Date: Tue, 19 Jul 2022 08:31:56 -0500
Subject: [PATCH 1/2] pg_settings_get_flags(): add DEFAULT_COMPILE and
 DEFAULT_INITDB ..

for settings which vary by ./configure or platform or initdb

Note that this is independent from PGC_S_DYNAMIC_DEFAULT.
---
 doc/src/sgml/func.sgml  | 15 ++
 src/backend/utils/misc/guc_funcs.c  |  6 ++-
 src/backend/utils/misc/guc_tables.c | 76 ++---
 src/include/utils/guc.h |  2 +
 4 files changed, 69 insertions(+), 30 deletions(-)

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 82fba48d5f7..67c68fc9292 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -24377,6 +24377,21 @@ SELECT collation for ('foo' COLLATE "de_DE");
  FlagDescription
 
 
+
+ 
+  DEFAULT_COMPILE
+  Parameters with this flag have default values which vary by
+  platform, or compile-time options.
+  
+ 
+
+ 
+  DEFAULT_INITDB
+  Parameters with this flag have default values which are
+  determined dynamically during initdb.
+  
+ 
+
  
   EXPLAIN
   Parameters with this flag are included in
diff --git a/src/backend/utils/misc/guc_funcs.c b/src/backend/utils/misc/guc_funcs.c
index 108b3bd1290..2b666e8d014 100644
--- a/src/backend/utils/misc/guc_funcs.c
+++ b/src/backend/utils/misc/guc_funcs.c
@@ -538,7 +538,7 @@ ShowAllGUCConfig(DestReceiver *dest)
 Datum
 pg_settings_get_flags(PG_FUNCTION_ARGS)
 {
-#define MAX_GUC_FLAGS	6
+#define MAX_GUC_FLAGS	8
 	char	   *varname = TextDatumGetCString(PG_GETARG_DATUM(0));
 	struct config_generic *record;
 	int			cnt = 0;
@@ -551,6 +551,10 @@ pg_settings_get_flags(PG_FUNCTION_ARGS)
 	if (record == NULL)
 		PG_RETURN_NULL();
 
+	if (record->flags & GUC_DEFAULT_COMPILE)
+		flags[cnt++] = CStringGetTextDatum("DEFAULT_COMPILE");
+	if (record->flags & GUC_DEFAULT_INITDB)
+		flags[cnt++] = CStringGetTextDatum("DEFAULT_INITDB");
 	if (record->flags & GUC_EXPLAIN)
 		flags[cnt++] = CStringGetTextDatum("EXPLAIN");
 	if (record->flags & GUC_NO_RESET)
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index e5cc4e3205a..6bd206c164a 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -1205,7 +1205,7 @@ struct config_bool ConfigureNamesBool[] =
 		{"debug_assertions", PGC_INTERNAL, PRESET_OPTIONS,
 			gettext_noop("Shows whether the running server has assertion checks enabled."),
 			NULL,
-			GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE
+			GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE | GUC_DEFAULT_COMPILE
 		},
 		&assert_enabled,
 		DEFAULT_ASSERT_ENABLED,
@@ -1377,7 +1377,8 @@ struct config_bool ConfigureNamesBool[] =
 	{
 		{"update_process_title", PGC_SUSET, PROCESS_TITLE,
 			gettext_noop("Updates the process title to show the active SQL command."),
-			gettext_noop("Enables updating of the process title every time a new SQL command is received by the server.")
+			gettext_noop("Enables updating of the process title every time a new SQL command is received by the server."),
+			GUC_DEFAULT_COMPILE
 		},
 		&update_process_title,
 		DEFAULT_UPDATE_PROCESS_TITLE,
@@ -2122,7 +2123,8 @@ struct config_int ConfigureNamesInt[] =
 	{
 		{"max_connections", PGC_POSTMASTER, CONN_AUTH_SETTINGS,
 			gettext_noop("Sets the maximum number of concurrent connections."),
-			NULL
+			NULL,
+			GUC_DEFAULT_INITDB
 		},
 		&MaxConnections,
 		100, 1, MAX_BACKENDS,
@@ -2159,7 +2161,7 @@ struct config_int ConfigureNamesInt[] =
 		{"shared_buffers", PGC_POSTMASTER, RESOURCES_MEM,
 			gettext_noop("Sets the number of shared memory buffers used by the server."),
 			NULL,
-			GUC_UNIT_BLOCKS
+			GUC_UNIT_BLOCKS | GUC_DEFAULT_INITDB
 		},
 		&NBuffers,
 		16384, 16, INT_MAX / 2,
@@ -2202,7 +2204,8 @@ struct config_int ConfigureNamesInt[] =
 	{
 		{"port", PGC_POSTMASTER, CONN_AUTH_SETTINGS,
 			gettext_noop("Sets the TCP port the server listens on."),
-			NULL
+			NULL,
+			GUC_DEFAULT_COMPILE
 		},
 		&PostPortNumber,
 		DEF_PGPORT, 1, 65535,
@@ -2231,7 +2234,8 @@ struct config_int ConfigureNamesInt[] =
 		 "to be a numeric mode specification in the form "
 		 "accepted by the chmod and umask system calls. "
 		 "(To use the customary octal format the number must "
-		 "start with a 0 (zero).)")
+		 "start with a 0 (zero).)"),
+			GUC_DEFAULT_INITDB
 		},
 		&Log_file_mode,
 		0600, 0000, 0777,
@@ -2673,7 +2677,7 @@ struct config_int ConfigureNamesInt[] =
 		{"checkpoint_flush_after", PGC_SIGHUP, WAL_CHECKPOINTS,
 			gettext_noop("Number of pages after which previously performed writes are flushed to disk."),
 			NULL,
-			GUC_UNIT_BLOCKS
+			GUC_UNIT_BLOCKS | GUC_DEFAULT_COMPILE
 		},
 		&checkpoint_flush_after,
 		DEFAULT_CHECKPOINT_FLUSH_AFTER, 0, WRITEBACK_MAX_PENDING_FLUSHES,
@@ -2891,7 +2895,7 @@ struct config_int ConfigureNamesInt[] =
 		

Re: Allow placeholders in ALTER ROLE w/o superuser

2022-11-18 Thread Tom Lane
Alexander Korotkov  writes:
> This makes sense.  But do we really need to store the OID of the role?
>  validate_option_array_item() already checks if the placeholder option
> passes validation for PGC_SUSET.  So, we can just save a flag
> indicating that this check was not successful.  If so, then the value
> stored can be only used for PGC_USERSET.  Do you think this would be
> correct?

Meh ... doesn't seem like much of an improvement.  You still need
to store something that's not there now.  This also seems to require
some shaky assumptions about decisions having been made when storing
still being valid later on.  Given the possibility of granting or
revoking permissions for SET, I think we don't really want it to act
that way.

regards, tom lane




Re: Allow placeholders in ALTER ROLE w/o superuser

2022-11-18 Thread Alexander Korotkov
Hi!

I'd like to resume this discussion.

On Wed, Jul 20, 2022 at 6:50 PM Tom Lane  wrote:
> Kyotaro Horiguchi  writes:
> > At Tue, 19 Jul 2022 09:53:39 -0700, Nathan Bossart 
> >  wrote in
> >> Hm.  I would expect ALTER ROLE to store the PGC_SUSET context when executed
> >> by a superuser or a role with privileges via pg_parameter_acl.  Storing all
> >> placeholder GUC settings as PGC_USERSET would make things more restrictive
> >> than they are today.  For example, it would no longer be possible to apply
> >> any ALTER ROLE settings from superusers for placeholders that later become
> >> custom GUCS.
>
> > Returning to the topic, that operation can be allowed in PG15, having
> > being granted by superuser using the GRANT SET ON PARMETER command.
>
> I think that 13d838815 has completely changed the terms that this
> discussion needs to be conducted under.  It seems clear to me now
> that if you want to relax this only-superusers restriction, what
> you have to do is store the OID of the role that issued ALTER ROLE/DB SET,
> and then apply the same checks that would be used in the ordinary case
> where a placeholder is being filled in after being set intra-session.
> That is, we'd no longer assume that a value coming from pg_db_role_setting
> was set with superuser privileges, but we'd know exactly who did set it.

This makes sense.  But do we really need to store the OID of the role?
 validate_option_array_item() already checks if the placeholder option
passes validation for PGC_SUSET.  So, we can just save a flag
indicating that this check was not successful.  If so, then the value
stored can be only used for PGC_USERSET.  Do you think this would be
correct?

--
Regards,
Alexander Korotkov




Re: allowing for control over SET ROLE

2022-11-18 Thread Robert Haas
On Fri, Nov 18, 2022 at 1:50 PM Erik Rijkers  wrote:
> In grant.sgml,
>
>'actualy permisions'
>
> looks a bit unorthodox.

Fixed that, and the other mistake Álvaro spotted, and also bumped
catversion because I forgot that earlier.

-- 
Robert Haas
EDB: http://www.enterprisedb.com




test/modules/test_oat_hooks vs. debug_discard_caches=1

2022-11-18 Thread Tom Lane
We realized today [1] that it's been some time since the buildfarm
had any debug_discard_caches (nee CLOBBER_CACHE_ALWAYS) coverage.
Sure enough, as soon as Tomas turned that back on, kaboom [2].
The test_oat_hooks test is failing --- it's not crashing, but
it's emitting more NOTICE lines than the expected output includes,
evidently as a result of the hooks getting invoked extra times
during cache reloads.  I can reproduce that here.

Maybe it was a poor design that these hooks were placed someplace
that's sensitive to that.  I dunno.  The only short-term solution
I can think of is to force debug_discard_caches to 0 within that
test script, which is annoying but feasible (since that module
only exists in v15+).

Thoughts, other proposals?

regards, tom lane

[1] 
https://www.postgresql.org/message-id/6b52e783-1b32-e723-4311-0e433a5a5a75%40enterprisedb.com
[2] 
https://buildfarm.postgresql.org/cgi-bin/show_log.pl?nm=avocet&dt=2022-11-18%2016%3A01%3A43




Re: Reducing power consumption on idle servers

2022-11-18 Thread Thomas Munro
On Sat, Nov 19, 2022 at 7:54 AM Simon Riggs
 wrote:
> I agree. I can't see a reason to keep it anymore.

+Use of promote_trigger_file is deprecated. If you're

I think 'deprecated' usually implies that it still works but you
should avoid it.  I think you need something stronger.

> I'm nervous about not having any wakeup at all, but since we are
> removing the parameter there is no other reason not to do as Andres
> suggests.

Why?  If we're accidentally relying on this timeout for recovery to
not hang in some situation, that's a bug waiting to be discovered and
fixed and it won't be this patch's fault.

> New version attached, which assumes that the SIGALRMs are silenced on
> the other thread.

I tested this + Bharath's v5 from the other thread.  meson test
passes, and tracing the recovery process shows that it is indeed,
finally, completely idle.  Huzzah!




Re: New docs chapter on Transaction Management and related changes

2022-11-18 Thread Bruce Momjian
On Fri, Nov 18, 2022 at 02:33:26PM -0500, Bruce Momjian wrote:
> On Sun, Nov 13, 2022 at 12:56:30PM +0100, Laurenz Albe wrote:
> > > Maybe there's a way to reword the entire phrase that leads to a better
> > > formulation of the idea.
> > 
> > Any of your auggestions is better than "unaborted".
> > 
> > How about:
> > 
> >   If AND CHAIN is specified, a new transaction is
> >   immediately started with the same transaction characteristics (see <xref
> >   linkend="sql-set-transaction"/>) as the just finished one.
> >   This new transaction won't be in the aborted state, even
> >   if the old transaction was aborted.
> 
> I think I am going to keep "(unaborted)".

Attached is the most current version of the patch.

-- 
  Bruce Momjian  https://momjian.us
  EDB  https://enterprisedb.com

  Indecision is a decision.  Inaction is an action.  Mark Batterson

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index bd50ea8e48..ef3982f11a 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -7229,12 +7229,14 @@ local0.*/var/log/postgresql
 
 
  %v
- Virtual transaction ID (backendID/localXID)
+ Virtual transaction ID (backendID/localXID);  see
+ 
  no
 
 
  %x
- Transaction ID (0 if none is assigned)
+ Transaction ID (0 if none is assigned);  see
+ 
  no
 
 
diff --git a/doc/src/sgml/datatype.sgml b/doc/src/sgml/datatype.sgml
index b030b36002..fdffba4442 100644
--- a/doc/src/sgml/datatype.sgml
+++ b/doc/src/sgml/datatype.sgml
@@ -4992,7 +4992,8 @@ WHERE ...
 xmin and xmax.  Transaction identifiers are 32-bit quantities.
 In some contexts, a 64-bit variant xid8 is used.  Unlike
 xid values, xid8 values increase strictly
-monotonically and cannot be reused in the lifetime of a database cluster.
+monotonically and cannot be reused in the lifetime of a database
+cluster.  See  for more details.

 

diff --git a/doc/src/sgml/filelist.sgml b/doc/src/sgml/filelist.sgml
index de450cd661..0d6be9a2fa 100644
--- a/doc/src/sgml/filelist.sgml
+++ b/doc/src/sgml/filelist.sgml
@@ -104,6 +104,7 @@
 
 
 
+
 
 
 
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 6e0425cb3d..38ff0c82e3 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -24673,7 +24673,10 @@ SELECT collation for ('foo' COLLATE "de_DE");

 Returns the current transaction's ID.  It will assign a new one if the
 current transaction does not have one already (because it has not
-performed any database updates).
+performed any database updates);  see  for details.  If executed in a
+subtransaction, this will return the top-level transaction ID;
+see  for details.

   
 
@@ -24690,6 +24693,8 @@ SELECT collation for ('foo' COLLATE "de_DE");
 ID is assigned yet.  (It's best to use this variant if the transaction
 might otherwise be read-only, to avoid unnecessary consumption of an
 XID.)
+If executed in a subtransaction, this will return the top-level
+transaction ID.

   
 
@@ -24733,6 +24738,9 @@ SELECT collation for ('foo' COLLATE "de_DE");

 Returns a current snapshot, a data structure
 showing which transaction IDs are now in-progress.
Only top-level transaction IDs are included in the snapshot;
subtransaction IDs are not shown;  see 
+for details.

   
 
@@ -24787,7 +24795,8 @@ SELECT collation for ('foo' COLLATE "de_DE");
 Is the given transaction ID visible according
 to this snapshot (that is, was it completed before the snapshot was
 taken)?  Note that this function will not give the correct answer for
-a subtransaction ID.
+a subtransaction ID (subxid);  see  for
+details.

   
  
@@ -24799,8 +24808,9 @@ SELECT collation for ('foo' COLLATE "de_DE");
 wraps around every 4 billion transactions.  However,
 the functions shown in  use a
 64-bit type xid8 that does not wrap around during the life
-of an installation, and can be converted to xid by casting if
-required.  The data type pg_snapshot stores information about
+of an installation and can be converted to xid by casting if
+required;  see  for details. 
+The data type pg_snapshot stores information about
 transaction ID visibility at a particular moment in time.  Its components
 are described in .
 pg_snapshot's textual representation is
@@ -24846,7 +24856,7 @@ SELECT collation for ('foo' COLLATE "de_DE");
 xmax and not in this list was already completed at the time
 of the snapshot, and thus is either visible or dead according to its
 commit status.  This list does not include 

Re: New docs chapter on Transaction Management and related changes

2022-11-18 Thread Bruce Momjian
On Tue, Nov 15, 2022 at 10:16:44AM +, Simon Riggs wrote:
> On Tue, 8 Nov 2022 at 03:41, Bruce Momjian  wrote:
> >
> > On Mon, Nov  7, 2022 at 10:58:05AM +, Simon Riggs wrote:
> > > What I've posted is the merged patch, i.e. your latest patch, plus
> > > changes to RELEASE SAVEPOINT from you on Oct 16, plus changes based on
> > > the later comments from Robert and I.
> >
> > Thanks.  I have two changes to your patch.  First, I agree "destroy" is
> > the wrong word for this, but I don't think "subcommit" is good, for
> > three reasons:
> >
> > 1. Release merges the non-aborted changes into the previous transaction
> > _and_ frees their resources --- "subcommit" doesn't have both meanings,
> > which I think means if we need a single word, we should use "release"
> > and later define what that means.
> >
> > 2. The "subcommit" concept doesn't closely match the user-visible
> > behavior, even though we use subtransactions to accomplish this. Release
> > is more of a rollup/merge into the previously-active
> > transaction/savepoint.
> >
> > 3. "subcommit" is an implementation detail that I don't think we should
> > expose to users in the manual pages.
> 
> I don't understand this - you seem to be presuming that "subcommit"
> means something different and then objecting to that difference.
> 
> For me, "Subcommit" exactly matches what is happening because the code
> comments and details already use Subcommit in exactly this way.
> 
> The main purpose of this patch is to talk about what is happening
> using the same language as we do in the code. The gap between the code
> and the docs isn't helping anyone.

I didn't think that was the purpose, and certainly not in the
reference/ref/man pages.  I thought the purpose was to explain the
behavior clearly, and in the "Internals" section, the internal API we
expose to users.  I didn't think matching the code was ever a goal --- I
thought that is what the README files are for.

-- 
  Bruce Momjian  https://momjian.us
  EDB  https://enterprisedb.com

  Indecision is a decision.  Inaction is an action.  Mark Batterson





Re: New docs chapter on Transaction Management and related changes

2022-11-18 Thread Bruce Momjian
On Thu, Nov 10, 2022 at 11:31:25AM +, Simon Riggs wrote:
> > That's not true.  Transactions waiting for table-level locks are shown
> > waiting for a "relation" lock in both "pg_stat_activity" and "pg_locks".
> 
> Agreed.
> 
> Lock waits on table-locks are shown waiting for a lock type of
> relation,
> while lock waits on row-level locks are shown waiting for a lock type
> of transactionid.
> Table-level locks require only a virtualxid when the lock is less than an
> AccessExclusiveLock; in other cases an xid must be allocated.

Yeah, I went with more generic wording since the point seems to be that
sometimes xid and sometimes vxids are waited on.

-- 
  Bruce Momjian  https://momjian.us
  EDB  https://enterprisedb.com

  Indecision is a decision.  Inaction is an action.  Mark Batterson





Re: New docs chapter on Transaction Management and related changes

2022-11-18 Thread Bruce Momjian
On Sun, Nov 13, 2022 at 12:56:30PM +0100, Laurenz Albe wrote:
> > Maybe there's a way to reword the entire phrase that leads to a better
> > formulation of the idea.
> 
> Any of your suggestions is better than "unaborted".
> 
> How about:
> 
>   If AND CHAIN is specified, a new transaction is
>   immediately started with the same transaction characteristics (seelinkend="sql-set-transaction"/>) as the just finished one.
>   This new transaction won't be in the aborted state, even
>   if the old transaction was aborted.

I think I am going to keep "(unaborted)".

-- 
  Bruce Momjian  https://momjian.us
  EDB  https://enterprisedb.com

  Indecision is a decision.  Inaction is an action.  Mark Batterson





Re: New docs chapter on Transaction Management and related changes

2022-11-18 Thread Bruce Momjian
On Thu, Nov 10, 2022 at 08:39:29AM +0100, Laurenz Albe wrote:
> > > I don't think that is an improvement.  "Unaborted" is an un-word.  A new 
> > > transaction
> > > is always "unaborted", isn't it?
> > 
> > I thought about this as well when reviewing it, but I do think
> > something is needed for the case where you have a transaction which
> > has suffered an error and then you issue "rollback and chain"; if you
> > just say "a new transaction is immediately started with the same
> > transaction characteristics" it might imply to some the new
> > transaction has some kind of carry over of the previous broken
> > transaction... the use of the word unaborted makes it clear that the
> > new transaction is 100% functional.
> 
> A new transaction is never aborted in my understanding.  Being aborted is not 
> a
> characteristic of a transaction, but a state.

I used "(unaborted)", which seems to be a compromise.

> > > > +    The internal transaction ID type xid is 32-bits wide
> > > 
> > > There should be no hyphen in "32 bits wide", just as in "3 years old".
> > 
> > Minor aside, we should clean up glossary.sgml as well.
> 
> Right, it has this:
> 
>  The numerical, unique, sequentially-assigned identifier that each
>  transaction receives when it first causes a database modification.
>  Frequently abbreviated as xid.
>  When stored on disk, xids are only 32-bits wide, so only
>  approximately four billion write transaction IDs can be generated;
>  to permit the system to run for longer than that,
>  epochs are used, also 32 bits wide.
> 
> Which reminds me that I should have suggested  rather than
>  where I complained about the use of .

I changed them to "firstterm".

-- 
  Bruce Momjian  https://momjian.us
  EDB  https://enterprisedb.com

  Indecision is a decision.  Inaction is an action.  Mark Batterson





Re: New docs chapter on Transaction Management and related changes

2022-11-18 Thread Bruce Momjian
On Wed, Nov  9, 2022 at 09:16:18AM -0500, Robert Treat wrote:
> > "Subtransactions of the named savepoint" is somewhat confusing; how about
> > "subtransactions of the subtransaction established by the named savepoint"?
> >
> > If that is too long and explicit, perhaps "subtransactions of that 
> > subtransaction".
> >
> 
> Personally, I think these are more confusing.

That text is gone.

> > > --- a/doc/src/sgml/ref/rollback.sgml
> > > +++ b/doc/src/sgml/ref/rollback.sgml
> > > @@ -56,11 +56,14 @@ ROLLBACK [ WORK | TRANSACTION ] [ AND [ NO ] CHAIN ]
> > >  AND CHAIN
> > >  
> > >   
> > > -  If AND CHAIN is specified, a new transaction is
> > > +  If AND CHAIN is specified, a new unaborted 
> > > transaction is
> > >immediately started with the same transaction characteristics (see 
> > >  > >linkend="sql-set-transaction"/>) as the just finished one.  
> > > Otherwise,
> > >no new transaction is started.
> >
> > I don't think that is an improvement.  "Unaborted" is an un-word.  A new 
> > transaction
> > is always "unaborted", isn't it?
> >
> 
> I thought about this as well when reviewing it, but I do think
> something is needed for the case where you have a transaction which
> has suffered an error and then you issue "rollback and chain"; if you
> just say "a new transaction is immediately started with the same
> transaction characteristics" it might imply to some the new
> transaction has some kind of carry over of the previous broken
> transaction... the use of the word unaborted makes it clear that the
> new transaction is 100% functional.

I changed it to:

  a new (unaborted) transaction is immediately started

> ISTR that you only use a comma before since in cases where the
> preceding thought contains a negative.
> 
> In any case, are you thinking something like this:
> 
> " 64 open subxids are cached in shared memory for each backend; after
>  that point the overhead increases significantly due to additional disk I/O
>  from looking up subxid entries in pg_subtrans."

I went with:

   Up to 64 open subxids are cached in shared memory for
   each backend; after that point, the storage I/O overhead increases
   significantly due to additional lookups of subxid entries in
   pg_subtrans.

New patch attached.

-- 
  Bruce Momjian  https://momjian.us
  EDB  https://enterprisedb.com

  Indecision is a decision.  Inaction is an action.  Mark Batterson

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index bd50ea8e48..ef3982f11a 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -7229,12 +7229,14 @@ local0.*/var/log/postgresql
 
 
  %v
- Virtual transaction ID (backendID/localXID)
+ Virtual transaction ID (backendID/localXID);  see
+ 
  no
 
 
  %x
- Transaction ID (0 if none is assigned)
+ Transaction ID (0 if none is assigned);  see
+ 
  no
 
 
diff --git a/doc/src/sgml/datatype.sgml b/doc/src/sgml/datatype.sgml
index b030b36002..fdffba4442 100644
--- a/doc/src/sgml/datatype.sgml
+++ b/doc/src/sgml/datatype.sgml
@@ -4992,7 +4992,8 @@ WHERE ...
 xmin and xmax.  Transaction identifiers are 32-bit quantities.
 In some contexts, a 64-bit variant xid8 is used.  Unlike
 xid values, xid8 values increase strictly
-monotonically and cannot be reused in the lifetime of a database cluster.
+monotonically and cannot be reused in the lifetime of a database
+cluster.  See  for more details.

 

diff --git a/doc/src/sgml/filelist.sgml b/doc/src/sgml/filelist.sgml
index de450cd661..0d6be9a2fa 100644
--- a/doc/src/sgml/filelist.sgml
+++ b/doc/src/sgml/filelist.sgml
@@ -104,6 +104,7 @@
 
 
 
+
 
 
 
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 6e0425cb3d..38ff0c82e3 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -24673,7 +24673,10 @@ SELECT collation for ('foo' COLLATE "de_DE");

 Returns the current transaction's ID.  It will assign a new one if the
 current transaction does not have one already (because it has not
-performed any database updates).
+performed any database updates);  see  for details.  If executed in a
+subtransaction, this will return the top-level transaction ID;
+see  for details.

   
 
@@ -24690,6 +24693,8 @@ SELECT collation for ('foo' COLLATE "de_DE");
 ID is assigned yet.  (It's best to use this variant if the transaction
 might otherwise be read-only, to avoid unnecessary consumption of an
 XID.)
+If executed in a subtransaction, this will return the top-level
+transaction ID.

   
 
@@ -24733,6 +24738,9 @@ SELECT collation for ('foo' COLLATE "de_DE");

 Returns a cu

Re: Collation version tracking for macOS

2022-11-18 Thread Thomas Munro
On Sat, Nov 19, 2022 at 7:38 AM Thomas Munro  wrote:
> On Tue, Nov 15, 2022 at 1:55 PM Jeff Davis  wrote:
> > I realize your patch is experimental, but when there is a better
> > consensus on the approach, we should consider adding declarative syntax
> > such as:
> >
> >CREATE COLLATION (or LOCALE?) PROVIDER icu67
> >  TYPE icu VERSION '67' AS '/path/to/icui18n.so.67';
> >
> > It will offer more opportunities to catch errors early and offer better
> > error messages. It would also enable it to function if the library is
> > built with --disable-renaming (though we'd have to trust the user).
>
> Earlier in this and other threads, we wondered if each ICU major version 
> should
> be a separate provider, which is what you're showing there, or should be an
> independent property of an individual COLLATION, which is what v6 did with
> '63:en' and what Peter suggested I make more formal with CREATE COLLATION foo
> (..., ICU_VERSION=63).  I actually started out thinking we'd have multiple
> providers, but I couldn't really think of any advantage, and I think it makes
> some upgrade scenarios more painful.  Can you elaborate on why you'd want
> that model?

Hmm, thinking some more about this... I said the above thinking that
you couldn't change a provider after creating a database/collation.
But what if you could?

1.  CREATE DATABASE x LOCALE_PROVIDER=icu ...;
2.  Some time later after an upgrade, my postgres binary is linked
against a new ICU version and I start seeing warnings.
3.  ALTER DATABASE x LOCALE_PROVIDER=icu63;

I suppose you shouldn't be allowed to change libc -> icu, but you
could change icu -> icuXXX, or I guess icuXXX -> icuXXX.

What if you didn't have to manually manage the set of available
providers with DDL like you showed, but we just automatically
supported "icu" (= the linked ICU, whatever it might be), and icu50 up
to icuXXX where XXX is the linked ICU's version?  We can encode those
values + libc as an int, to replace the existing char the represents
providers in catalogues.

That's basically just a different way of encoding the same information
that Peter was suggesting I put in a new catalogue attribute.  How do
you like that bikeshed colour?




Re: New docs chapter on Transaction Management and related changes

2022-11-18 Thread Bruce Momjian
On Mon, Nov  7, 2022 at 11:04:46PM +0100, Laurenz Albe wrote:
> On Sat, 2022-11-05 at 10:08 +, Simon Riggs wrote:
> > Agreed; new compilation patch attached, including mine and then
> > Robert's suggested rewordings.
> 
> Thanks.  There is clearly a lot of usefule information in this.

Sorry again for the long delay in replying to this.

> Some comments:
> 
> > --- a/doc/src/sgml/func.sgml
> > +++ b/doc/src/sgml/func.sgml
> > @@ -24673,7 +24673,10 @@ SELECT collation for ('foo' COLLATE "de_DE");
> > 
> >  Returns the current transaction's ID.  It will assign a new one if 
> > the
> >  current transaction does not have one already (because it has not
> > -performed any database updates).
> > +performed any database updates);  see  > +linkend="transaction-id"/> for details.  If executed in a
> > +subtransaction this will return the top-level xid;  see  > +linkend="subxacts"/> for details.
> > 
> >
> 
> I would use a comma after "subtransaction", and I think it would be better to 
> write
> "transaction ID" instead of "xid".

Agreed.

> > @@ -24690,6 +24693,7 @@ SELECT collation for ('foo' COLLATE "de_DE");
> >  ID is assigned yet.  (It's best to use this variant if the 
> > transaction
> >  might otherwise be read-only, to avoid unnecessary consumption of 
> > an
> >  XID.)
> > +If executed in a subtransaction this will return the top-level xid.
> > 
> >
> 
> Same as above.

Agreed.

> > @@ -24733,6 +24737,8 @@ SELECT collation for ('foo' COLLATE "de_DE");
> > 
> >  Returns a current snapshot, a data structure
> >  showing which transaction IDs are now in-progress.
> > +Only top-level xids are included in the snapshot; subxids are not
> > +shown;  see  for details.
> > 
> >
> 
> Again, I would avoid "xid" and "subxid", or at least use "transaction ID 
> (xid)"
> and similar.

Done.

> > --- a/doc/src/sgml/ref/release_savepoint.sgml
> > +++ b/doc/src/sgml/ref/release_savepoint.sgml
> > @@ -34,23 +34,16 @@ RELEASE [ SAVEPOINT ] 
> > savepoint_name
> >Description
> >  
> >
> > -   RELEASE SAVEPOINT destroys a savepoint previously 
> > defined
> > -   in the current transaction.
> > +   RELEASE SAVEPOINT will subcommit the subtransaction
> > +   established by the named savepoint, if one exists. This will release
> > +   any resources held by the subtransaction. If there were any
> > +   subtransactions of the named savepoint, these will also be subcommitted.
> >
> > 
> >
> 
> "Subtransactions of the named savepoint" is somewhat confusing; how about
> "subtransactions of the subtransaction established by the named savepoint"?
> 
> If that is too long and explicit, perhaps "subtransactions of that 
> subtransaction".

This paragraph has been rewritten to:

   RELEASE SAVEPOINT releases the named savepoint and
   all active savepoints that were created after the named savepoint,
   and frees their resources.  All changes made since the creation of the
   savepoint, excluding rolled back savepoints changes, are merged into
   the transaction or savepoint that was active when the named savepoint
   was created.  Changes made after RELEASE SAVEPOINT
   will also be part of this active transaction or savepoint.

> > @@ -78,7 +71,7 @@ RELEASE [ SAVEPOINT ] 
> > savepoint_name
> > 
> >
> > It is not possible to release a savepoint when the transaction is in
> > -   an aborted state.
> > +   an aborted state, to do that use .
> >
> > 
> >
> 
> I think the following is more English:
> "It is not possible ... state; to do that, use ."

Changed to:

   It is not possible to release a savepoint when the transaction is in
   an aborted state; to do that, use .

> > --- a/doc/src/sgml/ref/rollback.sgml
> > +++ b/doc/src/sgml/ref/rollback.sgml
> > @@ -56,11 +56,14 @@ ROLLBACK [ WORK | TRANSACTION ] [ AND [ NO ] CHAIN ]
> >  AND CHAIN
> >  
> >   
> > -  If AND CHAIN is specified, a new transaction is
> > +  If AND CHAIN is specified, a new unaborted 
> > transaction is
> >immediately started with the same transaction characteristics (see 
> >  >linkend="sql-set-transaction"/>) as the just finished one.  
> > Otherwise,
> >no new transaction is started.
> 
> I don't think that is an improvement.  "Unaborted" is an un-word.  A new 
> transaction
> is always "unaborted", isn't it?

Agreed.

> > --- a/doc/src/sgml/wal.sgml
> > +++ b/doc/src/sgml/wal.sgml
> > @@ -909,4 +910,36 @@
> > seem to be a problem in practice.
> >
> >   
> > +
> > + 
> > +
> > +  Two-Phase Transactions
> > +
> > +  
> > +   PostgreSQL supports a two-phase commit (2PC)
> [...]
> > +   pg_twophase directory. Currently-prepared
> > +   transactions can be inspected using  > +   
> > linkend="view-pg-prepared-xacts">pg_prepared_xacts.
> > +  
> > + 
> > +
> >  
> 
> I don't like "currently-prepar

Re: MERGE regress test

2022-11-18 Thread Alvaro Herrera
On 2022-Nov-16, Alvaro Herrera wrote:

> Hmm, good find.  As I recall, I was opposed to the idea of throwing an
> error if the WHEN expression writes to the database, and the previous
> implementation had some hole, so I just ripped it out after discussing
> it; but I evidently failed to notice this test case about it.

Ah, I found out what happened, and my memory as usual is betraying me.
This was changed before I was involved with the patch at all: Pavan
changed it between his v18[1] and v19[2]:

if (action->whenqual)
{
-   int64 startWAL = GetXactWALBytes();
-   boolqual = ExecQual(action->whenqual, econtext);
-
-   /*
-* SQL Standard says that WHEN AND conditions must not
-* write to the database, so check we haven't written
-* any WAL during the test. Very sensible that is, since
-* we can end up evaluating some tests multiple times if
-* we have concurrent activity and complex WHEN clauses.
-*
-* XXX If we had some clear form of functional labelling
-* we could use that, if we trusted it.
-*/
-   if (startWAL < GetXactWALBytes())
-   ereport(ERROR,
-   (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-errmsg("cannot write to database within WHEN AND 
condition")));

This is what Peter Geoghegan had to say about it at the time:

> This needs to go. Apart from the fact that GetXactWALBytes() is buggy
> (it returns int64 for the unsigned type XLogRecPtr), the whole idea
> just seems unnecessary. I don't see why this is any different to using
> a volatile function in a regular UPDATE.

Pavan just forgot to remove the test.  I'll do so now.

[1] 
https://postgr.es/m/CABOikdPFCcgp7=zoN4M=y0tefw4q9dpau+oy5jn5a+hwydn...@mail.gmail.com
[2] 
https://postgr.es/m/caboikdouoaxt1h885tc_ca1loereejqdvdzqg62rqkizppy...@mail.gmail.com

-- 
Álvaro HerreraBreisgau, Deutschland  —  https://www.EnterpriseDB.com/
"No tengo por qué estar de acuerdo con lo que pienso"
 (Carlos Caszeli)




Re: Reducing power consumption on idle servers

2022-11-18 Thread Simon Riggs
On Thu, 17 Nov 2022 at 20:38, Robert Haas  wrote:
>
> On Thu, Nov 17, 2022 at 2:55 AM Simon Riggs
>  wrote:
> > No, it will have a direct effect only on people using promote_trigger_file
> > who do not read and act upon the deprecation notice before upgrading
> > by making a one line change to their failover scripts.
>
> TBH, I wonder if we shouldn't just nuke promote_trigger_file.
> pg_promote was added in 2018, and pg_ctl promote was added in 2011. So
> even if we haven't said promote_trigger_file was deprecated, it hasn't
> been the state-of-the-art way of doing this in a really long time. If
> we think that there are still a lot of people using it, or if popular
> tools are relying on it, then perhaps a deprecation period is
> warranted anyway. But I think we should at least consider the
> possibility that it's OK to just get rid of it right away.

I agree. I can't see a reason to keep it anymore.

I'm nervous about not having any wakeup at all, but since we are
removing the parameter there is no other reason not to do as Andres
suggests.

New version attached, which assumes that the SIGALRMs are silenced on
the other thread.

-- 
Simon Riggshttp://www.EnterpriseDB.com/


hibernate_startup.v8.patch
Description: Binary data


Re: allowing for control over SET ROLE

2022-11-18 Thread Erik Rijkers

Op 18-11-2022 om 19:43 schreef Robert Haas:

On Fri, Nov 18, 2022 at 12:50 PM Robert Haas  wrote:

Here's a rebased v3 to see what cfbot thinks.


cfbot is happy, so committed.


In grant.sgml,

  'actualy permisions'

looks a bit unorthodox.





Sending SIGABRT to child processes (was Re: Strange failure on mamba)

2022-11-18 Thread Tom Lane
Andres Freund  writes:
> On 2022-11-17 17:47:50 -0500, Tom Lane wrote:
>> So I'd like to have some way to make the postmaster send SIGABRT instead
>> of SIGKILL in the buildfarm environment.  The lowest-tech way would be
>> to drive that off some #define or other.  We could scale it up to a GUC
>> perhaps.  Adjacent to that, I also wonder whether SIGABRT wouldn't be
>> more useful than SIGSTOP for the existing SendStop half-a-feature ---
>> the idea that people should collect cores manually seems mighty
>> last-century.

> I suspect that having a GUC would be a good idea. I needed something similar
> recently, debugging an occasional hang in the AIO patchset. I first tried
> something like your #define approach and it did cause a problematic flood of
> core files.

Yeah, the main downside of such a thing is the risk of lots of core files
accumulating over repeated crashes.  Nonetheless, I think it'll be a
useful debugging aid.  Here's a proposed patch.  (I took the opportunity
to kill off the long-since-unimplemented Reinit switch, too.)

One thing I'm not too clear on is if we want to send SIGABRT to the child
groups (ie, SIGABRT grandchild processes too).  I made signal_child do
so here, but perhaps it's overkill.

regards, tom lane

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index bd50ea8e48..24b1624bad 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -11500,6 +11500,62 @@ LOG:  CleanUpLock: deleting: lock(0xb7acd844) id(24688,24696,0,0,0,1)
   
  
 
+ 
+  send_abort_for_crash (boolean)
+  
+   send_abort_for_crash configuration parameter
+  
+  
+  
+   
+By default, after a backend crash the postmaster will stop remaining
+child processes by sending them SIGQUIT
+signals, which permits them to exit more-or-less gracefully.  When
+this option is set to on,
+SIGABRT is sent instead.  That normally
+results in production of a core dump file for each such child
+process.
+This can be handy for investigating the states of other processes
+after a crash.  It can also consume lots of disk space in the event
+of repeated crashes, so do not enable this on systems you are not
+monitoring carefully.
+Beware that no support exists for cleaning up the core file(s)
+automatically.
+This parameter can only be set in
+the postgresql.conf file or on the server
+command line.
+   
+  
+ 
+
+ 
+  send_abort_for_kill (boolean)
+  
+   send_abort_for_kill configuration parameter
+  
+  
+  
+   
+By default, after attempting to stop a child process with
+SIGQUIT, the postmaster will wait five
+seconds and then send SIGKILL to force
+immediate termination.  When this option is set
+to on, SIGABRT is sent
+instead of SIGKILL.  That normally results
+in production of a core dump file for each such child process.
+This can be handy for investigating the states
+of stuck child processes.  It can also consume lots
+of disk space in the event of repeated crashes, so do not enable
+this on systems you are not monitoring carefully.
+Beware that no support exists for cleaning up the core file(s)
+automatically.
+This parameter can only be set in
+the postgresql.conf file or on the server
+command line.
+   
+  
+ 
+
 
   
   
diff --git a/doc/src/sgml/ref/postgres-ref.sgml b/doc/src/sgml/ref/postgres-ref.sgml
index 55a3f6c69d..b13a16a117 100644
--- a/doc/src/sgml/ref/postgres-ref.sgml
+++ b/doc/src/sgml/ref/postgres-ref.sgml
@@ -409,24 +409,6 @@ PostgreSQL documentation
   
  
 
- 
-  -n
-  
-   
-This option is for debugging problems that cause a server
-process to die abnormally.  The ordinary strategy in this
-situation is to notify all other server processes that they
-must terminate and then reinitialize the shared memory and
-semaphores.  This is because an errant server process could
-have corrupted some shared state before terminating.  This
-option specifies that postgres will
-not reinitialize shared data structures.  A knowledgeable
-system programmer can then use a debugger to examine shared
-memory and semaphore state.
-   
- 
-
-
  
   -O
   
@@ -466,14 +448,9 @@ PostgreSQL documentation
 This option is for debugging problems that cause a server
 process to die abnormally.  The ordinary strategy in this
 situation is to notify all other server processes that they
-must terminate and then reinitialize the shared memory and
-semaphores.  This is because an errant server process could
-have corrupted some shared state before termin

Re: allowing for control over SET ROLE

2022-11-18 Thread Robert Haas
On Fri, Nov 18, 2022 at 12:50 PM Robert Haas  wrote:
> Here's a rebased v3 to see what cfbot thinks.

cfbot is happy, so committed.

-- 
Robert Haas
EDB: http://www.enterprisedb.com




Re: allowing for control over SET ROLE

2022-11-18 Thread Alvaro Herrera
On 2022-Nov-18, Robert Haas wrote:

> On Thu, Nov 17, 2022 at 7:24 PM Jeff Davis  wrote:
> > But I'm fine if you'd like to move on with the SET ROLE privilege
> > instead, as long as we believe it grants a stable set of capabilities
> > (and conversely, that if the SET ROLE privilege is revoked, that it
> > revokes a stable set of capabilities).
> 
> OK.
> 
> Here's a rebased v3 to see what cfbot thinks.

I think this hunk in dumpRoleMembership() leaves an unwanted line
behind.

/*
-* Previous versions of PostgreSQL also did not have a grant-level
+* Previous versions of PostgreSQL also did not have grant-level options.
 * INHERIT option.
 */

(I was just looking at the doc part of this patch.)

-- 
Álvaro Herrera   48°01'N 7°57'E  —  https://www.EnterpriseDB.com/
"El hombre nunca sabe de lo que es capaz hasta que lo intenta" (C. Dickens)




Re: Collation version tracking for macOS

2022-11-18 Thread Thomas Munro
Replying to Peter and Jeff in one email.

On Sat, Nov 12, 2022 at 3:57 AM Peter Eisentraut
 wrote:
> On 22.10.22 03:22, Thomas Munro wrote:
> > I'd love to hear others' thoughts on how we can turn this into a
> > workable solution.  Hopefully while staying simple...
>
> I played with this patch a bit.  It looks like a reasonable approach.

Great news.

> Attached is a small patch to get the dynamic libicu* lookup working with
> the library naming on macOS.

Thanks, squashed.

> Instead of packing the ICU version into the locale field ('63:en'), I
> would make it a separate field in pg_collation and a separate argument
> in CREATE COLLATION.

I haven't tried this yet, as I focused on coming up with a way of testing in
this iteration.  I can try this next.  I'm imagining that we'd have
pg_collation.collicuversion and pg_database.daticuversion, and they'd default
to 0 for "use the GUC", and perhaps you'd even be able to ALTER them.  Perhaps
we wouldn't even need the GUC then...  0 could mean "the linked version", and
if you don't like it, you ALTER it.  Thinking about this.

> At this point, perhaps it would be good to start building some tests to
> demonstrate various upgrade scenarios and to ensure portability.

OK, here's what I came up with.  You enable it in PG_TEST_EXTRA, and
tell it about an alternative ICU version you have in the standard library
search path that is not the same as the main/linked one:

$ meson configure -DPG_TEST_EXTRA="icu=63"
$ meson test icu/020_multiversion

Another change from your feedback:  you mentioned that RHEL7 shipped with ICU
50, so I removed my suggestion of dropping some extra code we carry for
versions before 54 and set the minimum acceptable version to 50.  It probably
works further back than that, but that's a decent range, I think.

On Tue, Nov 15, 2022 at 1:55 PM Jeff Davis  wrote:
> I looked at v6.

Thanks for jumping in and testing!

>   * We'll need some clearer instructions on how to build/install extra
> ICU versions that might not be provided by the distribution packaging.
> For instance, I got a cryptic error until I used --enable-rpath, which
> might not be obvious to all users.

Suggestions welcome.  No docs at all yet...

>   * Can we have a better error when the library was built with --
> disable-renaming? We can just search for the plain (no suffix) symbol.

I threw out that symbol probing logic, and wrote something simpler that should
now also work with --disable-renaming (though not tested).  Now it does a
cross-check with the library's self-reported major version, just to make
sure there wasn't a badly named library file, which may be more likely
with --disable-renaming.

>   * We should use dlerror() instead of %m to report dlopen() errors.

Fixed.

>   * It seems like the collation version is just there to issue WARNINGs
> when a user is using the non-versioned locale syntax and the library
> changes underneath them (or if there is collation version change within
> a single ICU major version)?

Correct.

I have now updated the warning messages you get when they don't match, to
provide a hint about what to do about it.  I am sure they need some more
word-smithing, though.

>   * How are you testing this?

Ad hoc noodling before now, but see attached.

> I realize your patch is experimental, but when there is a better
> consensus on the approach, we should consider adding declarative syntax
> such as:
>
>CREATE COLLATION (or LOCALE?) PROVIDER icu67
>  TYPE icu VERSION '67' AS '/path/to/icui18n.so.67';
>
> It will offer more opportunities to catch errors early and offer better
> error messages. It would also enable it to function if the library is
> built with --disable-renaming (though we'd have to trust the user).

Earlier in this and other threads, we wondered if each ICU major version should
be a separate provider, which is what you're showing there, or should be an
independent property of an individual COLLATION, which is what v6 did with
'63:en' and what Peter suggested I make more formal with CREATE COLLATION foo
(..., ICU_VERSION=63).  I actually started out thinking we'd have multiple
providers, but I couldn't really think of any advantage, and I think it makes
some upgrade scenarios more painful.  Can you elaborate on why you'd want
that model?

> On Sat, 2022-10-22 at 14:22 +1300, Thomas Munro wrote:
> > Problem 1:  Suppose you're ready to start using (say) v72.  I guess
> > you'd use the REFRESH command, which would open the main linked ICU's
> > collversion and stamp that into the catalogue, at which point new
> > sessions would start using that, and then you'd have to rebuild all
> > your indexes (with no help from PG to tell you how to find everything
> > that needs to be rebuilt, as belaboured in previous reverted work).
> > Aside from the possibility of getting the rebuilding job wrong (as
> > belaboured elsewhere), it's not great, because there is still a
> > transitional period where you can be using the wrong version for 

Re: Allow single table VACUUM in transaction block

2022-11-18 Thread Tom Lane
Robert Haas  writes:
> On Fri, Nov 18, 2022 at 7:04 AM Simon Riggs
>  wrote:
>> So if consistency is also a strong requirement, then maybe we should
>> make that new command the default, i.e. make VACUUM always just a
>> request to vacuum in background. That way it will be consistent.

> Since one fairly common reason for running vacuum in the foreground is
> needing to vacuum a table when all autovacuum workers are busy, or
> when they are vacuuming it with a cost limit and it needs to get done
> sooner, I think this would surprise a lot of users in a negative way.

It would also break a bunch of our regression tests, which expect a
VACUUM to complete immediately.

>> Can we at least have a vacuum_runs_in_background = on | off, to allow
>> users to take advantage of this WITHOUT needing to rewrite all of
>> their scripts?

> I'm not entirely convinced that's a good idea, but happy to hear what
> others think.

I think the answer to that one is flat no.  We learned long ago that GUCs
with significant semantic impact on queries are a bad idea.  For example,
if a user issues VACUUM expecting behavior A and she gets behavior B
because somebody changed the postgresql.conf entry, she won't be happy.

Basically, I am not buying Simon's requirement that this be transparent.
I think the downsides would completely outweigh whatever upside there
may be (and given the shortage of prior complaints, I don't think the
upside is very large).

regards, tom lane




Re: allowing for control over SET ROLE

2022-11-18 Thread Robert Haas
On Thu, Nov 17, 2022 at 7:24 PM Jeff Davis  wrote:
> But I'm fine if you'd like to move on with the SET ROLE privilege
> instead, as long as we believe it grants a stable set of capabilities
> (and conversely, that if the SET ROLE privilege is revoked, that it
> revokes a stable set of capabilities).

OK.

Here's a rebased v3 to see what cfbot thinks.

-- 
Robert Haas
EDB: http://www.enterprisedb.com


v3-0001-Add-a-SET-option-to-the-GRANT-command.patch
Description: Binary data


Re: Avoid double lookup in pgstat_fetch_stat_tabentry()

2022-11-18 Thread Andres Freund
Hi,

On 2022-11-18 11:09:43 +0100, Drouvot, Bertrand wrote:
> > Furthermore, the pgstat_fetch_stat_tabentry() can just be a
> > static inline function.

I think that's just premature optimization for something like this. The
function call overhead on accessing stats can't be a relevant factor - the
increase in code size is more likely to matter (but still unlikely).


> Good point. While at it, why not completely get rid of
> pgstat_fetch_stat_tabentry_ext(), like in v2 the attached?

-1, I don't think spreading the IsSharedRelation() is a good idea. It costs
more code than it saves.

Greetings,

Andres Freund




Re: predefined role(s) for VACUUM and ANALYZE

2022-11-18 Thread Nathan Bossart
rebased

-- 
Nathan Bossart
Amazon Web Services: https://aws.amazon.com
>From 0643a4dbc9f36d9fd383ef0cfebef13875237718 Mon Sep 17 00:00:00 2001
From: Nathan Bossart 
Date: Wed, 7 Sep 2022 22:25:29 -0700
Subject: [PATCH v10 1/4] Change AclMode from a uint32 to a uint64.

---
 src/backend/nodes/outfuncs.c|  2 +-
 src/bin/pg_upgrade/check.c  | 35 +
 src/include/catalog/pg_type.dat |  4 ++--
 src/include/nodes/parsenodes.h  |  6 +++---
 src/include/utils/acl.h | 28 +-
 5 files changed, 55 insertions(+), 20 deletions(-)

diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c
index f05e72f0dc..8f150e9a2e 100644
--- a/src/backend/nodes/outfuncs.c
+++ b/src/backend/nodes/outfuncs.c
@@ -560,7 +560,7 @@ _outRangeTblEntry(StringInfo str, const RangeTblEntry *node)
 	WRITE_BOOL_FIELD(lateral);
 	WRITE_BOOL_FIELD(inh);
 	WRITE_BOOL_FIELD(inFromCl);
-	WRITE_UINT_FIELD(requiredPerms);
+	WRITE_UINT64_FIELD(requiredPerms);
 	WRITE_OID_FIELD(checkAsUser);
 	WRITE_BITMAPSET_FIELD(selectedCols);
 	WRITE_BITMAPSET_FIELD(insertedCols);
diff --git a/src/bin/pg_upgrade/check.c b/src/bin/pg_upgrade/check.c
index f1bc1e6886..615a53a864 100644
--- a/src/bin/pg_upgrade/check.c
+++ b/src/bin/pg_upgrade/check.c
@@ -28,6 +28,7 @@ static void check_for_incompatible_polymorphics(ClusterInfo *cluster);
 static void check_for_tables_with_oids(ClusterInfo *cluster);
 static void check_for_composite_data_type_usage(ClusterInfo *cluster);
 static void check_for_reg_data_type_usage(ClusterInfo *cluster);
+static void check_for_aclitem_data_type_usage(ClusterInfo *cluster);
 static void check_for_jsonb_9_4_usage(ClusterInfo *cluster);
 static void check_for_pg_role_prefix(ClusterInfo *cluster);
 static void check_for_new_tablespace_dir(ClusterInfo *new_cluster);
@@ -107,6 +108,13 @@ check_and_dump_old_cluster(bool live_check)
 	check_for_reg_data_type_usage(&old_cluster);
 	check_for_isn_and_int8_passing_mismatch(&old_cluster);
 
+	/*
+	 * PG 16 increased the size of the 'aclitem' type, which breaks the on-disk
+	 * format for existing data.
+	 */
+	if (GET_MAJOR_VERSION(old_cluster.major_version) <= 1500)
+		check_for_aclitem_data_type_usage(&old_cluster);
+
 	/*
 	 * PG 14 changed the function signature of encoding conversion functions.
 	 * Conversions from older versions cannot be upgraded automatically
@@ -1319,6 +1327,33 @@ check_for_reg_data_type_usage(ClusterInfo *cluster)
 		check_ok();
 }
 
+/*
+ * check_for_aclitem_data_type_usage
+ *
+ *	aclitem changed its storage format in 16, so check for it.
+ */
+static void
+check_for_aclitem_data_type_usage(ClusterInfo *cluster)
+{
+	char		output_path[MAXPGPATH];
+
+	prep_status("Checking for incompatible aclitem data type in user tables");
+
+	snprintf(output_path, sizeof(output_path), "tables_using_aclitem.txt");
+
+	if (check_for_data_type_usage(cluster, "pg_catalog.aclitem", output_path))
+	{
+		pg_log(PG_REPORT, "fatal");
+		pg_fatal("Your installation contains the \"aclitem\" data type in user tables.\n"
+ "The internal format of \"aclitem\" changed in PostgreSQL version 16\n"
+ "so this cluster cannot currently be upgraded.  You can drop the\n"
+ "problem columns and restart the upgrade.  A list of the problem\n"
+ "columns is in the file:\n"
+ "%s", output_path);
+	}
+	else
+		check_ok();
+}
 
 /*
  * check_for_jsonb_9_4_usage()
diff --git a/src/include/catalog/pg_type.dat b/src/include/catalog/pg_type.dat
index df45879463..0763dfde39 100644
--- a/src/include/catalog/pg_type.dat
+++ b/src/include/catalog/pg_type.dat
@@ -267,9 +267,9 @@
 # OIDS 1000 - 1099
 
 { oid => '1033', array_type_oid => '1034', descr => 'access control list',
-  typname => 'aclitem', typlen => '12', typbyval => 'f', typcategory => 'U',
+  typname => 'aclitem', typlen => '16', typbyval => 'f', typcategory => 'U',
   typinput => 'aclitemin', typoutput => 'aclitemout', typreceive => '-',
-  typsend => '-', typalign => 'i' },
+  typsend => '-', typalign => 'd' },
 { oid => '1042', array_type_oid => '1014',
   descr => 'char(length), blank-padded string, fixed storage length',
   typname => 'bpchar', typlen => '-1', typbyval => 'f', typcategory => 'S',
diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h
index 7caff62af7..f4ed9bbff9 100644
--- a/src/include/nodes/parsenodes.h
+++ b/src/include/nodes/parsenodes.h
@@ -73,12 +73,12 @@ typedef enum SetQuantifier
 
 /*
  * Grantable rights are encoded so that we can OR them together in a bitmask.
- * The present representation of AclItem limits us to 16 distinct rights,
- * even though AclMode is defined as uint32.  See utils/acl.h.
+ * The present representation of AclItem limits us to 32 distinct rights,
+ * even though AclMode is defined as uint64.  See utils/acl.h.
  *
  * Caution: changing these codes breaks stored ACLs, hence forces initdb.
  */
-typedef uint32 AclMode;			/* a bitmask of privilege bits */
+typedef uint64 AclMode;		

Re: Allow single table VACUUM in transaction block

2022-11-18 Thread Robert Haas
On Fri, Nov 18, 2022 at 7:04 AM Simon Riggs
 wrote:
> Outside a transaction - works perfectly
> In a transaction - throws ERROR, which prevents a whole script from
> executing correctly

Right, but your proposal would move that inconsistency to a different
place. It wouldn't eliminate it. I don't think we can pretend that
nobody will notice their operation being moved to the background. For
instance, there might not be an available background worker for a long
time, which could mean that some vacuums work right away and others
just sit there for reasons that aren't obvious to the user.

> So if consistency is also a strong requirement, then maybe we should
> make that new command the default, i.e. make VACUUM always just a
> request to vacuum in background. That way it will be consistent.

Since one fairly common reason for running vacuum in the foreground is
needing to vacuum a table when all autovacuum workers are busy, or
when they are vacuuming it with a cost limit and it needs to get done
sooner, I think this would surprise a lot of users in a negative way.

> Can we at least have a vacuum_runs_in_background = on | off, to allow
> users to take advantage of this WITHOUT needing to rewrite all of
> their scripts?

I'm not entirely convinced that's a good idea, but happy to hear what
others think.

-- 
Robert Haas
EDB: http://www.enterprisedb.com




Re: libpq compression (part 2)

2022-11-18 Thread Peter Eisentraut

On 18.11.22 02:07, Andrey Borodin wrote:

2. This literal
{no_compression_name}
should be replaced by explicit form
{no_compression_name, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL}


That doesn't seem better.




Re: Optimize join selectivity estimation by not reading MCV stats for unique join attributes

2022-11-18 Thread Tom Lane
David Geier  writes:
> On 11/18/22 14:00, Tomas Vondra wrote:
>> Seems fine. I wonder if we could/should introduce a new constant for 0,
>> similar to ATTSTATSSLOT_NUMBERS/ATTSTATSSLOT_VALUES, instead of using a
>> magic constant. Say, ATTSTATSSLOT_NONE or ATTSTATSSLOT_CHECK.

> Good idea. I called it ATTSTATSSLOT_EXISTS. New patch attached.

No, I don't think it's a good idea.  The flags argument is documented as,
and used as, a bitmask of multiple options.  Passing zero fits fine with
that and is consistent with what we do elsewhere.  Turning it into
sort-of-an-enum-but-not-really isn't an improvement.

I didn't like your draft comment too much, because it didn't cover
what I think is the most important point: after a call with flags=0
we do not need a matching free_attstatsslot call to avoid leaking
anything.  (If we did, this patch would be a lot hairier.)

I rewrote the comment the way I wanted it and pushed.

regards, tom lane




Re: Optimize join selectivity estimation by not reading MCV stats for unique join attributes

2022-11-18 Thread David Geier

On 11/18/22 14:00, Tomas Vondra wrote:

Seems fine. I wonder if we could/should introduce a new constant for 0,
similar to ATTSTATSSLOT_NUMBERS/ATTSTATSSLOT_VALUES, instead of using a
magic constant. Say, ATTSTATSSLOT_NONE or ATTSTATSSLOT_CHECK.

Good idea. I called it ATTSTATSSLOT_EXISTS. New patch attached.

I don't think you can write a test for this, because there is no change
to behavior that can be observed by the user. If one side has no MCV,
the only difference is whether we try to load the other MCV or not.


Yeah. I thought along the lines of checking the number of pages read 
when the pg_stats entry is not in syscache yet. But that seems awfully 
implementation specific. So no test provided.


--
David Geier
(ServiceNow)
From 5c5c0fb9dd99e79daaa015984c9dda22e4ccda17 Mon Sep 17 00:00:00 2001
From: David Geier 
Date: Fri, 18 Nov 2022 09:35:08 +0100
Subject: [PATCH] Don't read MCV stats needlessly in join selectivity
 estimation

Join selectivity estimation only uses MCV stats if they're present
on both join attributes. Therefore, if MCV stats are not available
on one of the two columns, skip reading MCV stats altogether.

Frequently encountered situations in which MCV stats are not available
are unique columns or columns for which all values have roughly
the same frequency, for which ANALYZE therefore decided not to store them.
---
 src/backend/utils/adt/selfuncs.c| 17 +++--
 src/backend/utils/cache/lsyscache.c |  4 
 src/include/utils/lsyscache.h   |  3 ++-
 3 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index d597b7e81f..85a29eaeb9 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -2263,6 +2263,7 @@ eqjoinsel(PG_FUNCTION_ARGS)
 	bool		have_mcvs2 = false;
 	bool		join_is_reversed;
 	RelOptInfo *inner_rel;
+	bool		get_mcv_stats;
 
 	get_join_variables(root, args, sjinfo,
 	   &vardata1, &vardata2, &join_is_reversed);
@@ -2275,11 +2276,23 @@ eqjoinsel(PG_FUNCTION_ARGS)
 	memset(&sslot1, 0, sizeof(sslot1));
 	memset(&sslot2, 0, sizeof(sslot2));
 
+	/*
+	 * There is no use in fetching one side's MCVs if we lack MCVs for the
+	 * other side, so do a quick check to verify that both stats exist.
+	 */
+	get_mcv_stats = (HeapTupleIsValid(vardata1.statsTuple) &&
+			 HeapTupleIsValid(vardata2.statsTuple) &&
+			 get_attstatsslot(&sslot1, vardata1.statsTuple,
+	  STATISTIC_KIND_MCV, InvalidOid, ATTSTATSSLOT_EXISTS) &&
+			 get_attstatsslot(&sslot2, vardata2.statsTuple,
+	  STATISTIC_KIND_MCV, InvalidOid, ATTSTATSSLOT_EXISTS));
+
+
 	if (HeapTupleIsValid(vardata1.statsTuple))
 	{
 		/* note we allow use of nullfrac regardless of security check */
 		stats1 = (Form_pg_statistic) GETSTRUCT(vardata1.statsTuple);
-		if (statistic_proc_security_check(&vardata1, opfuncoid))
+		if (get_mcv_stats && statistic_proc_security_check(&vardata1, opfuncoid))
 			have_mcvs1 = get_attstatsslot(&sslot1, vardata1.statsTuple,
 		  STATISTIC_KIND_MCV, InvalidOid,
 		  ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS);
@@ -2289,7 +2302,7 @@ eqjoinsel(PG_FUNCTION_ARGS)
 	{
 		/* note we allow use of nullfrac regardless of security check */
 		stats2 = (Form_pg_statistic) GETSTRUCT(vardata2.statsTuple);
-		if (statistic_proc_security_check(&vardata2, opfuncoid))
+		if (get_mcv_stats && statistic_proc_security_check(&vardata2, opfuncoid))
 			have_mcvs2 = get_attstatsslot(&sslot2, vardata2.statsTuple,
 		  STATISTIC_KIND_MCV, InvalidOid,
 		  ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS);
diff --git a/src/backend/utils/cache/lsyscache.c b/src/backend/utils/cache/lsyscache.c
index a16a63f495..191f68f7b4 100644
--- a/src/backend/utils/cache/lsyscache.c
+++ b/src/backend/utils/cache/lsyscache.c
@@ -3157,6 +3157,10 @@ get_attavgwidth(Oid relid, AttrNumber attnum)
  * reqkind: STAKIND code for desired statistics slot kind.
  * reqop: STAOP value wanted, or InvalidOid if don't care.
  * flags: bitmask of ATTSTATSSLOT_VALUES and/or ATTSTATSSLOT_NUMBERS.
+ *Passing 0 will only check if the requested slot type exists but not
+ *extract any content. This is useful in cases where MCV stats are only
+ *useful if they exist on all involved columns (e.g. during join
+ *selectivity computation).
  *
  * If a matching slot is found, true is returned, and *sslot is filled thus:
  * staop: receives the actual STAOP value.
diff --git a/src/include/utils/lsyscache.h b/src/include/utils/lsyscache.h
index 50f0288305..d9a3916997 100644
--- a/src/include/utils/lsyscache.h
+++ b/src/include/utils/lsyscache.h
@@ -39,7 +39,8 @@ typedef enum IOFuncSelector
 } IOFuncSelector;
 
 /* Flag bits for get_attstatsslot */
-#define ATTSTATSSLOT_VALUES		0x01
+#define ATTSTATSSLOT_EXISTS	0x00
+#define ATTSTATSSLOT_VALUES	0x01
 #define ATTSTATSSLOT_NUMBERS	0x02
 
 /* Result struct for get_attstatsslot */
-- 
2.34.1



Re: Glossary and initdb definition work for "superuser" and database/cluster

2022-11-18 Thread David G. Johnston
On Fri, Nov 18, 2022 at 4:11 AM Alvaro Herrera 
wrote:

> On 2022-Nov-02, David G. Johnston wrote:
>
> > Version 2 attached, some significant re-working.  Starting to think that
> > initdb isn't the place for some of this content - in particular the stuff
> > I'm deciding to move down to the Notes section.  Might consider moving
> some
> > of it to the Server Setup and Operation chapter 19 - Creating Cluster (or
> > nearby...) [1].
> >
> > I settled on "cluster owner" over "cluster user" and made the terminology
> > consistent throughout initdb and the glossary (haven't looked at chapter
> 19
> > yet).  Also added it to the glossary.
>
> Generally speaking, I like the idea of documenting these things.
> However it sounds like you're not done with the wording and editing, so
> I'm not committing the whole patch, but it seems a good starting point
> to at least have some basic definitions.  So I've extracted them from
> your patch and pushed those.  You can already see it at
> https://www.postgresql.org/docs/devel/glossary.html


Agreed on the not quite ready yet, and that the glossary is indeed
self-contained enough to go in by itself at this point.  Thank you for
doing that.


> I left out almost all the material from the patch that's not in the
> glossary proper, and also a few phrases in the glossary itself.  Some of
> these sounded like security considerations rather than part of the
> definitions.  I think we should have a separate chapter in Part III
> (Server Administration) that explains many security aspects; right now
> there's no hope of collecting a lot of very important advice in a single
> place, so a wannabe admin has no chance of getting things right.  That
> seems to me a serious deficiency.  A new chapter could provide a lot of
> general advice on every aspect that needs to be considered, and link to
> the reference section for additional details.  Maybe part of these
> initdb considerations could be there, too.
>

I'll consider that approach as well as other spots in the documentation on
this next pass.

David J.


Re: [PoC] configurable out of disk space elog level

2022-11-18 Thread Maxim Orlov
> I don't think this is a good feature to add to PostgreSQL. First, it's
> unclear why stopping the cluster is a desirable behavior. It doesn't
> stop the user transactions from failing; it just makes them fail in
> some other way. Now it is of course perfectly legitimate for a
> particular user to want that particular behavior anyway, but there are
> a bunch of other things that a user could equally legitimately want to
> do, like page the DBA or trigger a failover or kill off sessions that
> are using large temporary relations or whatever. And, equally, there
> are many other operating system errors to which a user could want the
> database system to respond in similar ways. For example, someone might
> want any given one of those treatments when an I/O error occurs
> writing to the data directory, or a read-only filesystem error, or a
> permission denied error.
>
> Having a switch for one particular kind of error (out of many that
> could possibly occur) that triggers one particular coping strategy
> (out of many that could possibly be used) seems far too specific a
> thing to add as a core feature. And even if we had something much more
> general, I'm not sure why that should go into the database rather than
> being implemented outside it. After all, nothing at all prevents the
> user from scanning the database logs for "out of space" errors and
> shutting down the database if any are found.


Yes, you are right. Actually, this is one of possible ways to deal with
described situation I
mentioned above. And if I would deal with such a task, I would make it via
log monitoring.
The question was: "could we be more general here?". Probably, not.


> While you're at it, you
> could make your monitoring script also check the free space on the
> relevant partition using statfs() and page somebody if the utilization
> goes above 95% or whatever threshold you like, which would probably
> avoid service outages much more effectively than $SUBJECT.
>
> I just can't see much real benefit in putting this logic inside the
> database.
>

OK, I got it. Thanks for your thoughts!

-- 
Best regards,
Maxim Orlov.


Re: Patch: Global Unique Index

2022-11-18 Thread Pavel Stehule
pá 18. 11. 2022 v 16:06 odesílatel Tom Lane  napsal:

> Sergei Kornilov  writes:
> > Do we need new syntax actually? I think that a global unique index can
> be created automatically instead of raising an error "unique constraint on
> partitioned table must include all partitioning columns"
>
> I'm not convinced that we want this feature at all: as far as I can see,
> it will completely destroy the benefits of making a partitioned table
> in the first place.  But if we do want it, I don't think it should be
> so easy to create a global index by accident as that syntax approach
> would make it.  I think there needs to be a pretty clear YES I WANT TO
> SHOOT MYSELF IN THE FOOT clause in the command.
>

isn't possible to have a partitioned index?

https://www.highgo.ca/2022/10/14/global-index-a-different-approach/

Regards

Pavel


>
> regards, tom lane
>
>
>


Re: Patch: Global Unique Index

2022-11-18 Thread Tom Lane
Sergei Kornilov  writes:
> Do we need new syntax actually? I think that a global unique index can be 
> created automatically instead of raising an error "unique constraint on 
> partitioned table must include all partitioning columns"

I'm not convinced that we want this feature at all: as far as I can see,
it will completely destroy the benefits of making a partitioned table
in the first place.  But if we do want it, I don't think it should be
so easy to create a global index by accident as that syntax approach
would make it.  I think there needs to be a pretty clear YES I WANT TO
SHOOT MYSELF IN THE FOOT clause in the command.

regards, tom lane




Re: CREATE UNLOGGED TABLE seq faults when debug_discard_caches=1

2022-11-18 Thread Tomas Vondra



On 11/18/22 15:43, Tom Lane wrote:
> David Geier  writes:
>> On a different note: are we frequently running our tests suites with 
>> debug_discard_caches=1 enabled?
>> It doesn't seem like.
> 
> Hmm.   Buildfarm members avocet and trilobite are supposed to be
> doing that, but their runtimes of late put the lie to it.
> Configuration option got lost somewhere?
> 

Yup, my bad - I forgot to tweak CPPFLAGS when upgrading the buildfarm
client to v12. Fixed, next run should be with

CPPFLAGS => '-DCLOBBER_CACHE_ALWAYS',


regards

-- 
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company




Re: Optimize join selectivity estimation by not reading MCV stats for unique join attributes

2022-11-18 Thread Tom Lane
Tomas Vondra  writes:
> On 11/18/22 09:54, David Geier wrote:
>> I couldn't come up with any reasonable way of writing an automated test
>> for that.

> I don't think you can write a test for this, because there is no change
> to behavior that can be observed by the user.

Yeah, and the delta in performance is surely too small to be
measured reliably in the buildfarm.  I think coverage will have
to be sufficient.

regards, tom lane




Re: CREATE UNLOGGED TABLE seq faults when debug_discard_caches=1

2022-11-18 Thread Tom Lane
David Geier  writes:
> On a different note: are we frequently running our tests suites with 
> debug_discard_caches=1 enabled?
> It doesn't seem like.

Hmm.   Buildfarm members avocet and trilobite are supposed to be
doing that, but their runtimes of late put the lie to it.
Configuration option got lost somewhere?

prion is running with -DRELCACHE_FORCE_RELEASE -DCATCACHE_FORCE_RELEASE,
which I would have thought would be enough to catch this, but I guess
not.

regards, tom lane




Re: Fix proposal for comparaison bugs in PostgreSQL::Version

2022-11-18 Thread Andrew Dunstan


On 2022-11-17 Th 17:11, Andrew Dunstan wrote:
> On 2022-11-04 Fr 10:06, Jehan-Guillaume de Rorthais wrote:
>> On Thu, 3 Nov 2022 13:11:18 -0500
>> Justin Pryzby  wrote:
>>
>>> On Tue, Jun 28, 2022 at 06:17:40PM -0400, Andrew Dunstan wrote:
 Nice catch, but this looks like massive overkill. I think we can very
 simply fix the test in just a few lines of code, instead of a 190 line
 fix and a 130 line TAP test.

 It was never intended to be able to compare markers like rc1 vs rc2, and
 I don't see any need for it. If you can show me a sane use case I'll
 have another look, but right now it seems quite unnecessary.

 Here's my proposed fix.

 diff --git a/src/test/perl/PostgreSQL/Version.pm
 b/src/test/perl/PostgreSQL/Version.pm index 8f70491189..8d4dbbf694 100644
 --- a/src/test/perl/PostgreSQL/Version.pm  
>>> Is this still an outstanding issue ?
>> The issue still exists on current HEAD:
>>
>>   $ perl -Isrc/test/perl/ -MPostgreSQL::Version -le \
>>   'print "bug" if PostgreSQL::Version->new("9.6") <= 9.0'
>>   bug
>>
>> Regards,
>>
> Oops. this slipped off mt radar. I'll apply a fix shortly, thanks for
> the reminder.
>
>

Done.


cheers


andrew


--
Andrew Dunstan
EDB: https://www.enterprisedb.com





Re: [PoC] Improve dead tuple storage for lazy vacuum

2022-11-18 Thread Masahiko Sawada
On Thu, Nov 17, 2022 at 12:24 AM Masahiko Sawada  wrote:
>
> On Wed, Nov 16, 2022 at 4:39 PM John Naylor
>  wrote:
> >
> >
> > On Wed, Nov 16, 2022 at 12:33 PM Masahiko Sawada  
> > wrote:
> > >
> > > On Wed, Nov 16, 2022 at 1:46 PM John Naylor
> > >  wrote:
> > > >
> > > >
> > > > On Tue, Nov 15, 2022 at 11:59 AM Masahiko Sawada 
> > > >  wrote:
> > > > > Thanks! Please let me know if there is something I can help with.
> > > >
> > > > I didn't get very far because the tests fail on 0004 in rt_verify_node:
> > > >
> > > > TRAP: failed Assert("n4->chunks[i - 1] < n4->chunks[i]"), File: 
> > > > "../src/backend/lib/radixtree.c", Line: 2186, PID: 18242
> > >
> > > Which tests do you use to get this assertion failure? I've confirmed
> > > there is a bug in 0005 patch but without it, "make check-world"
> > > passed.
> >
> > Hmm, I started over and rebuilt and it didn't reproduce. Not sure what 
> > happened, sorry for the noise.
>
> Good to know. No problem.
>
> > I'm attaching a test I wrote to stress test branch prediction in search, 
> > and while trying it out I found two possible issues.
>
> Thank you for testing!
>
> >
> > It's based on the random int load test, but tests search speed. Run like 
> > this:
> >
> > select * from bench_search_random_nodes(10 * 1000 * 1000)
> >
> > It also takes some care to include all the different node kinds, 
> > restricting the possible keys by AND-ing with a filter. Here's a simple 
> > demo:
> >
> > filter = ((uint64)1<<40)-1;
> > LOG:  num_keys = 967, height = 4, n4 = 17513814, n32 = 6320, n128 = 
> > 62663, n256 = 3130
> >
> > Just using random integers leads to >99% using the smallest node. I wanted 
> > to get close to having the same number of each, but that's difficult while 
> > still using random inputs. I ended up using
> >
> > filter = (((uint64) 0x7F<<32) | (0x07<<24) | (0xFF<<16) | 0xFF)
> >
> > which gives
> >
> > LOG:  num_keys = 9291812, height = 4, n4 = 262144, n32 = 79603, n128 = 
> > 182670, n256 = 1024
> >
> > Which seems okay for the task. One puzzling thing I found while trying 
> > various filters is that sometimes the reported tree height would change. 
> > For example:
> >
> > filter = (((uint64) 1<<32) | (0xFF<<24));
> > LOG:  num_keys = 944, height = 7, n4 = 47515559, n32 = 6209, n128 = 
> > 62632, n256 = 3161
> >
> > 1) Any idea why the tree height would be reported as 7 here? I didn't 
> > expect that.
>
> In my environment, (0xFF<<24) is 0xFFFFFFFFFF000000, not 0xFF000000.
> It seems the filter should be (((uint64) 1<<32) | ((uint64)
> 0xFF<<24)).
>
> >
> > 2) It seems that 0004 actually causes a significant slowdown in this test 
> > (as in the attached, using the second filter above and with turboboost 
> > disabled):
> >
> > v9 0003: 2062 2051 2050
> > v9 0004: 2346 2316 2321
> >
> > That means my idea for the pointer struct might have some problems, at 
> > least as currently implemented. Maybe in the course of separating out and 
> > polishing that piece, an inefficiency will fall out. Or, it might be 
> > another reason to template local and shared separately. Not sure yet. I 
> > also haven't tried to adjust this test for the shared memory case.
>
> I'll also run the test on my environment and do the investigation tomorrow.
>

FYI I've not tested the patch you shared today but here are the
benchmark results I did with the v9 patch in my environment (I used
the second filter). I splitted 0004 patch into two patches: a patch
for pure refactoring patch to introduce rt_node_ptr and a patch to do
pointer tagging.

v9 0003 patch: 1113 1114 1114
introduce rt_node_ptr: 1127 1128 1128
pointer tagging  : 1085 1087 1086 (equivalent to 0004 patch)

In my environment, rt_node_ptr seemed to lead some overhead but
pointer tagging had performance benefits. I'm not sure the reason why
the results are different from yours. The radix tree stats shows the
same as your tests.

=# select * from bench_search_random_nodes(10 * 1000 * 1000);
2022-11-18 22:18:21.608 JST [3913544] LOG:  num_keys = 9291812, height
= 4, n4 = 262144, n32 =79603, n128 = 182670, n256 = 1024

Regards,

--
Masahiko Sawada
Amazon Web Services: https://aws.amazon.com




Re: Add index scan progress to pg_stat_progress_vacuum

2022-11-18 Thread Masahiko Sawada
On Sat, Nov 12, 2022 at 4:10 AM Imseih (AWS), Sami  wrote:
>
> >I don't think any of these progress callbacks should be done while 
> > pinning a
> >buffer and ...
>
> Good point.
>
> >I also don't understand why info->parallel_progress_callback exists? 
> > It's only
> >set to parallel_vacuum_progress_report(). Why make this stuff more 
> > expensive
> >than it has to already be?
>
> Agree. Modified the patch to avoid the callback .
>
> >So each of the places that call this need to make an additional external
> >function call for each page, just to find that there's nothing to do or 
> > to
> >make yet another indirect function call. This should probably a static 
> > inline
> >function.
>
> Even better to just remove a function call altogether.
>
> >This is called, for every single page, just to read the number of indexes
> >completed by workers? A number that barely ever changes?
>
> I will take the initial suggestion by Sawada-san to update the progress
> every 1GB of blocks scanned.
>
> Also, it seems to me that we don't need to track progress in brin indexes,
> Since it is rare, if ever, this type of index will go through very heavy
> block scans. In fact, I noticed the vacuum AMs for brin don't invoke the
> vacuum_delay_point at all.
>
> The attached patch addresses the feedback.
>

Thank you for updating the patch! Here are review comments on v15 patch:

+  
+   Number of indexes that will be vacuumed. This value will be
+   0 if there are no indexes to vacuum or
+   vacuum failsafe is triggered.

I think that indexes_total should be 0 also when INDEX_CLEANUP is off.

---
+/*
+ * Reset the indexes completed at this point.
+ * If we end up in another index vacuum cycle, we will
+ * start counting from the start.
+ */
+pgstat_progress_update_param(PROGRESS_VACUUM_INDEX_COMPLETED, 0);

I think we don't need to reset it at the end of index vacuuming. There
is a small window before switching to the next phase. If we reset this
value while showing "index vacuuming" phase, the user might get
confused. Instead, we can reset it at the beginning of the index
vacuuming.

---
+void
+parallel_wait_for_workers_to_finish(ParallelVacuumState *pvs)
+{
+while (pg_atomic_read_u32(&(pvs->shared->idx_completed_progress))
< pvs->nindexes)
+{
+pgstat_progress_update_param(PROGRESS_VACUUM_INDEX_COMPLETED,
+
   pg_atomic_read_u32(&(pvs->shared->idx_completed_progress)));
+
+(void) WaitLatch(MyLatch, WL_LATCH_SET |
WL_EXIT_ON_PM_DEATH, -1,
+ WAIT_EVENT_PARALLEL_FINISH);
+ResetLatch(MyLatch);
+}
+}

We should add CHECK_FOR_INTERRUPTS() at the beginning of the loop to
make the wait interruptible.

I think it would be better to update the counter only when the value
has been increased.

I think we should set a timeout, say 1 sec, to WaitLatch so that it
can periodically check the progress.

Probably it's better to have a new wait event for this wait in order
to distinguish wait for workers to complete index vacuum from the wait
for workers to exit.

---
@@ -838,7 +867,12 @@
parallel_vacuum_process_one_index(ParallelVacuumState *pvs, Relation
indrel,
 ivinfo.estimated_count = pvs->shared->estimated_count;
 ivinfo.num_heap_tuples = pvs->shared->reltuples;
 ivinfo.strategy = pvs->bstrategy;
-
+ivinfo.idx_completed_progress = pvs->shared->idx_completed_progress;

and

@@ -998,6 +998,9 @@ btvacuumscan(IndexVacuumInfo *info,
IndexBulkDeleteResult *stats,
 if (info->report_progress)

pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,

   scanblkno);
+if (info->report_parallel_progress &&
(scanblkno % REPORT_PARALLEL_VACUUM_EVERY_PAGES) == 0)
+
pgstat_progress_update_param(PROGRESS_VACUUM_INDEX_COMPLETED,
+

pg_atomic_read_u32(&(info->idx_completed_progress)));
 }

I think this doesn't work, since ivinfo.idx_completed is in the
backend-local memory. Instead, I think we can have a function in
vacuumparallel.c that updates the progress. Then we can have index AM
call this function.

---
+if (!IsParallelWorker())
+ivinfo.report_parallel_progress = true;
+else
+ivinfo.report_parallel_progress = false;

We can do like:

ivinfo.report_parallel_progress = !IsParallelWorker();

Regards,

-- 
Masahiko Sawada
Amazon Web Services: https://aws.amazon.com




Re: Optimize join selectivity estimation by not reading MCV stats for unique join attributes

2022-11-18 Thread Tomas Vondra
On 11/18/22 09:54, David Geier wrote:
> Thanks everyone for the great feedback and suggestions.
> 
>>
>>> Yes, it is.  Using zero flag would short-cut get_attstatsslot() to just
>>> return whether the slot type exists without loading it.  Do you think we
>>> need to emphasize this use case in the comments for 'flags'?
>> Perhaps, it's not really obvious now.
> 
> Comment added.
> 
> 
>> I wonder whether we need to also check statistic_proc_security_check()
>>> when determining if MCVs exists in both sides.
>> Yeah, I thought about hoisting the statistic_proc_security_check
>> tests up into get_mcv_stats.  I don't think it's a great idea
>> though.  Again, it'd complicate untangling this if we ever
>> generalize the use of MCVs in this function.  Also, I don't
>> think we should be micro-optimizing the case where the security
>> check doesn't pass --- if it doesn't, you're going to be hurting
>> from bad plans a lot more than you are from some wasted cycles
>> here.
> 
> Sounds reasonable.
> 
> Attached is v2 of the patch.
> This is basically Tom's version plus a comment for the flags of
> get_attstatslot() as suggested by Richard.
> 

Seems fine. I wonder if we could/should introduce a new constant for 0,
similar to ATTSTATSSLOT_NUMBERS/ATTSTATSSLOT_VALUES, instead of using a
magic constant. Say, ATTSTATSSLOT_NONE or ATTSTATSSLOT_CHECK.

> I couldn't come up with any reasonable way of writing an automated test
> for that.
> Any ideas?
> 

I don't think you can write a test for this, because there is no change
to behavior that can be observed by the user. If one side has no MCV,
the only difference is whether we try to load the other MCV or not.
There's no impact on estimates, because we won't use it.

IMO the best thing we can do is check coverage, that the new code is
exercised in regression tests. And I think that's fine.


regards

-- 
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company




Re: CREATE UNLOGGED TABLE seq faults when debug_discard_caches=1

2022-11-18 Thread David Geier

Hi Tom,

Back-patching but keeping RelationOpenSgmr() for extensions sounds 
reasonable.


On a different note: are we frequently running our tests suites with 
debug_discard_caches=1 enabled?

It doesn't seem like. I just ran make check with debug_discard_caches=1 on

- latest master: everything passes.
- version 14.5: fails in create_index, create_index_spgist, create_view.

So the buggy code path is at least covered by the tests. But it seems 
like we could have found it earlier by regularly running with 
debug_discard_caches=1.


--
David Geier
(ServiceNow)

On 11/17/22 18:51, Tom Lane wrote:

I wrote:

I wonder whether we ought to back-patch f10f0ae42.  We could
leave the RelationOpenSmgr macro in existence to avoid unnecessary
breakage of extension code, but stop using it within our own code.

Concretely, about like this for v14 (didn't look at the older
branches yet).

I'm not sure whether to recommend that outside extensions switch to using
RelationGetSmgr in pre-v15 branches.  If they do, they run a risk
of compile failure should they be built against old back-branch
headers.  Once compiled, though, they'd work against any minor release
(since RelationGetSmgr is static inline, not something in the core
backend).  So maybe that'd be good enough, and keeping their code in
sync with what they need for v15 would be worth something.

regards, tom lane






Re: Allow single table VACUUM in transaction block

2022-11-18 Thread Simon Riggs
On Thu, 17 Nov 2022 at 20:06, Robert Haas  wrote:
>
> On Wed, Nov 16, 2022 at 5:14 PM Greg Stark  wrote:
> > However I'm not a fan of commands that sometimes do one thing and
> > sometimes magically do something very different. I don't like the idea
> > that the same vacuum command would sometimes run in-process and
> > sometimes do this out of process request. And the rules for when it
> > does which are fairly complex to explain -- it runs in process unless
> > you're in a transaction when it runs out of process unless it's a
> > temporary table ...
>
> 100% agree.

I agree as well.

At the moment, the problem (OT) is that VACUUM behaves inconsistently

Outside a transaction - works perfectly
In a transaction - throws ERROR, which prevents a whole script from
executing correctly

What we are trying to do is avoid the ERROR. I don't want them to
behave like this, but that's the only option possible to avoid ERROR.

So if consistency is also a strong requirement, then maybe we should
make that new command the default, i.e. make VACUUM always just a
request to vacuum in background. That way it will be consistent.

Can we at least have a vacuum_runs_in_background = on | off, to allow
users to take advantage of this WITHOUT needing to rewrite all of
their scripts?

-- 
Simon Riggs                http://www.EnterpriseDB.com/




Re: Allow single table VACUUM in transaction block

2022-11-18 Thread Simon Riggs
On Thu, 17 Nov 2022 at 20:00, Justin Pryzby  wrote:
>
> On Wed, Nov 16, 2022 at 05:14:07PM -0500, Greg Stark wrote:
> > I think this requesting autovacuum worker should be a distinct
> > command. Or at least an explicit option to vacuum.
>
> +1.  I was going to suggest VACUUM (NOWAIT) ..

Yes, I have no problem with an explicit command.

At the moment the patch runs VACUUM in the background in an autovacuum
process, but the call is asynchronous, since we do not wait for the
command to finish (or even start).

So the command names I was thinking of would be one of these:

VACUUM (BACKGROUND) or VACUUM (AUTOVACUUM) - which might be clearer
or
VACUUM (ASYNC) - which is more descriptive of the behavior

or we could go for both
VACUUM (BACKGROUND, ASYNC) - since this allows us to have a
BACKGROUND, SYNC version in the future

Thoughts?

-- 
Simon Riggs                http://www.EnterpriseDB.com/




Re: postgres_fdw: batch inserts vs. before row triggers

2022-11-18 Thread Etsuro Fujita
Hi,

While working on something else, I notice some more oddities.  Here is
an example:

create extension postgres_fdw;
create server loopback foreign data wrapper postgres_fdw options
(dbname 'postgres');
create user mapping for current_user server loopback;
create table t1 (a text, b int);
create foreign table ft1 (a text, b int) server loopback options
(table_name 't1');
create table w1 (a text, b int);
create function ft1_rowcount_trigf() returns trigger language plpgsql
as $$ begin raise notice '%: there are % rows in ft1', tg_name,
(select count(*) from ft1); return new; end; $$;
create trigger ft1_rowcount_trigger before insert on w1 for each row
execute function ft1_rowcount_trigf();
alter server loopback options (add batch_size '10');

with t as (insert into w1 values ('foo', 10), ('bar', 20) returning *)
insert into ft1 select * from t;
NOTICE:  ft1_rowcount_trigger: there are 0 rows in ft1
NOTICE:  ft1_rowcount_trigger: there are 0 rows in ft1
INSERT 0 2

The command tag shows that two rows were inserted into ft1, but:

select * from ft1;
  a  | b
-+
 foo | 10
 bar | 20
 foo | 10
 bar | 20
(4 rows)

ft1 has four rows, which is wrong.  Also, when inserting the second
row (‘bar’, 20) into w1, the BEFORE ROW INSERT trigger should see the
first row (‘foo’, 10) in ft1, but it reports no rows were visible
there.

The reason for the former is that this bit added by commit b663a4136
is done not only when running the primary ModifyTable node but when
running the secondary ModifyTable node (with the wrong
ModifyTableState).

/*
 * Insert remaining tuples for batch insert.
 */
if (proute)
relinfos = estate->es_tuple_routing_result_relations;
else
relinfos = estate->es_opened_result_relations;

foreach(lc, relinfos)
{
resultRelInfo = lfirst(lc);
if (resultRelInfo->ri_NumSlots > 0)
ExecBatchInsert(node, resultRelInfo,
resultRelInfo->ri_Slots,
resultRelInfo->ri_PlanSlots,
resultRelInfo->ri_NumSlots,
estate, node->canSetTag);
}

The reason for the latter is that that commit fails to flush pending
inserts before executing any BEFORE ROW triggers, so that rows are
visible to such triggers.

Attached is a patch for fixing these issues.  In the patch I added to
the EState struct a List member es_insert_pending_result_relations to
store ResultRelInfos for foreign tables on which batch inserts are to
be performed, so that we avoid scanning through
es_tuple_routing_result_relations or es_opened_result_relations each
time when flushing pending inserts to the foreign tables.

Best regards,
Etsuro Fujita


fix-handling-of-pending-inserts.patch
Description: Binary data


Re: Avoid double lookup in pgstat_fetch_stat_tabentry()

2022-11-18 Thread Bharath Rupireddy
On Fri, Nov 18, 2022 at 3:41 PM Drouvot, Bertrand
 wrote:
>
> > However, I have a suggestion to simplify it
> > further by getting rid of the local variable tabentry and just
> > returning pgstat_fetch_stat_tabentry_ext(IsSharedRelation(relid),
> > relid);. Furthermore, the pgstat_fetch_stat_tabentry() can just be a
> > static inline function.
> Good point. While at it, why not completely get rid of
> pgstat_fetch_stat_tabentry_ext(), like in v2 the attached?

Hm. While it saves around 20 LOC, IsSharedRelation() is now spread
across, but WFM.

--
Bharath Rupireddy
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com




Re: Split index and table statistics into different types of stats

2022-11-18 Thread Drouvot, Bertrand

Hi,

On 11/16/22 9:12 PM, Andres Freund wrote:

Hi,

On 2022-11-15 10:48:40 +0100, Drouvot, Bertrand wrote:

diff --git a/src/backend/utils/adt/pgstatfuncs.c 
b/src/backend/utils/adt/pgstatfuncs.c
index ae3365d917..be7f175bf1 100644
--- a/src/backend/utils/adt/pgstatfuncs.c
+++ b/src/backend/utils/adt/pgstatfuncs.c
@@ -36,24 +36,34 @@
  
  #define HAS_PGSTAT_PERMISSIONS(role)	 (has_privs_of_role(GetUserId(), ROLE_PG_READ_ALL_STATS) || has_privs_of_role(GetUserId(), role))
  
+#define PGSTAT_FETCH_STAT_ENTRY(entry, stat_name) ((entry == NULL) ? 0 : (int64) (entry->stat_name))

+
  Datum
-pg_stat_get_numscans(PG_FUNCTION_ARGS)
+pg_stat_get_index_numscans(PG_FUNCTION_ARGS)
  {
Oid relid = PG_GETARG_OID(0);
int64   result;
-   PgStat_StatTabEntry *tabentry;
+   PgStat_StatIndEntry *indentry = pgstat_fetch_stat_indentry(relid);
  
-	if ((tabentry = pgstat_fetch_stat_tabentry(relid)) == NULL)

-   result = 0;
-   else
-   result = (int64) (tabentry->numscans);
+   result = PGSTAT_FETCH_STAT_ENTRY(indentry, numscans);
  
  	PG_RETURN_INT64(result);

  }


This still leaves a fair bit of boilerplate. ISTM that the function body
really should just be a single line.

Might even be worth defining the whole function via a macro. Perhaps something 
like

PGSTAT_DEFINE_REL_FIELD_ACCESSOR(PGSTAT_KIND_INDEX, pg_stat_get_index, 
numscans);


Thanks for the feedback!

Right, what about something like the following?

"
#define PGSTAT_FETCH_STAT_ENTRY(pgstat_entry_kind, pgstat_fetch_stat_function, 
relid, stat_name) \
do { \
pgstat_entry_kind *entry = pgstat_fetch_stat_function(relid); \
PG_RETURN_INT64(entry == NULL ? 0 : (int64) 
(entry->stat_name)); \
} while (0)

Datum
pg_stat_get_index_numscans(PG_FUNCTION_ARGS)
{
PGSTAT_FETCH_STAT_ENTRY(PgStat_StatIndEntry, 
pgstat_fetch_stat_indentry, PG_GETARG_OID(0), numscans);
}
"



I think the logic to infer which DB oid to use for a stats entry could be
shared between different kinds of stats. We don't need to duplicate it.



Agree, will provide a new patch version once [1] is committed.



Is there any reason to not replace the "double lookup" in
pgstat_fetch_stat_tabentry() with IsSharedRelation()?




Thanks for the suggestion!


This should probably be done in a preparatory commit.


Proposal submitted in [1].

[1]: 
https://www.postgresql.org/message-id/flat/2e4a0ae1-2696-9f0c-301c-2330e447133f%40gmail.com#e47bf5d2902121461b61ed47413628fc

Regards,

--
Bertrand Drouvot
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com




Re: Glossary and initdb definition work for "superuser" and database/cluster

2022-11-18 Thread Alvaro Herrera
On 2022-Nov-02, David G. Johnston wrote:

> Version 2 attached, some significant re-working.  Starting to think that
> initdb isn't the place for some of this content - in particular the stuff
> I'm deciding to move down to the Notes section.  Might consider moving some
> of it to the Server Setup and Operation chapter 19 - Creating Cluster (or
> nearby...) [1].
> 
> I settled on "cluster owner" over "cluster user" and made the terminology
> consistent throughout initdb and the glossary (haven't looked at chapter 19
> yet).  Also added it to the glossary.

Generally speaking, I like the idea of documenting these things.
However it sounds like you're not done with the wording and editing, so
I'm not committing the whole patch, but it seems a good starting point
to at least have some basic definitions.  So I've extracted them from
your patch and pushed those.  You can already see it at
https://www.postgresql.org/docs/devel/glossary.html

I left out almost all the material from the patch that's not in the
glossary proper, and also a few phrases in the glossary itself.  Some of
these sounded like security considerations rather than part of the
definitions.  I think we should have a separate chapter in Part III
(Server Administration) that explains many security aspects; right now
there's no hope of collecting a lot of very important advice in a single
place, so a wannabe admin has no chance of getting things right.  That
seems to me a serious deficiency.  A new chapter could provide a lot of
general advice on every aspect that needs to be considered, and link to
the reference section for additional details.  Maybe part of these
initdb considerations could be there, too.

> Moved quite a bit of material to notes from the description and options and
> expanded upon what had already been said based upon various discussions
> I've been part of on the mailing lists.

Please rebase.

-- 
Álvaro Herrera   48°01'N 7°57'E  —  https://www.EnterpriseDB.com/
"Always assume the user will do much worse than the stupidest thing
you can imagine."(Julien PUYDT)




Re: when the startup process doesn't (logging startup delays)

2022-11-18 Thread Bharath Rupireddy
On Fri, Nov 18, 2022 at 12:42 AM Robert Haas  wrote:
>
> On Thu, Nov 17, 2022 at 2:22 AM Bharath Rupireddy
>  wrote:
> > Duplication is a problem that I agree with and I have an idea here -
> > how about introducing a new function, say EnableStandbyMode() that
> > sets StandbyMode to true and disables the startup progress timeout,
> > something like the attached?
>
> That works for me, more or less. But I think that
> enable_startup_progress_timeout should be amended to either say if
> (log_startup_progress_interval == 0 || StandbyMode) return; or else it
> should at least Assert(!StandbyMode), so that we can't accidentally
> re-enable the timer after we shut it off.

Hm, an assertion may not help in typical production servers running on
non-assert builds. I've modified the if condition, please see the
attached v5 patch.

--
Bharath Rupireddy
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com
From 758d4cc81b1c7ed087969df07a958e92a374ee0f Mon Sep 17 00:00:00 2001
From: Bharath Rupireddy 
Date: Fri, 18 Nov 2022 09:53:15 +
Subject: [PATCH v5] Disable STARTUP_PROGRESS_TIMEOUT in standby mode

In standby mode, we actually don't report progress of recovery to
not bloat server logs as the standby is always in recovery unless
promoted. However, startup_progress_timeout_handler() gets called
every log_startup_progress_interval seconds, which is unnecessary.

Therefore, we disable the timeout in standby mode.

Reported-by: Thomas Munro
Authors: Bharath Rupireddy, Simon Riggs
Reviewed-by: Robert Haas
Backpatch-through: 15
Discussion: https://www.postgresql.org/message-id/CA%2BhUKGKCHSffAj8zZJKJvNX7ygnQFxVD6wm1d-2j3fVw%2BMafPQ%40mail.gmail.com
---
 src/backend/access/transam/xlogrecovery.c | 21 ---
 src/backend/postmaster/startup.c  | 33 +++
 src/include/postmaster/startup.h  |  2 ++
 3 files changed, 48 insertions(+), 8 deletions(-)

diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c
index cb07694aea..2feae1ebd3 100644
--- a/src/backend/access/transam/xlogrecovery.c
+++ b/src/backend/access/transam/xlogrecovery.c
@@ -386,6 +386,7 @@ static bool recoveryStopAfter;
 /* prototypes for local functions */
 static void ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI);
 
+static void EnableStandbyMode(void);
 static void readRecoverySignalFile(void);
 static void validateRecoveryParameters(void);
 static bool read_backup_label(XLogRecPtr *checkPointLoc,
@@ -470,6 +471,20 @@ XLogRecoveryShmemInit(void)
 	ConditionVariableInit(&XLogRecoveryCtl->recoveryNotPausedCV);
 }
 
+static void
+EnableStandbyMode(void)
+{
+	StandbyMode = true;
+
+	/*
+	 * To avoid server log bloat, we don't report recovery progress in a
+	 * standby as it will always be in recovery unless promoted. We disable
+	 * startup progress timeout in standby mode to avoid calling
+	 * startup_progress_timeout_handler() unnecessarily.
+	 */
+	disable_startup_progress_timeout();
+}
+
 /*
  * Prepare the system for WAL recovery, if needed.
  *
@@ -603,7 +618,7 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
 		 */
 		InArchiveRecovery = true;
 		if (StandbyModeRequested)
-			StandbyMode = true;
+			EnableStandbyMode();
 
 		/*
 		 * When a backup_label file is present, we want to roll forward from
@@ -740,7 +755,7 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
 		{
 			InArchiveRecovery = true;
 			if (StandbyModeRequested)
-StandbyMode = true;
+EnableStandbyMode();
 		}
 
 		/* Get the last valid checkpoint record. */
@@ -3115,7 +3130,7 @@ ReadRecord(XLogPrefetcher *xlogprefetcher, int emode,
 		(errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
 InArchiveRecovery = true;
 if (StandbyModeRequested)
-	StandbyMode = true;
+	EnableStandbyMode();
 
 SwitchIntoArchiveRecovery(xlogreader->EndRecPtr, replayTLI);
 minRecoveryPoint = xlogreader->EndRecPtr;
diff --git a/src/backend/postmaster/startup.c b/src/backend/postmaster/startup.c
index f99186eab7..4a46d2070c 100644
--- a/src/backend/postmaster/startup.c
+++ b/src/backend/postmaster/startup.c
@@ -314,20 +314,29 @@ startup_progress_timeout_handler(void)
 	startup_progress_timer_expired = true;
 }
 
+void
+disable_startup_progress_timeout(void)
+{
+	/* Quick exit if feature is disabled. */
+	if (log_startup_progress_interval == 0)
+		return;
+
+	disable_timeout(STARTUP_PROGRESS_TIMEOUT, false);
+	startup_progress_timer_expired = false;
+}
+
 /*
  * Set the start timestamp of the current operation and enable the timeout.
  */
 void
-begin_startup_progress_phase(void)
+enable_startup_progress_timeout(void)
 {
 	TimestampTz fin_time;
 
-	/* Feature is disabled. */
-	if (log_startup_progress_interval == 0)
+	/* Quick exit if feature is disabled or we're in standby mode */
+	if (log_startup_progress_interval == 0 || StandbyMode)
 	

Re: Avoid double lookup in pgstat_fetch_stat_tabentry()

2022-11-18 Thread Drouvot, Bertrand

Hi,

On 11/18/22 7:06 AM, Bharath Rupireddy wrote:

On Fri, Nov 18, 2022 at 10:32 AM Drouvot, Bertrand
 wrote:


Hi hackers,

Please find attached a patch proposal to avoid 2 calls to
pgstat_fetch_stat_tabentry_ext() in pgstat_fetch_stat_tabentry() in case
the relation is not a shared one and no statistics are found.

Thanks Andres for the suggestion done in [1].

[1]:
https://www.postgresql.org/message-id/20221116201202.3k74ajawyom2c3eq%40awork3.anarazel.de


+1. The patch LGTM. 


Thanks for looking at it!


However, I have a suggestion to simplify it
further by getting rid of the local variable tabentry and just
returning pgstat_fetch_stat_tabentry_ext(IsSharedRelation(relid),
relid);. Furthermore, the pgstat_fetch_stat_tabentry() can just be a
static inline function.
Good point. While at it, why not completely get rid of 
pgstat_fetch_stat_tabentry_ext(), like in v2 the attached?


Regards,

--
Bertrand Drouvot
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com

diff --git a/src/backend/postmaster/autovacuum.c 
b/src/backend/postmaster/autovacuum.c
index 601834d4b4..fde8458f5e 100644
--- a/src/backend/postmaster/autovacuum.c
+++ b/src/backend/postmaster/autovacuum.c
@@ -2101,8 +2101,8 @@ do_autovacuum(void)
 
/* Fetch reloptions and the pgstat entry for this table */
relopts = extract_autovac_opts(tuple, pg_class_desc);
-   tabentry = 
pgstat_fetch_stat_tabentry_ext(classForm->relisshared,
-   
  relid);
+   tabentry = pgstat_fetch_stat_tabentry(classForm->relisshared,
+   
  relid);
 
/* Check if it needs vacuum or analyze */
relation_needs_vacanalyze(relid, relopts, classForm, tabentry,
@@ -2185,8 +2185,8 @@ do_autovacuum(void)
}
 
/* Fetch the pgstat entry for this table */
-   tabentry = 
pgstat_fetch_stat_tabentry_ext(classForm->relisshared,
-   
  relid);
+   tabentry = pgstat_fetch_stat_tabentry(classForm->relisshared,
+   
  relid);
 
relation_needs_vacanalyze(relid, relopts, classForm, tabentry,
  
effective_multixact_freeze_max_age,
@@ -2913,8 +2913,8 @@ recheck_relation_needs_vacanalyze(Oid relid,
PgStat_StatTabEntry *tabentry;
 
/* fetch the pgstat table entry */
-   tabentry = pgstat_fetch_stat_tabentry_ext(classForm->relisshared,
-   
  relid);
+   tabentry = pgstat_fetch_stat_tabentry(classForm->relisshared,
+   
  relid);
 
relation_needs_vacanalyze(relid, avopts, classForm, tabentry,
  
effective_multixact_freeze_max_age,
diff --git a/src/backend/utils/activity/pgstat_relation.c 
b/src/backend/utils/activity/pgstat_relation.c
index 55a355f583..266fa0c15e 100644
--- a/src/backend/utils/activity/pgstat_relation.c
+++ b/src/backend/utils/activity/pgstat_relation.c
@@ -61,8 +61,8 @@ pgstat_copy_relation_stats(Relation dst, Relation src)
PgStatShared_Relation *dstshstats;
PgStat_EntryRef *dst_ref;
 
-   srcstats = pgstat_fetch_stat_tabentry_ext(src->rd_rel->relisshared,
-   
  RelationGetRelid(src));
+   srcstats = pgstat_fetch_stat_tabentry(src->rd_rel->relisshared,
+   
  RelationGetRelid(src));
if (!srcstats)
return;
 
@@ -435,27 +435,7 @@ pgstat_update_heap_dead_tuples(Relation rel, int delta)
  * caller is better off to report ZERO instead.
  */
 PgStat_StatTabEntry *
-pgstat_fetch_stat_tabentry(Oid relid)
-{
-   PgStat_StatTabEntry *tabentry;
-
-   tabentry = pgstat_fetch_stat_tabentry_ext(false, relid);
-   if (tabentry != NULL)
-   return tabentry;
-
-   /*
-* If we didn't find it, maybe it's a shared table.
-*/
-   tabentry = pgstat_fetch_stat_tabentry_ext(true, relid);
-   return tabentry;
-}
-
-/*
- * More efficient version of pgstat_fetch_stat_tabentry(), allowing to specify
- * whether the to-be-accessed table is a shared relation or not.
- */
-PgStat_StatTabEntry *
-pgstat_fetch_stat_tabentry_ext(bool shared, Oid reloid)
+pgstat_fetch_stat_tabentry(bool shared, Oid reloid)
 {
Oid dboid = (shared ? InvalidOid : MyDatabaseId);
 
diff --git a/

Re: Patch: Global Unique Index

2022-11-18 Thread Pavel Stehule
pá 18. 11. 2022 v 10:04 odesílatel Sergei Kornilov  napsal:

> Hello
> Do we need new syntax actually? I think that a global unique index can be
> created automatically instead of raising an error "unique constraint on
> partitioned table must include all partitioning columns"
>
+1

Pavel


> regards, Sergei
>
>
>


Re: Assertion failure with barriers in parallel hash join

2022-11-18 Thread David Geier

Thanks!

Please let me know if I can help out, e.g. with re-testing.

--
David Geier
(ServiceNow)

On 11/17/22 08:28, Thomas Munro wrote:

On Thu, Nov 17, 2022 at 8:01 PM David Geier  wrote:

Can we make progress with this patch in the current commit fest, or discuss 
what is still missing to bring this in?

Hi David,
Sorry for the delay.  I'll aim to get this done in the next few days.





Re:Patch: Global Unique Index

2022-11-18 Thread Sergei Kornilov
Hello
Do we need new syntax actually? I think that a global unique index can be 
created automatically instead of raising an error "unique constraint on 
partitioned table must include all partitioning columns"

regards, Sergei




Re: Reducing power consumption on idle servers

2022-11-18 Thread Simon Riggs
On Fri, 18 Nov 2022 at 08:55, Bharath Rupireddy
 wrote:

> However, I'm a bit
> worried about how it'll affect the tools/providers/extensions that
> depend on it.

Who is that? Which ones depend upon it?

-- 
Simon Riggs                http://www.EnterpriseDB.com/




Re: SUBTRANS: Minimizing calls to SubTransSetParent()

2022-11-18 Thread Simon Riggs
On Thu, 17 Nov 2022 at 20:29, Tomas Vondra
 wrote:
>
> On 11/17/22 18:29, Simon Riggs wrote:
> > On Thu, 17 Nov 2022 at 17:04, Simon Riggs  
> > wrote:
> >>
> > 003 includes the idea to not-always do SubTransSetParent()
> >
> I'm a bit confused by the TransactionIdsAreOnSameXactPage naming. Isn't
> this really checking clog pages?

Yes, clog page. I named it to match the new name of pg_xact

-- 
Simon Riggs                http://www.EnterpriseDB.com/




Re: Reducing power consumption on idle servers

2022-11-18 Thread Bharath Rupireddy
On Fri, Nov 18, 2022 at 2:08 AM Robert Haas  wrote:
>
> On Thu, Nov 17, 2022 at 2:55 AM Simon Riggs
>  wrote:
> > No, it will have a direct effect only on people using promote_trigger_file
> > who do not read and act upon the deprecation notice before upgrading
> > by making a one line change to their failover scripts.
>
> TBH, I wonder if we shouldn't just nuke promote_trigger_file.
> pg_promote was added in 2018, and pg_ctl promote was added in 2011. So
> even if we haven't said promote_trigger_file was deprecated, it hasn't
> been the state-of-the-art way of doing this in a really long time. If
> we think that there are still a lot of people using it, or if popular
> tools are relying on it, then perhaps a deprecation period is
> warranted anyway. But I think we should at least consider the
> possibility that it's OK to just get rid of it right away.

I agree with Robert here. It's better to do away with
promote_trigger_file, at least that's better than deprecating it now
and removing it somewhere in later releases. However, I'm a bit
worried about how it'll affect the tools/providers/extensions that
depend on it. And it turns out that this worry exists for every
feature that one thinks to get rid of.

If we go the route of doing away with promote_trigger_file, I think we
need to discuss it in a separate thread as the subject of this thread
doesn't match with what we're discussing here. This way, we'll get
thoughts from other hackers.

--
Bharath Rupireddy
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com




Re: Optimize join selectivity estimation by not reading MCV stats for unique join attributes

2022-11-18 Thread David Geier

Thanks everyone for the great feedback and suggestions.




Yes, it is.  Using zero flag would short-cut get_attstatsslot() to just
return whether the slot type exists without loading it.  Do you think we
need to emphasize this use case in the comments for 'flags'?

Perhaps, it's not really obvious now.


Comment added.



I wonder whether we need to also check statistic_proc_security_check()

when determining if MCVs exists in both sides.

Yeah, I thought about hoisting the statistic_proc_security_check
tests up into get_mcv_stats.  I don't think it's a great idea
though.  Again, it'd complicate untangling this if we ever
generalize the use of MCVs in this function.  Also, I don't
think we should be micro-optimizing the case where the security
check doesn't pass --- if it doesn't, you're going to be hurting
from bad plans a lot more than you are from some wasted cycles
here.


Sounds reasonable.

Attached is v2 of the patch.
This is basically Tom's version plus a comment for the flags of 
get_attstatslot() as suggested by Richard.


I couldn't come up with any reasonable way of writing an automated test 
for that.

Any ideas?

--
David Geier
(ServiceNow)
From 66b16792e8e16dad33adb5614a0936cc41002f4c Mon Sep 17 00:00:00 2001
From: David Geier 
Date: Fri, 18 Nov 2022 09:35:08 +0100
Subject: [PATCH] Don't read MCV stats needlessly in join selectivity
 estimation

Join selectivity estimation only uses MCV stats if they're present
on both join attributes. Therefore, if MCV stats are not available
on one of the two columns, skip reading MCV stats altogether.

Frequently encountered situations in which MCV stats not available
is unique columns or columns for which all values have roughly
the same frequency and therefore ANALYZE decided to not store them.
---
 src/backend/utils/adt/selfuncs.c| 15 +--
 src/backend/utils/cache/lsyscache.c |  4 
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index d597b7e81f..5baab4e3ac 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -2263,6 +2263,7 @@ eqjoinsel(PG_FUNCTION_ARGS)
 	bool		have_mcvs2 = false;
 	bool		join_is_reversed;
 	RelOptInfo *inner_rel;
+	bool		get_mcv_stats;
 
 	get_join_variables(root, args, sjinfo,
 	   &vardata1, &vardata2, &join_is_reversed);
@@ -2275,11 +2276,21 @@ eqjoinsel(PG_FUNCTION_ARGS)
 	memset(&sslot1, 0, sizeof(sslot1));
 	memset(&sslot2, 0, sizeof(sslot2));
 
+	/*
+	 * There is no use in fetching one side's MCVs if we lack MCVs for the
+	 * other side, so do a quick check to verify that both stats exist.
+	 */
+	get_mcv_stats = (HeapTupleIsValid(vardata1.statsTuple) &&
+			 HeapTupleIsValid(vardata2.statsTuple) &&
+			 get_attstatsslot(&sslot1, vardata1.statsTuple, STATISTIC_KIND_MCV, InvalidOid, 0) &&
+			 get_attstatsslot(&sslot2, vardata2.statsTuple, STATISTIC_KIND_MCV, InvalidOid, 0));
+
+
 	if (HeapTupleIsValid(vardata1.statsTuple))
 	{
 		/* note we allow use of nullfrac regardless of security check */
 		stats1 = (Form_pg_statistic) GETSTRUCT(vardata1.statsTuple);
-		if (statistic_proc_security_check(&vardata1, opfuncoid))
+		if (get_mcv_stats && statistic_proc_security_check(&vardata1, opfuncoid))
 			have_mcvs1 = get_attstatsslot(&sslot1, vardata1.statsTuple,
 		  STATISTIC_KIND_MCV, InvalidOid,
 		  ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS);
@@ -2289,7 +2300,7 @@ eqjoinsel(PG_FUNCTION_ARGS)
 	{
 		/* note we allow use of nullfrac regardless of security check */
 		stats2 = (Form_pg_statistic) GETSTRUCT(vardata2.statsTuple);
-		if (statistic_proc_security_check(&vardata2, opfuncoid))
+		if (get_mcv_stats && statistic_proc_security_check(&vardata2, opfuncoid))
 			have_mcvs2 = get_attstatsslot(&sslot2, vardata2.statsTuple,
 		  STATISTIC_KIND_MCV, InvalidOid,
 		  ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS);
diff --git a/src/backend/utils/cache/lsyscache.c b/src/backend/utils/cache/lsyscache.c
index a16a63f495..191f68f7b4 100644
--- a/src/backend/utils/cache/lsyscache.c
+++ b/src/backend/utils/cache/lsyscache.c
@@ -3157,6 +3157,10 @@ get_attavgwidth(Oid relid, AttrNumber attnum)
  * reqkind: STAKIND code for desired statistics slot kind.
  * reqop: STAOP value wanted, or InvalidOid if don't care.
  * flags: bitmask of ATTSTATSSLOT_VALUES and/or ATTSTATSSLOT_NUMBERS.
+ *Passing 0 will only check if the requested slot type exists but not
+ *extract any content. This is useful in cases where MCV stats are only
+ *useful if they exist on all involved columns (e.g. during join
+ *selectivity computation).
  *
  * If a matching slot is found, true is returned, and *sslot is filled thus:
  * staop: receives the actual STAOP value.
-- 
2.34.1



Re: Reducing power consumption on idle servers

2022-11-18 Thread Bharath Rupireddy
On Fri, Nov 18, 2022 at 6:29 AM Andres Freund  wrote:
>
> Hi,
>
> On 2022-11-17 13:06:23 +0530, Bharath Rupireddy wrote:
> > I understand. I know it's a bit hard to measure the power savings, I'm
> > wondering if there's any info, maybe not necessarily related to
> > postgres, but in general how much power gets saved if a certain number
> > of waits/polls/system calls are reduced.
>
> It's heavily hardware and hardware settings dependent.
>
> On some systems you can get an *approximate* idea of the power usage even
> while plugged in. On others you can only see it while on battery power.
>
> On systems with RAPL support (most Intel and I think newer AMD CPUs) you can
> query power usage with:
>   powerstat -R 1 (adding -D provides a bit more detail)
>
> But note that RAPL typically severely undercounts power usage because it
> doesn't cover a lot of sources of power usage (display, sometimes memory,
> all other peripherals).
>
> Sometimes powerstat -R -D can split power usage up more granularly,
> e.g. between the different CPU sockets and memory.
>
>
> On laptops you can often measure the discharge rate when not plugged in, with
> powerstat -d 0 1. But IME the latency till the values update makes it harder
> to interpret.
>
> On some workstations / servers you can read the power usage via ACPI. E.g. on
> my workstation 'sensors' shows a power_meter-acpi-0
>
>
> As an example of the difference it can make, here's the output of
>   powerstat -D 1
> on my laptop.
>
>   TimeUser  Nice   Sys  IdleIO  Run Ctxt/s  IRQ/s Fork Exec Exit  
> Watts
> 16:41:55   0.1   0.0   0.1  99.8   0.01403246101   
> 2.62
> 16:41:56   0.1   0.0   0.1  99.8   0.01357196101   
> 2.72
> 16:41:57   0.1   0.0   0.1  99.6   0.21510231404   
> 2.64
> 16:41:58   0.1   0.0   0.1  99.9   0.02   1350758   64   62   63   
> 4.06
> 16:41:59   0.3   0.0   1.0  98.7   0.02   4166   2406  244  243  244   
> 7.20
> 16:42:00   0.2   0.0   0.7  99.1   0.02   4203   2353  247  246  247   
> 7.21
> 16:42:01   0.5   0.0   1.6  98.0   0.02   4079   2395  240  239  240   
> 7.08
> 16:42:02   0.5   0.0   0.9  98.7   0.02   4097   2405  245  243  245   
> 7.20
> 16:42:03   0.4   0.0   1.3  98.3   0.02   4117   2311  243  242  243   
> 7.14
> 16:42:04   0.1   0.0   0.4  99.4   0.11   1721   1152   70   70   71   
> 4.48
> 16:42:05   0.1   0.0   0.2  99.8   0.01433250101   
> 2.92
> 16:42:06   0.0   0.0   0.3  99.7   0.01400231101   
> 2.66
>
> In the period of higher power etc usage I ran
> while true;do sleep 0.001;done
> and then interupted that after a bit with ctrl-c.
>
> Same thing on my workstation (:
>
>   TimeUser  Nice   Sys  IdleIO  Run Ctxt/s  IRQ/s Fork Exec Exit  
> Watts
> 16:43:48   1.0   0.0   0.2  98.7   0.11   8218   2354000  
> 46.43
> 16:43:49   1.1   0.0   0.3  98.7   0.01   7866   2477000  
> 45.99
> 16:43:50   1.1   0.0   0.4  98.5   0.02   7753   2996000  
> 48.93
> 16:43:51   0.8   0.0   1.7  97.5   0.01   9395   5285000  
> 75.48
> 16:43:52   0.5   0.0   1.7  97.8   0.01   9141   4806000  
> 75.30
> 16:43:53   1.1   0.0   1.8  97.1   0.02  10065   5504000  
> 76.27
> 16:43:54   1.3   0.0   1.5  97.2   0.01  10962   5165000  
> 76.33
> 16:43:55   0.9   0.0   0.8  98.3   0.01   8452   3939000  
> 61.99
> 16:43:56   0.6   0.0   0.1  99.3   0.02   6541   1999000  
> 40.92
> 16:43:57   0.9   0.0   0.2  98.9   0.02   8199   2477000  
> 42.91
>
>
> And if I query the power supply via ACPI instead:
>
> while true;do sensors power_meter-acpi-0|grep power1|awk '{print $2, 
> $3}';sleep 1;done
> ...
> 163.00 W
> 173.00 W
> 173.00 W
> 172.00 W
> 203.00 W
> 206.00 W
> 206.00 W
> 206.00 W
> 209.00 W
> 205.00 W
> 211.00 W
> 213.00 W
> 203.00 W
> 175.00 W
> 166.00 W
> ...
>
> As you can see the difference is quite substantial. This is solely due to a
> 1ms sleep loop (albeit one where we fork a process after each sleep, which
> likely is a good chunk of the CPU usage).
>
> However, the difference in power usage will be far smaller if the system
> already busy, because the CPU will already run at a high frequency.

Thanks a lot for sharing the details.

-- 
Bharath Rupireddy
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com




Re: Make message at end-of-recovery less scary.

2022-11-18 Thread Kyotaro Horiguchi
Just rebased.

regards.

-- 
Kyotaro Horiguchi
NTT Open Source Software Center
>From 1efe0601596807c25769370f38884c7027a00839 Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi 
Date: Tue, 15 Nov 2022 13:41:46 +0900
Subject: [PATCH v22] Make End-Of-Recovery error less scary

When recovery of any type ends, we see a somewhat scary error message
like "invalid record length" that suggests something serious is
happening. Actually, if recovery meets a record with length = 0, that
usually means it has finished applying all available WAL records.

Make this message less scary, as "reached end of WAL". Instead, raise
the error level for other kinds of WAL failure to WARNING.
---
 src/backend/access/transam/xlogreader.c   | 137 +-
 src/backend/access/transam/xlogrecovery.c |  94 +++
 src/backend/replication/walreceiver.c |   7 +-
 src/bin/pg_waldump/pg_waldump.c   |  13 +-
 src/include/access/xlogreader.h   |   1 +
 src/test/recovery/t/011_crash_recovery.pl | 106 +
 6 files changed, 299 insertions(+), 59 deletions(-)

diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c
index 93f667b254..f891a62944 100644
--- a/src/backend/access/transam/xlogreader.c
+++ b/src/backend/access/transam/xlogreader.c
@@ -48,6 +48,8 @@ static int	ReadPageInternal(XLogReaderState *state, XLogRecPtr pageptr,
 			 int reqLen);
 static void XLogReaderInvalReadState(XLogReaderState *state);
 static XLogPageReadResult XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking);
+static bool ValidXLogRecordLength(XLogReaderState *state, XLogRecPtr RecPtr,
+  XLogRecord *record);
 static bool ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr,
   XLogRecPtr PrevRecPtr, XLogRecord *record, bool randAccess);
 static bool ValidXLogRecord(XLogReaderState *state, XLogRecord *record,
@@ -149,6 +151,7 @@ XLogReaderAllocate(int wal_segment_size, const char *waldir,
 		pfree(state);
 		return NULL;
 	}
+	state->EndOfWAL = false;
 	state->errormsg_buf[0] = '\0';
 
 	/*
@@ -558,6 +561,7 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking)
 	/* reset error state */
 	state->errormsg_buf[0] = '\0';
 	decoded = NULL;
+	state->EndOfWAL = false;
 
 	state->abortedRecPtr = InvalidXLogRecPtr;
 	state->missingContrecPtr = InvalidXLogRecPtr;
@@ -640,25 +644,21 @@ restart:
 	Assert(pageHeaderSize <= readOff);
 
 	/*
-	 * Read the record length.
+	 * Validate the record header.
 	 *
-	 * NB: Even though we use an XLogRecord pointer here, the whole record
-	 * header might not fit on this page. xl_tot_len is the first field of the
-	 * struct, so it must be on this page (the records are MAXALIGNed), but we
-	 * cannot access any other fields until we've verified that we got the
-	 * whole header.
-	 */
-	record = (XLogRecord *) (state->readBuf + RecPtr % XLOG_BLCKSZ);
-	total_len = record->xl_tot_len;
-
-	/*
-	 * If the whole record header is on this page, validate it immediately.
-	 * Otherwise do just a basic sanity check on xl_tot_len, and validate the
-	 * rest of the header after reading it from the next page.  The xl_tot_len
+	 * Even though we use an XLogRecord pointer here, the whole record header
+	 * might not fit on this page.  If the whole record header is on this page,
+	 * validate it immediately.  Even otherwise xl_tot_len must be on this page
+	 * (it is the first field of MAXALIGNed records), but we still cannot
+	 * access any further fields until we've verified that we got the whole
+	 * header, so do just a basic sanity check on record length, and validate
+	 * the rest of the header after reading it from the next page.  The length
 	 * check is necessary here to ensure that we enter the "Need to reassemble
 	 * record" code path below; otherwise we might fail to apply
 	 * ValidXLogRecordHeader at all.
 	 */
+	record = (XLogRecord *) (state->readBuf + RecPtr % XLOG_BLCKSZ);
+
 	if (targetRecOff <= XLOG_BLCKSZ - SizeOfXLogRecord)
 	{
 		if (!ValidXLogRecordHeader(state, RecPtr, state->DecodeRecPtr, record,
@@ -668,18 +668,14 @@ restart:
 	}
 	else
 	{
-		/* XXX: more validation should be done here */
-		if (total_len < SizeOfXLogRecord)
-		{
-			report_invalid_record(state,
-  "invalid record length at %X/%X: wanted %u, got %u",
-  LSN_FORMAT_ARGS(RecPtr),
-  (uint32) SizeOfXLogRecord, total_len);
+		if (!ValidXLogRecordLength(state, RecPtr, record))
 			goto err;
-		}
+
 		gotheader = false;
 	}
 
+	total_len = record->xl_tot_len;
+
 	/*
 	 * Find space to decode this record.  Don't allow oversized allocation if
 	 * the caller requested nonblocking.  Otherwise, we *have* to try to
@@ -1105,6 +1101,60 @@ XLogReaderInvalReadState(XLogReaderState *state)
 	state->readLen = 0;
 }
 
+/*
+ * Validate record length of an XLOG record header.
+ *
+ * This is substantially a part of ValidXLogRecordHeader.  But XLogReadRecord
+ * needs this separate from the function in case of a partial rec

Re: In-place persistence change of a relation

2022-11-18 Thread Kyotaro Horiguchi
At Tue, 08 Nov 2022 11:33:53 +0900 (JST), Kyotaro Horiguchi 
 wrote in 
> Indeed, thanks!  I'll do that in a few days.

Got too late, but rebased.. The contents of the two patches in the
last version was a bit shuffled but they are fixed.

regards.

-- 
Kyotaro Horiguchi
NTT Open Source Software Center
>From b8bcd1e9dc7c52c277de7f13bc21900efd2030dc Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi 
Date: Tue, 19 Jul 2022 13:23:01 +0900
Subject: [PATCH v25 1/2] In-place table persistence change

Even though ALTER TABLE SET LOGGED/UNLOGGED does not require data
rewriting, it currently runs a heap rewrite which causes a large
amount of file I/O.  This patch makes the command run without a heap
rewrite.  In addition to that, SET LOGGED while wal_level > minimal
emits WAL using XLOG_FPI instead of a massive number of HEAP_INSERTs,
which should be smaller.

Also, this allows for the cleanup of files left behind by a crash of
the transaction that created them.
---
 src/backend/access/rmgrdesc/smgrdesc.c|  49 ++
 src/backend/access/transam/README |  10 +
 src/backend/access/transam/xact.c |   7 +
 src/backend/access/transam/xlogrecovery.c |  18 +
 src/backend/backup/basebackup.c   |   9 +-
 src/backend/catalog/storage.c | 559 +-
 src/backend/commands/tablecmds.c  | 267 +--
 src/backend/storage/buffer/bufmgr.c   |  87 
 src/backend/storage/file/fd.c |   4 +-
 src/backend/storage/file/reinit.c | 340 +
 src/backend/storage/smgr/md.c |  95 +++-
 src/backend/storage/smgr/smgr.c   |  32 ++
 src/backend/storage/sync/sync.c   |  21 +-
 src/bin/pg_rewind/parsexlog.c |  22 +
 src/bin/pg_rewind/pg_rewind.c |   1 -
 src/common/relpath.c  |  47 +-
 src/include/catalog/storage.h |   3 +
 src/include/catalog/storage_xlog.h|  43 +-
 src/include/common/relpath.h  |   9 +-
 src/include/storage/bufmgr.h  |   2 +
 src/include/storage/fd.h  |   1 +
 src/include/storage/md.h  |   8 +-
 src/include/storage/reinit.h  |   8 +-
 src/include/storage/smgr.h|  17 +
 src/tools/pgindent/typedefs.list  |   7 +
 25 files changed, 1483 insertions(+), 183 deletions(-)

diff --git a/src/backend/access/rmgrdesc/smgrdesc.c b/src/backend/access/rmgrdesc/smgrdesc.c
index e0ee8a078a..2f92c06f70 100644
--- a/src/backend/access/rmgrdesc/smgrdesc.c
+++ b/src/backend/access/rmgrdesc/smgrdesc.c
@@ -40,6 +40,46 @@ smgr_desc(StringInfo buf, XLogReaderState *record)
 		 xlrec->blkno, xlrec->flags);
 		pfree(path);
 	}
+	else if (info == XLOG_SMGR_UNLINK)
+	{
+		xl_smgr_unlink *xlrec = (xl_smgr_unlink *) rec;
+		char	   *path = relpathperm(xlrec->rlocator, xlrec->forkNum);
+
+		appendStringInfoString(buf, path);
+		pfree(path);
+	}
+	else if (info == XLOG_SMGR_MARK)
+	{
+		xl_smgr_mark *xlrec = (xl_smgr_mark *) rec;
+		char	   *path = GetRelationPath(xlrec->rlocator.dbOid,
+		   xlrec->rlocator.spcOid,
+		   xlrec->rlocator.relNumber,
+		   InvalidBackendId,
+		   xlrec->forkNum, xlrec->mark);
+		char	   *action = "";
+
+		switch (xlrec->action)
+		{
+			case XLOG_SMGR_MARK_CREATE:
+action = "CREATE";
+break;
+			case XLOG_SMGR_MARK_UNLINK:
+action = "DELETE";
+break;
+		}
+
+		appendStringInfo(buf, "%s %s", action, path);
+		pfree(path);
+	}
+	else if (info == XLOG_SMGR_BUFPERSISTENCE)
+	{
+		xl_smgr_bufpersistence *xlrec = (xl_smgr_bufpersistence *) rec;
+		char	   *path = relpathperm(xlrec->rlocator, MAIN_FORKNUM);
+
+		appendStringInfoString(buf, path);
+		appendStringInfo(buf, " persistence %d", xlrec->persistence);
+		pfree(path);
+	}
 }
 
 const char *
@@ -55,6 +95,15 @@ smgr_identify(uint8 info)
 		case XLOG_SMGR_TRUNCATE:
 			id = "TRUNCATE";
 			break;
+		case XLOG_SMGR_UNLINK:
+			id = "UNLINK";
+			break;
+		case XLOG_SMGR_MARK:
+			id = "MARK";
+			break;
+		case XLOG_SMGR_BUFPERSISTENCE:
+			id = "BUFPERSISTENCE";
+			break;
 	}
 
 	return id;
diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README
index 22c8ae9755..617a63e2c5 100644
--- a/src/backend/access/transam/README
+++ b/src/backend/access/transam/README
@@ -741,6 +741,16 @@ we must panic and abort recovery.  The DBA will have to manually clean up
 then restart recovery.  This is part of the reason for not writing a WAL
 entry until we've successfully done the original action.
 
+
+The Smgr MARK files
+
+
+An smgr mark file is an empty file that is created when a new relation
+storage file is created, to signal that the storage file needs to be
+cleaned up at recovery time.  In contrast to the four actions above,
+failure to remove an smgr mark file would lead to data loss, in which
+case the server shuts down.
+
 
 Skipping WAL for New RelFileLocator
 -