Hi,

On 2018-05-27 13:00:06 -0700, Andres Freund wrote:
> I've a patch that seems to work, that mostly needs some comment
> polishing.

Attached is what I currently have. Still needs some more work, but I
think it's more than good enough to review the approach.  Basically the
approach consists out of two changes:

1) Send init file removals for shared nailed relations as well.

   This fixes that the shared init file contains arbitrarily outdated
   information for relfrozenxid etc. Leading to e.g. the pg_authid
   errors we've seen in some recent threads.  Only applies to
   new connections.

2) Reread RelationData->rd_rel for nailed relations when invalidated.

   This ensures that already built relcache entries for nailed relations
   are updated. Currently they never are.  This currently doesn't cause
   *that* frequently an issue for !shared entries, because for those the
   init file gets zapped regularly, and autovacuum workers usually don't
   live that long.  But it's still a significant correctness issue for
   both shared an non shared relations.

FWIW, I wonder if this isn't critical enough to make us consider having
a point release earlier..

Greetings,

Andres Freund
>From c7121eec1e76b6f9b2cfa5d7d7f5f4b7a4f8be96 Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Mon, 28 May 2018 12:38:22 -0700
Subject: [PATCH] WIP: Ensure relcache entries for nailed relations are
 accurate.

Author: Andres Freund
Reviewed-By:
Discussion:
    https://postgr.es/m/20180525203736.crkbg36muzxrj...@alap3.anarazel.de
    https://postgr.es/m/CAMa1XUhKSJd98JW4o9StWPrfS=11bpgg+_gdmxe25tvuy4s...@mail.gmail.com
    https://postgr.es/m/cakmfjucqbuodrfxpdx39wha3vjyxwerg_zdvxzncr6+5wog...@mail.gmail.com
    https://postgr.es/m/cagewt-ujgpmlq09gxcufmzazsgjc98vxhefbf-tppb0fb13...@mail.gmail.com
Backpatch:
---
 src/backend/utils/cache/inval.c    |  31 +++--
 src/backend/utils/cache/relcache.c | 198 ++++++++++++++++++++---------
 src/include/storage/standbydefs.h  |   2 +-
 3 files changed, 156 insertions(+), 75 deletions(-)

diff --git a/src/backend/utils/cache/inval.c b/src/backend/utils/cache/inval.c
index 955f5c7fdc5..0d6100fb082 100644
--- a/src/backend/utils/cache/inval.c
+++ b/src/backend/utils/cache/inval.c
@@ -521,12 +521,11 @@ RegisterRelcacheInvalidation(Oid dbId, Oid relId)
 	(void) GetCurrentCommandId(true);
 
 	/*
-	 * If the relation being invalidated is one of those cached in the local
-	 * relcache init file, mark that we need to zap that file at commit. Same
-	 * is true when we are invalidating whole relcache.
+	 * If the relation being invalidated is one of those cached in a relcache
+	 * init file, mark that we need to zap that file at commit. Same is true
+	 * when we are invalidating whole relcache.
 	 */
-	if (OidIsValid(dbId) &&
-		(RelationIdIsInInitFile(relId) || relId == InvalidOid))
+	if (relId == InvalidOid || RelationIdIsInInitFile(relId))
 		transInvalInfo->RelcacheInitFileInval = true;
 }
 
@@ -881,18 +880,26 @@ ProcessCommittedInvalidationMessages(SharedInvalidationMessage *msgs,
 
 	if (RelcacheInitFileInval)
 	{
+		elog(trace_recovery(DEBUG4), "removing relcache init files for database %u",
+			 dbid);
+
 		/*
-		 * RelationCacheInitFilePreInvalidate requires DatabasePath to be set,
-		 * but we should not use SetDatabasePath during recovery, since it is
+		 * RelationCacheInitFilePreInvalidate, when the invalidation message
+		 * is for a specific database, requires DatabasePath to be set, but we
+		 * should not use SetDatabasePath during recovery, since it is
 		 * intended to be used only once by normal backends.  Hence, a quick
 		 * hack: set DatabasePath directly then unset after use.
 		 */
-		DatabasePath = GetDatabasePath(dbid, tsid);
-		elog(trace_recovery(DEBUG4), "removing relcache init file in \"%s\"",
-			 DatabasePath);
+		if (OidIsValid(dbid))
+			DatabasePath = GetDatabasePath(dbid, tsid);
+
 		RelationCacheInitFilePreInvalidate();
-		pfree(DatabasePath);
-		DatabasePath = NULL;
+
+		if (OidIsValid(dbid))
+		{
+			pfree(DatabasePath);
+			DatabasePath = NULL;
+		}
 	}
 
 	SendSharedInvalidMessages(msgs, nmsgs);
diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c
index 3dfb1b8fbe2..aa4427724d5 100644
--- a/src/backend/utils/cache/relcache.c
+++ b/src/backend/utils/cache/relcache.c
@@ -250,6 +250,7 @@ static void RelationDestroyRelation(Relation relation, bool remember_tupdesc);
 static void RelationClearRelation(Relation relation, bool rebuild);
 
 static void RelationReloadIndexInfo(Relation relation);
+static void RelationReloadNailed(Relation relation);
 static void RelationFlushRelation(Relation relation);
 static void RememberToFreeTupleDescAtEOX(TupleDesc td);
 static void AtEOXact_cleanup(Relation relation, bool isCommit);
@@ -286,7 +287,7 @@ static void IndexSupportInitialize(oidvector *indclass,
 static OpClassCacheEnt *LookupOpclassInfo(Oid operatorClassOid,
 				  StrategyNumber numSupport);
 static void RelationCacheInitFileRemoveInDir(const char *tblspcpath);
-static void unlink_initfile(const char *initfilename);
+static void unlink_initfile(const char *initfilename, int elevel);
 static bool equalPartitionDescs(PartitionKey key, PartitionDesc partdesc1,
 					PartitionDesc partdesc2);
 
@@ -1931,7 +1932,16 @@ RelationIdGetRelation(Oid relationId)
 				RelationReloadIndexInfo(rd);
 			else
 				RelationClearRelation(rd, true);
-			Assert(rd->rd_isvalid);
+
+			/*
+			 * Normally entries need to be valid here, but before the relcache
+			 * has been initialized, not enough infrastructure exists to
+			 * perform pg_class lookups. The structure of such entries doesn't
+			 * change, but we still want to update the rd_rel entry. So
+			 * rd_isvalid = false is left in place for a later lookup.
+			 */
+			Assert(rd->rd_isvalid ||
+				   (rd->rd_isnailed && !criticalRelcachesBuilt));
 		}
 		return rd;
 	}
@@ -2135,6 +2145,92 @@ RelationReloadIndexInfo(Relation relation)
 	relation->rd_isvalid = true;
 }
 
+/*
+ * RelationReloadNailed - reload minimal information for nailed relations.
+ *
+ * The structure of a nailed relation can never change (which is good, because
+ * we rely on knowing their structure to be able to read catalog content). But
+ * some parts, e.g. pg_class.relfrozenxid, are still important to have
+ * accurate content for. Therefore those need to be reloaded upon arrival of
+ * invalidations.
+ */
+static void
+RelationReloadNailed(Relation relation)
+{
+	Assert(relation->rd_isnailed);
+
+	/*
+	 * Redo RelationInitPhysicalAddr in case it is a mapped relation whose
+	 * mapping changed.
+	 */
+	RelationInitPhysicalAddr(relation);
+
+	if (relation->rd_rel->relkind == RELKIND_INDEX)
+	{
+		/*
+		 * If it's a nailed-but-not-mapped index, then we need to re-read the
+		 * pg_class row to see if its relfilenode changed. We do that immediately
+		 * if we're inside a valid transaction and the relation is open (not
+		 * counting the nailed refcount).  Otherwise just mark the entry as
+		 * possibly invalid, and it'll be fixed when next opened.
+		 */
+		relation->rd_isvalid = false;	/* needs to be revalidated */
+
+		if (relation->rd_refcnt > 1 && IsTransactionState())
+			RelationReloadIndexInfo(relation);
+	}
+	else
+	{
+		/*
+		 * If the relcache infrastructure has been bootstrapped, and we're in
+		 * a transaction, reload the pg_class part of the relcache entry. We
+		 * can't easily do so if relcaches aren't yet built, but that's fine
+		 * because at that stage the to-be-reloaded attributes (like
+		 * relfrozenxid) aren't accessed.  To ensure the entry will later be
+		 * revalidated, we leave it in invalid state, but allow use
+		 * (cf. RelationIdGetRelation()).
+		 *
+		 * We do that immediately if we're inside a valid transaction and the
+		 * relation is open (not counting the nailed refcount). Otherwise just
+		 * mark the entry as possibly invalid, and it'll be fixed when next
+		 * opened.
+		 */
+		if (criticalRelcachesBuilt && relation->rd_refcnt > 1 && IsTransactionState())
+		{
+			HeapTuple	pg_class_tuple;
+			Form_pg_class relp;
+
+			/*
+			 * NB: Mark the entry as valid before starting to scan, to avoid
+			 * self-recursion when looking up pg_class.
+			 */
+			relation->rd_isvalid = true;
+
+			pg_class_tuple = ScanPgRelation(RelationGetRelid(relation),
+											true,
+											false);
+			relp = (Form_pg_class) GETSTRUCT(pg_class_tuple);
+			memcpy(relation->rd_rel, relp, CLASS_TUPLE_SIZE);
+			heap_freetuple(pg_class_tuple);
+
+			/*
+			 * Again mark as valid, to protect against concurrently arriving
+			 * invalidations.
+			 */
+			relation->rd_isvalid = true;
+		}
+		else
+		{
+			/*
+			 * If currently the entry can't be updated, leave it as invalid. Note
+			 * that we'll allow access to nailed entries marked invalid, as the
+			 * structure didn't change.
+			 */
+			relation->rd_isvalid = false;
+		}
+	}
+}
+
 /*
  * RelationDestroyRelation
  *
@@ -2250,27 +2346,12 @@ RelationClearRelation(Relation relation, bool rebuild)
 	RelationCloseSmgr(relation);
 
 	/*
-	 * Never, never ever blow away a nailed-in system relation, because we'd
-	 * be unable to recover.  However, we must redo RelationInitPhysicalAddr
-	 * in case it is a mapped relation whose mapping changed.
-	 *
-	 * If it's a nailed-but-not-mapped index, then we need to re-read the
-	 * pg_class row to see if its relfilenode changed. We do that immediately
-	 * if we're inside a valid transaction and the relation is open (not
-	 * counting the nailed refcount).  Otherwise just mark the entry as
-	 * possibly invalid, and it'll be fixed when next opened.
+	 * Treat nailed-in system relations separately, they always need to be
+	 * accessible, so we can't blow them away.
 	 */
 	if (relation->rd_isnailed)
 	{
-		RelationInitPhysicalAddr(relation);
-
-		if (relation->rd_rel->relkind == RELKIND_INDEX ||
-			relation->rd_rel->relkind == RELKIND_PARTITIONED_INDEX)
-		{
-			relation->rd_isvalid = false;	/* needs to be revalidated */
-			if (relation->rd_refcnt > 1 && IsTransactionState())
-				RelationReloadIndexInfo(relation);
-		}
+		RelationReloadNailed(relation);
 		return;
 	}
 
@@ -5907,24 +5988,26 @@ write_item(const void *data, Size len, FILE *fp)
 
 /*
  * Determine whether a given relation (identified by OID) is one of the ones
- * we should store in the local relcache init file.
+ * we should store in a relcache init file.
  *
  * We must cache all nailed rels, and for efficiency we should cache every rel
  * that supports a syscache.  The former set is almost but not quite a subset
- * of the latter.  Currently, we must special-case TriggerRelidNameIndexId,
- * which RelationCacheInitializePhase3 chooses to nail for efficiency reasons,
- * but which does not support any syscache.
- *
- * Note: this function is currently never called for shared rels.  If it were,
- * we'd probably also need a special case for DatabaseNameIndexId, which is
- * critical but does not support a syscache.
+ * of the latter. The special cases are relations where
+ * RelationCacheInitializePhase2/3 chooses to nail for efficiency reasons, but
+ * which do not support any syscache.
  */
 bool
 RelationIdIsInInitFile(Oid relationId)
 {
-	if (relationId == TriggerRelidNameIndexId)
+	if (relationId == SharedSecLabelRelationId ||
+		relationId == TriggerRelidNameIndexId ||
+		relationId == DatabaseNameIndexId ||
+		relationId == SharedSecLabelObjectIndexId)
 	{
-		/* If this Assert fails, we don't need this special case anymore. */
+		/*
+		 * If this Assert fails, we don't need the applicable special case
+		 * anymore.
+		 */
 		Assert(!RelationSupportsSysCache(relationId));
 		return true;
 	}
@@ -5994,38 +6077,30 @@ RelationHasUnloggedIndex(Relation rel)
  * We take the lock and do the unlink in RelationCacheInitFilePreInvalidate,
  * then release the lock in RelationCacheInitFilePostInvalidate.  Caller must
  * send any pending SI messages between those calls.
- *
- * Notice this deals only with the local init file, not the shared init file.
- * The reason is that there can never be a "significant" change to the
- * relcache entry of a shared relation; the most that could happen is
- * updates of noncritical fields such as relpages/reltuples.  So, while
- * it's worth updating the shared init file from time to time, it can never
- * be invalid enough to make it necessary to remove it.
  */
 void
 RelationCacheInitFilePreInvalidate(void)
 {
-	char		initfilename[MAXPGPATH];
+	char		localinitfname[MAXPGPATH];
+	char		sharedinitfname[MAXPGPATH];
 
-	snprintf(initfilename, sizeof(initfilename), "%s/%s",
-			 DatabasePath, RELCACHE_INIT_FILENAME);
+	if (DatabasePath)
+		snprintf(localinitfname, sizeof(localinitfname), "%s/%s",
+				 DatabasePath, RELCACHE_INIT_FILENAME);
+	snprintf(sharedinitfname, sizeof(sharedinitfname), "global/%s",
+			 RELCACHE_INIT_FILENAME);
 
 	LWLockAcquire(RelCacheInitLock, LW_EXCLUSIVE);
 
-	if (unlink(initfilename) < 0)
-	{
-		/*
-		 * The file might not be there if no backend has been started since
-		 * the last removal.  But complain about failures other than ENOENT.
-		 * Fortunately, it's not too late to abort the transaction if we can't
-		 * get rid of the would-be-obsolete init file.
-		 */
-		if (errno != ENOENT)
-			ereport(ERROR,
-					(errcode_for_file_access(),
-					 errmsg("could not remove cache file \"%s\": %m",
-							initfilename)));
-	}
+	/*
+	 * The files might not be there if no backend has been started since the
+	 * last removal.  But complain about failures other than ENOENT with
+	 * ERROR.  Fortunately, it's not too late to abort the transaction if we
+	 * can't get rid of the would-be-obsolete init file.
+	 */
+	if (DatabasePath)
+		unlink_initfile(localinitfname, ERROR);
+	unlink_initfile(sharedinitfname, ERROR);
 }
 
 void
@@ -6051,13 +6126,9 @@ RelationCacheInitFileRemove(void)
 	struct dirent *de;
 	char		path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY)];
 
-	/*
-	 * We zap the shared cache file too.  In theory it can't get out of sync
-	 * enough to be a problem, but in data-corruption cases, who knows ...
-	 */
 	snprintf(path, sizeof(path), "global/%s",
 			 RELCACHE_INIT_FILENAME);
-	unlink_initfile(path);
+	unlink_initfile(path, LOG);
 
 	/* Scan everything in the default tablespace */
 	RelationCacheInitFileRemoveInDir("base");
@@ -6097,7 +6168,7 @@ RelationCacheInitFileRemoveInDir(const char *tblspcpath)
 			/* Try to remove the init file in each database */
 			snprintf(initfilename, sizeof(initfilename), "%s/%s/%s",
 					 tblspcpath, de->d_name, RELCACHE_INIT_FILENAME);
-			unlink_initfile(initfilename);
+			unlink_initfile(initfilename, LOG);
 		}
 	}
 
@@ -6105,12 +6176,15 @@ RelationCacheInitFileRemoveInDir(const char *tblspcpath)
 }
 
 static void
-unlink_initfile(const char *initfilename)
+unlink_initfile(const char *initfilename, int elevel)
 {
 	if (unlink(initfilename) < 0)
 	{
 		/* It might not be there, but log any error other than ENOENT */
 		if (errno != ENOENT)
-			elog(LOG, "could not remove cache file \"%s\": %m", initfilename);
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("could not remove cache file \"%s\": %m",
+							initfilename)));
 	}
 }
diff --git a/src/include/storage/standbydefs.h b/src/include/storage/standbydefs.h
index bb6144805a7..17b74ca3b61 100644
--- a/src/include/storage/standbydefs.h
+++ b/src/include/storage/standbydefs.h
@@ -64,7 +64,7 @@ typedef struct xl_invalidations
 {
 	Oid			dbId;			/* MyDatabaseId */
 	Oid			tsId;			/* MyDatabaseTableSpace */
-	bool		relcacheInitFileInval;	/* invalidate relcache init file */
+	bool		relcacheInitFileInval;	/* invalidate relcache init files */
 	int			nmsgs;			/* number of shared inval msgs */
 	SharedInvalidationMessage msgs[FLEXIBLE_ARRAY_MEMBER];
 } xl_invalidations;
-- 
2.17.0.rc1.dirty

Reply via email to