On 2013-11-07 00:30:55 +0100, Andres Freund wrote:
> On 2013-11-07 00:17:34 +0100, Andres Freund wrote:
> > On 2013-11-06 17:00:40 -0300, Alvaro Herrera wrote:
> > > Kevin Grittner wrote:
> > > 
> > > > That makes for a pretty simple test for git bisect, even if
> > > > everything including initdb is painfully slow with
> > > > CLOBBER_CACHE_ALWAYS.
> > > 
> > > Most likely culprit is f01d1ae3a104019d6d68aeff85c4816a275130b3
> > 
> > Well, that test tests the functionality added in that commit, so sure,
> > it can't be before that. What confuses me is that relfilenodemap has
> > survived quite some CLOBBER_CACHE_ALWAYS runs in the buildfarm since:
> > http://buildfarm.postgresql.org/cgi-bin/show_history.pl?nm=jaguarundi&br=HEAD
> > 
> > Did you compile with CLOBBER_CACHE_ALWAYS or CLOBBER_CACHE_RECURSIVELY?
> 
> Either way, the code is completely and utterly broken in the face of
> cache invalidations that are received when it does its internal
> heap_open(RelationRelationId). I can't have been thinking straight when
> I wrote the invalidation logic.

Ok, I've attached a fix for this, which unfortunately is not all that
small.
Could either of you commit it?

Greetings,

Andres Freund

-- 
 Andres Freund                     http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training & Services
>From ac8973aec6642810f11aff2c4d1c3079b1569f7f Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Thu, 7 Nov 2013 10:17:28 +0100
Subject: [PATCH] Fix fundamental issues with relfilenodemap's cache
 invalidation logic.

relfilenodemap.c's invalidation callback used to simply delete its
entire hash when a cache reset message arrived and it used to enter a
provisional cache entry into the hash before doing the lookups in
pg_class.
Both is a bad idea because the heap_open() in RelidByRelfilenode() can
cause cache invalidations to be received which could cause us to
access and return free'd memory.

Also fix the possible, but unlikely, scenario where a non-FATAL ERROR
occurs after the hash_create() in InitializeRelfilenodeMap() leaving
RelfilenodeMap in a partially initialized state.

Found by Kevin Grittner.
---
 src/backend/utils/cache/relfilenodemap.c | 189 ++++++++++++++++---------------
 1 file changed, 99 insertions(+), 90 deletions(-)

diff --git a/src/backend/utils/cache/relfilenodemap.c b/src/backend/utils/cache/relfilenodemap.c
index f3f9a09..d3d77dd 100644
--- a/src/backend/utils/cache/relfilenodemap.c
+++ b/src/backend/utils/cache/relfilenodemap.c
@@ -43,8 +43,8 @@ typedef struct
 
 typedef struct
 {
-	RelfilenodeMapKey key;	/* lookup key - must be first */
-	Oid			relid;			/* pg_class.oid */
+	RelfilenodeMapKey 	key;	/* lookup key - must be first */
+	Oid					relid;	/* pg_class.oid */
 } RelfilenodeMapEntry;
 
 /*
@@ -54,26 +54,24 @@ typedef struct
 static void
 RelfilenodeMapInvalidateCallback(Datum arg, Oid relid)
 {
-	HASH_SEQ_STATUS status;
+	HASH_SEQ_STATUS	status;
 	RelfilenodeMapEntry *entry;
 
 	/* nothing to do if not active or deleted */
 	if (RelfilenodeMapHash == NULL)
 		return;
 
-	/* if relid is InvalidOid, we must invalidate the entire cache */
-	if (relid == InvalidOid)
-	{
-		hash_destroy(RelfilenodeMapHash);
-		RelfilenodeMapHash = NULL;
-		return;
-	}
-
 	hash_seq_init(&status, RelfilenodeMapHash);
 	while ((entry = (RelfilenodeMapEntry *) hash_seq_search(&status)) != NULL)
 	{
-		/* Same OID may occur in more than one tablespace. */
-		if (entry->relid == relid)
+		/*
+		 * If relid is InvalidOid, signalling a complete reset, we
+		 * must remove all entries, otherwise just remove the specific
+		 * relation's entry. Always remove negative cache entries.
+		 */
+		if (relid == InvalidOid ||			/* complete reset */
+		    entry->relid == InvalidOid ||	/* negative cache entry */
+		    entry->relid == relid)			/* individual flushed relation */
 		{
 			if (hash_search(RelfilenodeMapHash,
 							(void *) &entry->key,
@@ -92,32 +90,12 @@ static void
 InitializeRelfilenodeMap(void)
 {
 	HASHCTL		ctl;
-	static bool	initial_init_done = false;
-	int i;
+	int			i;
 
 	/* Make sure we've initialized CacheMemoryContext. */
 	if (CacheMemoryContext == NULL)
 		CreateCacheMemoryContext();
 
-	/* Initialize the hash table. */
-	MemSet(&ctl, 0, sizeof(ctl));
-	ctl.keysize = sizeof(RelfilenodeMapKey);
-	ctl.entrysize = sizeof(RelfilenodeMapEntry);
-	ctl.hash = tag_hash;
-	ctl.hcxt = CacheMemoryContext;
-
-	RelfilenodeMapHash =
-		hash_create("RelfilenodeMap cache", 1024, &ctl,
-					HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
-
-	/*
-	 * For complete resets we simply delete the entire hash, but there's no
-	 * need to do the other stuff multiple times. Especially the initialization
-	 * of the relcche invalidation should only be done once.
-	 */
-	if (initial_init_done)
-		return;
-
 	/* build skey */
 	MemSet(&relfilenode_skey, 0, sizeof(relfilenode_skey));
 
@@ -134,10 +112,25 @@ InitializeRelfilenodeMap(void)
 	relfilenode_skey[0].sk_attno = Anum_pg_class_reltablespace;
 	relfilenode_skey[1].sk_attno = Anum_pg_class_relfilenode;
 
+	/* Initialize the hash table. */
+	MemSet(&ctl, 0, sizeof(ctl));
+	ctl.keysize = sizeof(RelfilenodeMapKey);
+	ctl.entrysize = sizeof(RelfilenodeMapEntry);
+	ctl.hash = tag_hash;
+	ctl.hcxt = CacheMemoryContext;
+
+	/*
+	 * Only create the RelfilenodeMapHash now, so we don't end up
+	 * partially initialized when fmgr_info_cxt() above ERRORs out
+	 * with an out of memory error.
+	 */
+	RelfilenodeMapHash =
+		hash_create("RelfilenodeMap cache", 1024, &ctl,
+					HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
+
 	/* Watch for invalidation events. */
 	CacheRegisterRelcacheCallback(RelfilenodeMapInvalidateCallback,
 								  (Datum) 0);
-	initial_init_done = true;
 }
 
 /*
@@ -156,6 +149,7 @@ RelidByRelfilenode(Oid reltablespace, Oid relfilenode)
 	Relation relation;
 	HeapTuple ntp;
 	ScanKeyData skey[2];
+	Oid relid;
 
 	if (RelfilenodeMapHash == NULL)
 		InitializeRelfilenodeMap();
@@ -169,82 +163,97 @@ RelidByRelfilenode(Oid reltablespace, Oid relfilenode)
 	key.relfilenode = relfilenode;
 
 	/*
-	 * Check cache and enter entry if nothing could be found. Even if no target
-	 * relation can be found later on we store the negative match and return a
-	 * InvalidOid from cache. That's not really necessary for performance since
-	 * querying invalid values isn't supposed to be a frequent thing, but the
-	 * implementation is simpler this way.
+	 * Check cache and return entry if one is found. Even if no target
+	 * relation can be found later on we store the negative match and
+	 * return a InvalidOid from cache. That's not really necessary for
+	 * performance since querying invalid values isn't supposed to be
+	 * a frequent thing, but it's basically free.
 	 */
-	entry = hash_search(RelfilenodeMapHash, (void *) &key, HASH_ENTER, &found);
+	entry = hash_search(RelfilenodeMapHash, (void *) &key, HASH_FIND, &found);
 
 	if (found)
 		return entry->relid;
 
-	/* initialize empty/negative cache entry before doing the actual lookup */
-	entry->relid = InvalidOid;
-
 	/* ok, no previous cache entry, do it the hard way */
 
+	/* initialize empty/negative cache entry before doing the actual lookups */
+	relid = InvalidOid;
+
 	/* check shared tables */
 	if (reltablespace == GLOBALTABLESPACE_OID)
 	{
-		entry->relid = RelationMapFilenodeToOid(relfilenode, true);
-		return entry->relid;
+		relid = RelationMapFilenodeToOid(relfilenode, true);
 	}
+	/*
+	 *  Not a shared table, could either be a plain relation or a
+	 *  database specific but nailed one, like e.g. pg_class.
+	 */
+	else
+	{
+		/* check forplain relations by looking in pg_class */
+		relation = heap_open(RelationRelationId, AccessShareLock);
 
-	/* check plain relations by looking in pg_class */
-	relation = heap_open(RelationRelationId, AccessShareLock);
-
-	/* copy scankey to local copy, it will be modified during the scan */
-	memcpy(skey, relfilenode_skey, sizeof(skey));
+		/* copy scankey to local copy, it will be modified during the scan */
+		memcpy(skey, relfilenode_skey, sizeof(skey));
 
-	/* set scan arguments */
-	skey[0].sk_argument = ObjectIdGetDatum(reltablespace);
-	skey[1].sk_argument = ObjectIdGetDatum(relfilenode);
+		/* set scan arguments */
+		skey[0].sk_argument = ObjectIdGetDatum(reltablespace);
+		skey[1].sk_argument = ObjectIdGetDatum(relfilenode);
 
-	scandesc = systable_beginscan(relation,
-								  ClassTblspcRelfilenodeIndexId,
-								  true,
-								  NULL,
-								  2,
-								  skey);
+		scandesc = systable_beginscan(relation,
+		                              ClassTblspcRelfilenodeIndexId,
+		                              true,
+		                              NULL,
+		                              2,
+		                              skey);
 
-	found = false;
+		found = false;
 
-	while (HeapTupleIsValid(ntp = systable_getnext(scandesc)))
-	{
-		bool isnull PG_USED_FOR_ASSERTS_ONLY;
+		while (HeapTupleIsValid(ntp = systable_getnext(scandesc)))
+		{
+			bool isnull PG_USED_FOR_ASSERTS_ONLY;
 
-		if (found)
-			elog(ERROR,
-				 "unexpected duplicate for tablespace %u, relfilenode %u",
-				 reltablespace, relfilenode);
-		found = true;
+			if (found)
+				elog(ERROR,
+				     "unexpected duplicate for tablespace %u, relfilenode %u",
+				     reltablespace, relfilenode);
+			found = true;
 
 #ifdef USE_ASSERT_CHECKING
-		if (assert_enabled)
-		{
-			Oid check;
-			check = fastgetattr(ntp, Anum_pg_class_reltablespace,
-								RelationGetDescr(relation),
-								&isnull);
-			Assert(!isnull && check == reltablespace);
-
-			check = fastgetattr(ntp, Anum_pg_class_relfilenode,
-								RelationGetDescr(relation),
-								&isnull);
-			Assert(!isnull && check == relfilenode);
-		}
+			if (assert_enabled)
+			{
+				Oid check;
+				check = fastgetattr(ntp, Anum_pg_class_reltablespace,
+				                    RelationGetDescr(relation),
+				                    &isnull);
+				Assert(!isnull && check == reltablespace);
+
+				check = fastgetattr(ntp, Anum_pg_class_relfilenode,
+				                    RelationGetDescr(relation),
+				                    &isnull);
+				Assert(!isnull && check == relfilenode);
+			}
 #endif
-		entry->relid = HeapTupleGetOid(ntp);
-	}
+			relid = HeapTupleGetOid(ntp);
+		}
 
-	systable_endscan(scandesc);
-	heap_close(relation, AccessShareLock);
+		systable_endscan(scandesc);
+		heap_close(relation, AccessShareLock);
 
-	/* check for tables that are mapped but not shared */
-	if (!found)
-		entry->relid = RelationMapFilenodeToOid(relfilenode, false);
+		/* check for tables that are mapped but not shared */
+		if (!found)
+			relid = RelationMapFilenodeToOid(relfilenode, false);
+	}
+
+	/*
+	 * Only enter entry into cache now, our opening of pg_class could have
+	 * caused a cache invalidation which would have deleted a new entry if
+	 * we had HASH_ENTERed it above.
+	 */
+	entry = hash_search(RelfilenodeMapHash, (void *) &key, HASH_ENTER, &found);
+	if (found)
+		elog(ERROR, "corrupted hashtable");
+	entry->relid = relid;
 
-	return entry->relid;
+	return relid;
 }
-- 
1.8.5.rc1.dirty

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to