On 2013-11-07 00:30:55 +0100, Andres Freund wrote: > On 2013-11-07 00:17:34 +0100, Andres Freund wrote: > > On 2013-11-06 17:00:40 -0300, Alvaro Herrera wrote: > > > Kevin Grittner wrote: > > > > > > > That makes for a pretty simple test for git bisect, even if > > > > everything including initdb is painfully slow with > > > > CLOBBER_CACHE_ALWAYS. > > > > > > Most likely culprit is f01d1ae3a104019d6d68aeff85c4816a275130b3 > > > > Well, that test tests the functionality added in that commit, so sure, > > it can't be before that. What confuses me is that relfilenodemap has > > survived quite some CLOBBER_CACHE_ALWAYS runs in the buildfarm since: > > http://buildfarm.postgresql.org/cgi-bin/show_history.pl?nm=jaguarundi&br=HEAD > > > > Did you compile with CLOBBER_CACHE_ALWAYS or CLOBBER_CACHE_RECURSIVELY? > > Either way, the code is completely and utterly broken in the face of > cache invalidations that are received when it does its internal > heap_open(RelationRelationId). I can't have been thinking straight when > I wrote the invalidation logic.
Ok, I've attached a fix for this, which unfortunately is not all that small. Could either of you commit it? Greetings, Andres Freund -- Andres Freund http://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Training & Services
>From ac8973aec6642810f11aff2c4d1c3079b1569f7f Mon Sep 17 00:00:00 2001 From: Andres Freund <and...@anarazel.de> Date: Thu, 7 Nov 2013 10:17:28 +0100 Subject: [PATCH] Fix fundamental issues with relfilenodemap's cache invalidation logic. relfilenodemap.c's invalidation callback used to simply delete its entire hash when a cache reset message arrived and it used to enter a provisional cache entry into the hash before doing the lookups in pg_class. Both is a bad idea because the heap_open() in RelidByRelfilenode() can cause cache invalidations to be received which could cause us to access and return free'd memory. Also fix the possible, but unlikely, scenario where a non-FATAL ERROR occurs after the hash_create() in InitializeRelfilenodeMap() leaving RelfilenodeMap in a partially initialized state. Found by Kevin Grittner. --- src/backend/utils/cache/relfilenodemap.c | 189 ++++++++++++++++--------------- 1 file changed, 99 insertions(+), 90 deletions(-) diff --git a/src/backend/utils/cache/relfilenodemap.c b/src/backend/utils/cache/relfilenodemap.c index f3f9a09..d3d77dd 100644 --- a/src/backend/utils/cache/relfilenodemap.c +++ b/src/backend/utils/cache/relfilenodemap.c @@ -43,8 +43,8 @@ typedef struct typedef struct { - RelfilenodeMapKey key; /* lookup key - must be first */ - Oid relid; /* pg_class.oid */ + RelfilenodeMapKey key; /* lookup key - must be first */ + Oid relid; /* pg_class.oid */ } RelfilenodeMapEntry; /* @@ -54,26 +54,24 @@ typedef struct static void RelfilenodeMapInvalidateCallback(Datum arg, Oid relid) { - HASH_SEQ_STATUS status; + HASH_SEQ_STATUS status; RelfilenodeMapEntry *entry; /* nothing to do if not active or deleted */ if (RelfilenodeMapHash == NULL) return; - /* if relid is InvalidOid, we must invalidate the entire cache */ - if (relid == InvalidOid) - { - hash_destroy(RelfilenodeMapHash); - RelfilenodeMapHash = NULL; - return; - } - hash_seq_init(&status, RelfilenodeMapHash); while ((entry = (RelfilenodeMapEntry *) hash_seq_search(&status)) != NULL) { - /* Same OID may occur in more than one tablespace. */ - if (entry->relid == relid) + /* + * If relid is InvalidOid, signalling a complete reset, we + * must remove all entries, otherwise just remove the specific + * relation's entry. Always remove negative cache entries. + */ + if (relid == InvalidOid || /* complete reset */ + entry->relid == InvalidOid || /* negative cache entry */ + entry->relid == relid) /* individual flushed relation */ { if (hash_search(RelfilenodeMapHash, (void *) &entry->key, @@ -92,32 +90,12 @@ static void InitializeRelfilenodeMap(void) { HASHCTL ctl; - static bool initial_init_done = false; - int i; + int i; /* Make sure we've initialized CacheMemoryContext. */ if (CacheMemoryContext == NULL) CreateCacheMemoryContext(); - /* Initialize the hash table. */ - MemSet(&ctl, 0, sizeof(ctl)); - ctl.keysize = sizeof(RelfilenodeMapKey); - ctl.entrysize = sizeof(RelfilenodeMapEntry); - ctl.hash = tag_hash; - ctl.hcxt = CacheMemoryContext; - - RelfilenodeMapHash = - hash_create("RelfilenodeMap cache", 1024, &ctl, - HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT); - - /* - * For complete resets we simply delete the entire hash, but there's no - * need to do the other stuff multiple times. Especially the initialization - * of the relcche invalidation should only be done once. - */ - if (initial_init_done) - return; - /* build skey */ MemSet(&relfilenode_skey, 0, sizeof(relfilenode_skey)); @@ -134,10 +112,25 @@ InitializeRelfilenodeMap(void) relfilenode_skey[0].sk_attno = Anum_pg_class_reltablespace; relfilenode_skey[1].sk_attno = Anum_pg_class_relfilenode; + /* Initialize the hash table. */ + MemSet(&ctl, 0, sizeof(ctl)); + ctl.keysize = sizeof(RelfilenodeMapKey); + ctl.entrysize = sizeof(RelfilenodeMapEntry); + ctl.hash = tag_hash; + ctl.hcxt = CacheMemoryContext; + + /* + * Only create the RelfilenodeMapHash now, so we don't end up + * partially initialized when fmgr_info_cxt() above ERRORs out + * with an out of memory error. + */ + RelfilenodeMapHash = + hash_create("RelfilenodeMap cache", 1024, &ctl, + HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT); + /* Watch for invalidation events. */ CacheRegisterRelcacheCallback(RelfilenodeMapInvalidateCallback, (Datum) 0); - initial_init_done = true; } /* @@ -156,6 +149,7 @@ RelidByRelfilenode(Oid reltablespace, Oid relfilenode) Relation relation; HeapTuple ntp; ScanKeyData skey[2]; + Oid relid; if (RelfilenodeMapHash == NULL) InitializeRelfilenodeMap(); @@ -169,82 +163,97 @@ RelidByRelfilenode(Oid reltablespace, Oid relfilenode) key.relfilenode = relfilenode; /* - * Check cache and enter entry if nothing could be found. Even if no target - * relation can be found later on we store the negative match and return a - * InvalidOid from cache. That's not really necessary for performance since - * querying invalid values isn't supposed to be a frequent thing, but the - * implementation is simpler this way. + * Check cache and return entry if one is found. Even if no target + * relation can be found later on we store the negative match and + * return a InvalidOid from cache. That's not really necessary for + * performance since querying invalid values isn't supposed to be + * a frequent thing, but it's basically free. */ - entry = hash_search(RelfilenodeMapHash, (void *) &key, HASH_ENTER, &found); + entry = hash_search(RelfilenodeMapHash, (void *) &key, HASH_FIND, &found); if (found) return entry->relid; - /* initialize empty/negative cache entry before doing the actual lookup */ - entry->relid = InvalidOid; - /* ok, no previous cache entry, do it the hard way */ + /* initialize empty/negative cache entry before doing the actual lookups */ + relid = InvalidOid; + /* check shared tables */ if (reltablespace == GLOBALTABLESPACE_OID) { - entry->relid = RelationMapFilenodeToOid(relfilenode, true); - return entry->relid; + relid = RelationMapFilenodeToOid(relfilenode, true); } + /* + * Not a shared table, could either be a plain relation or a + * database specific but nailed one, like e.g. pg_class. + */ + else + { + /* check forplain relations by looking in pg_class */ + relation = heap_open(RelationRelationId, AccessShareLock); - /* check plain relations by looking in pg_class */ - relation = heap_open(RelationRelationId, AccessShareLock); - - /* copy scankey to local copy, it will be modified during the scan */ - memcpy(skey, relfilenode_skey, sizeof(skey)); + /* copy scankey to local copy, it will be modified during the scan */ + memcpy(skey, relfilenode_skey, sizeof(skey)); - /* set scan arguments */ - skey[0].sk_argument = ObjectIdGetDatum(reltablespace); - skey[1].sk_argument = ObjectIdGetDatum(relfilenode); + /* set scan arguments */ + skey[0].sk_argument = ObjectIdGetDatum(reltablespace); + skey[1].sk_argument = ObjectIdGetDatum(relfilenode); - scandesc = systable_beginscan(relation, - ClassTblspcRelfilenodeIndexId, - true, - NULL, - 2, - skey); + scandesc = systable_beginscan(relation, + ClassTblspcRelfilenodeIndexId, + true, + NULL, + 2, + skey); - found = false; + found = false; - while (HeapTupleIsValid(ntp = systable_getnext(scandesc))) - { - bool isnull PG_USED_FOR_ASSERTS_ONLY; + while (HeapTupleIsValid(ntp = systable_getnext(scandesc))) + { + bool isnull PG_USED_FOR_ASSERTS_ONLY; - if (found) - elog(ERROR, - "unexpected duplicate for tablespace %u, relfilenode %u", - reltablespace, relfilenode); - found = true; + if (found) + elog(ERROR, + "unexpected duplicate for tablespace %u, relfilenode %u", + reltablespace, relfilenode); + found = true; #ifdef USE_ASSERT_CHECKING - if (assert_enabled) - { - Oid check; - check = fastgetattr(ntp, Anum_pg_class_reltablespace, - RelationGetDescr(relation), - &isnull); - Assert(!isnull && check == reltablespace); - - check = fastgetattr(ntp, Anum_pg_class_relfilenode, - RelationGetDescr(relation), - &isnull); - Assert(!isnull && check == relfilenode); - } + if (assert_enabled) + { + Oid check; + check = fastgetattr(ntp, Anum_pg_class_reltablespace, + RelationGetDescr(relation), + &isnull); + Assert(!isnull && check == reltablespace); + + check = fastgetattr(ntp, Anum_pg_class_relfilenode, + RelationGetDescr(relation), + &isnull); + Assert(!isnull && check == relfilenode); + } #endif - entry->relid = HeapTupleGetOid(ntp); - } + relid = HeapTupleGetOid(ntp); + } - systable_endscan(scandesc); - heap_close(relation, AccessShareLock); + systable_endscan(scandesc); + heap_close(relation, AccessShareLock); - /* check for tables that are mapped but not shared */ - if (!found) - entry->relid = RelationMapFilenodeToOid(relfilenode, false); + /* check for tables that are mapped but not shared */ + if (!found) + relid = RelationMapFilenodeToOid(relfilenode, false); + } + + /* + * Only enter entry into cache now, our opening of pg_class could have + * caused a cache invalidation which would have deleted a new entry if + * we had HASH_ENTERed it above. + */ + entry = hash_search(RelfilenodeMapHash, (void *) &key, HASH_ENTER, &found); + if (found) + elog(ERROR, "corrupted hashtable"); + entry->relid = relid; - return entry->relid; + return relid; } -- 1.8.5.rc1.dirty
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers