Here's an updated version of the patch. There was a bogus assertion in the previous one, comparing against mdsync_cycle_ctr instead of mdunlink_cycle_ctr.
Heikki Linnakangas wrote: > Tom Lane wrote: >> Heikki Linnakangas <[EMAIL PROTECTED]> writes: >>> The best I can think of is to rename the obsolete file to >>> <relfilenode>.stale, when it's scheduled for deletion at next >>> checkpoint, and check for .stale-suffixed files in GetNewRelFileNode, >>> and delete them immediately in DropTableSpace. >> This is getting too Rube Goldbergian for my tastes. What if we just >> make DROP TABLESPACE force a checkpoint before proceeding? > > Patch attached. > > The scenario we're preventing is still possible if for some reason the > latest checkpoint record is damaged, and we start recovery from the > previous checkpoint record. I think the probability of that happening, > together with the OID wrap-around and hitting the relfilenode of a > recently deleted file with a new one, is low enough to not worry about. > If we cared, we could fix it by letting the files to linger for two > checkpoint cycles instead of one. -- Heikki Linnakangas EnterpriseDB http://www.enterprisedb.com
Index: src/backend/access/transam/xlog.c =================================================================== RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/access/transam/xlog.c,v retrieving revision 1.286 diff -c -r1.286 xlog.c *** src/backend/access/transam/xlog.c 12 Oct 2007 19:39:59 -0000 1.286 --- src/backend/access/transam/xlog.c 18 Oct 2007 20:16:56 -0000 *************** *** 45,50 **** --- 45,51 ---- #include "storage/fd.h" #include "storage/pmsignal.h" #include "storage/procarray.h" + #include "storage/smgr.h" #include "storage/spin.h" #include "utils/builtins.h" #include "utils/pg_locale.h" *************** *** 5668,5673 **** --- 5669,5680 ---- checkPoint.time = time(NULL); /* + * Let the md storage manager to prepare for checkpoint before + * we determine the REDO pointer. + */ + mdcheckpointbegin(); + + /* * We must hold WALInsertLock while examining insert state to determine * the checkpoint REDO pointer. */ *************** *** 5887,5892 **** --- 5894,5904 ---- END_CRIT_SECTION(); /* + * Let the md storage manager to do its post-checkpoint work. + */ + mdcheckpointend(); + + /* * Delete old log files (those no longer needed even for previous * checkpoint). */ Index: src/backend/commands/tablespace.c =================================================================== RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/commands/tablespace.c,v retrieving revision 1.49 diff -c -r1.49 tablespace.c *** src/backend/commands/tablespace.c 1 Aug 2007 22:45:08 -0000 1.49 --- src/backend/commands/tablespace.c 18 Oct 2007 20:31:53 -0000 *************** *** 460,472 **** LWLockAcquire(TablespaceCreateLock, LW_EXCLUSIVE); /* ! * Try to remove the physical infrastructure */ if (!remove_tablespace_directories(tablespaceoid, false)) ! ereport(ERROR, ! (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), ! errmsg("tablespace \"%s\" is not empty", ! tablespacename))); /* Record the filesystem change in XLOG */ { --- 460,488 ---- LWLockAcquire(TablespaceCreateLock, LW_EXCLUSIVE); /* ! * Try to remove the physical infrastructure. */ if (!remove_tablespace_directories(tablespaceoid, false)) ! { ! /* ! * There can be lingering empty files in the directories, left behind ! * by for example DROP TABLE, that have been scheduled for deletion ! * at next checkpoint (see comments in mdunlink() for details). We ! * could just delete them immediately, but we can't tell them apart ! * from important data files that we mustn't delete. So instead, we ! * force a checkpoint which will clean out any lingering files, and ! * try again. ! */ ! RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT); ! if (!remove_tablespace_directories(tablespaceoid, false)) ! { ! /* Still not empty, the files must be important then */ ! ereport(ERROR, ! (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), ! errmsg("tablespace \"%s\" is not empty", ! tablespacename))); ! } ! } /* Record the filesystem change in XLOG */ { Index: src/backend/storage/smgr/md.c =================================================================== RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/storage/smgr/md.c,v retrieving revision 1.129 diff -c -r1.129 md.c *** src/backend/storage/smgr/md.c 3 Jul 2007 14:51:24 -0000 1.129 --- src/backend/storage/smgr/md.c 20 Oct 2007 14:10:08 -0000 *************** *** 34,39 **** --- 34,40 ---- /* special values for the segno arg to RememberFsyncRequest */ #define FORGET_RELATION_FSYNC (InvalidBlockNumber) #define FORGET_DATABASE_FSYNC (InvalidBlockNumber-1) + #define UNLINK_RELATION_REQUEST (InvalidBlockNumber-2) /* * On Windows, we have to interpret EACCES as possibly meaning the same as *************** *** 113,118 **** --- 114,123 ---- * table remembers the pending operations. We use a hash table mostly as * a convenient way of eliminating duplicate requests. * + * We use a similar mechanism to remember no-longer-needed files that can + * be deleted after next checkpoint, but we use a linked list instead of hash + * table, because we don't expect there to be any duplicates. + * * (Regular backends do not track pending operations locally, but forward * them to the bgwriter.) */ *************** *** 131,139 **** --- 136,152 ---- CycleCtr cycle_ctr; /* mdsync_cycle_ctr when request was made */ } PendingOperationEntry; + typedef struct + { + RelFileNode rnode; /* the dead relation to delete */ + CycleCtr cycle_ctr; /* mdunlink_cycle_ctr when request was made */ + } PendingUnlinkEntry; + static HTAB *pendingOpsTable = NULL; + static List *pendingUnlinks = NIL; static CycleCtr mdsync_cycle_ctr = 0; + static CycleCtr mdunlink_cycle_ctr = 0; typedef enum /* behavior for mdopen & _mdfd_getseg */ *************** *** 146,151 **** --- 159,165 ---- /* local routines */ static MdfdVec *mdopen(SMgrRelation reln, ExtensionBehavior behavior); static void register_dirty_segment(SMgrRelation reln, MdfdVec *seg); + static void register_unlink(RelFileNode rnode); static MdfdVec *_fdvec_alloc(void); #ifndef LET_OS_MANAGE_FILESIZE *************** *** 188,193 **** --- 202,208 ---- 100L, &hash_ctl, HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT); + pendingUnlinks = NIL; } } *************** *** 257,267 **** --- 272,300 ---- * If isRedo is true, it's okay for the relation to be already gone. * Also, any failure should be reported as WARNING not ERROR, because * we are usually not in a transaction anymore when this is called. + * + * If isRedo is false, the relation is actually just truncated to release + * the space, and left to linger as an empty file until the next checkpoint. + * This is to avoid reusing the same relfilenode for a new relation file, + * until the commit record containing the deletion has been flushed out. + * + * The scenario we're trying to protect from is this: + * After crash, we replay the commit WAL record of a transaction that + * deleted a relation, like DROP TABLE. But before the crash, after deleting + * the old relation, we created a new one, and it happened to get the same + * relfilenode as the deleted relation (OID must've wrapped around for + * that to happen). Now replaying the deletion of the old relation + * deletes the new one instead, because they had the same relfilenode. + * Normally the new relation would be recreated later in the WAL replay, but + * if we relied on fsyncing the file after populating it, like CLUSTER and + * CREATE INDEX do, for example, the contents of the file would be lost + * forever. */ void mdunlink(RelFileNode rnode, bool isRedo) { char *path; + int ret; /* * We have to clean out any pending fsync requests for the doomed relation, *************** *** 271,278 **** path = relpath(rnode); ! /* Delete the first segment, or only segment if not doing segmenting */ ! if (unlink(path) < 0) { if (!isRedo || errno != ENOENT) ereport(WARNING, --- 304,318 ---- path = relpath(rnode); ! /* ! * Delete or truncate the first segment, or only segment if not doing ! * segmenting ! */ ! if (!isRedo) ! ret = truncate(path, 0); ! else ! ret = unlink(path); ! if (ret < 0) { if (!isRedo || errno != ENOENT) ereport(WARNING, *************** *** 316,321 **** --- 356,364 ---- #endif pfree(path); + + if (!isRedo) + register_unlink(rnode); } /* *************** *** 1096,1114 **** } } /* * RememberFsyncRequest() -- callback from bgwriter side of fsync request * ! * We stuff the fsync request into the local hash table for execution * during the bgwriter's next checkpoint. * * The range of possible segment numbers is way less than the range of * BlockNumber, so we can reserve high values of segno for special purposes. ! * We define two: FORGET_RELATION_FSYNC means to cancel pending fsyncs for ! * a relation, and FORGET_DATABASE_FSYNC means to cancel pending fsyncs for ! * a whole database. (These are a tad slow because the hash table has to be ! * searched linearly, but it doesn't seem worth rethinking the table structure ! * for them.) */ void RememberFsyncRequest(RelFileNode rnode, BlockNumber segno) --- 1139,1268 ---- } } + + /* + * register_unlink() -- Schedule a relation to be deleted after next checkpoint + */ + static void + register_unlink(RelFileNode rnode) + { + if (pendingOpsTable) + { + /* standalone backend or startup process: fsync state is local */ + RememberFsyncRequest(rnode, UNLINK_RELATION_REQUEST); + } + else if (IsUnderPostmaster) + { + /* + * Notify the bgwriter about it. If we fail to queue the revoke + * message, we have to sleep and try again ... ugly, but hopefully + * won't happen often. + * + * XXX should we just leave the file orphaned instead? + */ + while (!ForwardFsyncRequest(rnode, UNLINK_RELATION_REQUEST)) + pg_usleep(10000L); /* 10 msec seems a good number */ + } + } + + /* + * mdcheckpointbegin() -- Do pre-checkpoint work + * + * To distinguish unlink requests that arrived before this checkpoint + * started and those that arrived during the checkpoint, we use a cycle + * counter similar to the one we use for fsync requests. That cycle + * counter is incremented here. + * + * This must called *before* RedoRecPtr is determined. + */ + void + mdcheckpointbegin() + { + PendingUnlinkEntry *entry; + ListCell *cell; + + /* + * Just in case the prior checkpoint failed, stamp all entries in + * the list with the current cycle counter. Anything that's in the + * list at the start of checkpoint can surely be deleted after the + * checkpoint is finished, regardless of when the request was made. + */ + foreach(cell, pendingUnlinks) + { + entry = lfirst(cell); + entry->cycle_ctr = mdunlink_cycle_ctr; + } + + /* + * Any unlink requests arriving after this point will be assigned the + * next cycle counter, and won't be unlinked until next checkpoint. + */ + mdunlink_cycle_ctr++; + } + + /* + * mdcheckpointend() -- Do post-checkpoint work + * + * Remove any lingering files that can now be safely removed. + */ + void + mdcheckpointend() + { + while(pendingUnlinks != NIL) + { + PendingUnlinkEntry *entry = linitial(pendingUnlinks); + char *path; + + /* + * New entries are appended to the end, so if the entry is new + * we've reached the end of old entries. + */ + if (entry->cycle_ctr == mdsync_cycle_ctr) + break; + + /* Else assert we haven't missed it */ + Assert((CycleCtr) (entry->cycle_ctr + 1) == mdunlink_cycle_ctr); + + /* Unlink the file */ + path = relpath(entry->rnode); + if (unlink(path) < 0) + { + /* + * ENOENT shouldn't happen either, but it doesn't really matter + * because we would've deleted it now anyway. + */ + if (errno != ENOENT) + ereport(WARNING, + (errcode_for_file_access(), + errmsg("could not remove relation %u/%u/%u: %m", + entry->rnode.spcNode, + entry->rnode.dbNode, + entry->rnode.relNode))); + } + pfree(path); + + pendingUnlinks = list_delete_first(pendingUnlinks); + } + } + /* * RememberFsyncRequest() -- callback from bgwriter side of fsync request * ! * We stuff normal fsync request into the local hash table for execution * during the bgwriter's next checkpoint. * * The range of possible segment numbers is way less than the range of * BlockNumber, so we can reserve high values of segno for special purposes. ! * We define three: ! * - FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation ! * - FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database ! * - UNLINK_REQUEST is a request to delete a lingering file after next ! * checkpoint. These are stuffed into a linked list separate from ! * the normal fsync requests. ! * ! * (Handling the FORGET_* requests is a tad slow because the hash table has to ! * be searched linearly, but it doesn't seem worth rethinking the table ! * structure for them.) */ void RememberFsyncRequest(RelFileNode rnode, BlockNumber segno) *************** *** 1147,1152 **** --- 1301,1322 ---- } } } + else if (segno == UNLINK_RELATION_REQUEST) + { + /* Unlink request: put it in the linked list */ + MemoryContext oldcxt; + PendingUnlinkEntry *entry; + + oldcxt = MemoryContextSwitchTo(MdCxt); + + entry = palloc(sizeof(PendingUnlinkEntry)); + memcpy(&entry->rnode, &rnode, sizeof(RelFileNode)); + entry->cycle_ctr = mdunlink_cycle_ctr; + + pendingUnlinks = lappend(pendingUnlinks, entry); + + MemoryContextSwitchTo(oldcxt); + } else { /* Normal case: enter a request to fsync this segment */ Index: src/include/storage/smgr.h =================================================================== RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/storage/smgr.h,v retrieving revision 1.59 diff -c -r1.59 smgr.h *** src/include/storage/smgr.h 5 Sep 2007 18:10:48 -0000 1.59 --- src/include/storage/smgr.h 18 Oct 2007 09:52:43 -0000 *************** *** 110,115 **** --- 110,118 ---- extern void ForgetRelationFsyncRequests(RelFileNode rnode); extern void ForgetDatabaseFsyncRequests(Oid dbid); + extern void mdcheckpointbegin(void); + extern void mdcheckpointend(void); + /* smgrtype.c */ extern Datum smgrout(PG_FUNCTION_ARGS); extern Datum smgrin(PG_FUNCTION_ARGS);
---------------------------(end of broadcast)--------------------------- TIP 7: You can help support the PostgreSQL project by donating at http://www.postgresql.org/about/donate