Updated patch attached, thanks. Amit: what's your conclusion from the review?
-- Abhijit
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index 5f9fc49..dc90c02 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -501,33 +501,9 @@ btree_xlog_vacuum(XLogRecPtr lsn, XLogRecord *record) * here during crash recovery. */ if (HotStandbyActiveInReplay()) - { - BlockNumber blkno; - - for (blkno = xlrec->lastBlockVacuumed + 1; blkno < xlrec->block; blkno++) - { - /* - * We use RBM_NORMAL_NO_LOG mode because it's not an error - * condition to see all-zero pages. The original btvacuumpage - * scan would have skipped over all-zero pages, noting them in FSM - * but not bothering to initialize them just yet; so we mustn't - * throw an error here. (We could skip acquiring the cleanup lock - * if PageIsNew, but it's probably not worth the cycles to test.) - * - * XXX we don't actually need to read the block, we just need to - * confirm it is unpinned. If we had a special call into the - * buffer manager we could optimise this so that if the block is - * not in shared_buffers we confirm it as unpinned. - */ - buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, blkno, - RBM_NORMAL_NO_LOG); - if (BufferIsValid(buffer)) - { - LockBufferForCleanup(buffer); - UnlockReleaseBuffer(buffer); - } - } - } + XLogLockBlockRangeForCleanup(xlrec->node, MAIN_FORKNUM, + xlrec->lastBlockVacuumed + 1, + xlrec->block); /* * If we have a full-page image, restore it (using a cleanup lock) and diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index b7829ff..d84f83a 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -287,10 +287,6 @@ XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init) * * In RBM_ZERO and RBM_ZERO_ON_ERROR modes, if the page doesn't exist, the * relation is extended with all-zeroes pages up to the given block number. - * - * In RBM_NORMAL_NO_LOG mode, we return InvalidBuffer if the page doesn't - * exist, and we don't check for all-zeroes. Thus, no log entry is made - * to imply that the page should be dropped or truncated later. */ Buffer XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, @@ -331,8 +327,6 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, log_invalid_page(rnode, forknum, blkno, false); return InvalidBuffer; } - if (mode == RBM_NORMAL_NO_LOG) - return InvalidBuffer; /* OK to extend the file */ /* we do this in recovery only - no rel-extension lock needed */ Assert(InRecovery); @@ -375,6 +369,62 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, return buffer; } +/* + * XLogBlockRangeForCleanup is used in Hot Standby mode to emulate the + * locking effects imposed by VACUUM for nbtrees. + */ +void +XLogLockBlockRangeForCleanup(RelFileNode rnode, ForkNumber forkNum, + BlockNumber startBlkno, + BlockNumber uptoBlkno) +{ + BlockNumber blkno; + BlockNumber lastblock; + BlockNumber endingBlkno; + Buffer buffer; + SMgrRelation smgr; + + Assert(startBlkno != P_NEW); + Assert(uptoBlkno != P_NEW); + + /* Open the relation at smgr level */ + smgr = smgropen(rnode, InvalidBackendId); + + /* + * Create the target file if it doesn't already exist. This lets us cope + * if the replay sequence contains writes to a relation that is later + * deleted. (The original coding of this routine would instead suppress + * the writes, but that seems like it risks losing valuable data if the + * filesystem loses an inode during a crash. Better to write the data + * until we are actually told to delete the file.) + */ + smgrcreate(smgr, forkNum, true); + + lastblock = smgrnblocks(smgr, forkNum); + + endingBlkno = uptoBlkno; + if (lastblock < endingBlkno) + endingBlkno = lastblock; + + for (blkno = startBlkno; blkno < endingBlkno; blkno++) + { + /* + * All we need to do here is prove that we can lock each block + * with a cleanup lock. It's not an error to see all-zero pages + * here because the original btvacuumpage would not have thrown + * an error either. + * + * We don't actually need to read the block, we just need to + * confirm it is unpinned. + */ + buffer = GetBufferWithoutRelcache(rnode, forkNum, blkno); + if (BufferIsValid(buffer)) + { + LockBufferForCleanup(buffer); + UnlockReleaseBuffer(buffer); + } + } +} /* * Struct actually returned by XLogFakeRelcacheEntry, though the declared diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 07ea665..ce5ff70 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -222,8 +222,6 @@ ReadBuffer(Relation reln, BlockNumber blockNum) * current physical EOF; that is likely to cause problems in md.c when * the page is modified and written out. P_NEW is OK, though. * - * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here. - * * If strategy is not NULL, a nondefault buffer access strategy is used. * See buffer/README for details. */ @@ -284,6 +282,62 @@ ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum, mode, strategy, &hit); } +/* + * GetBufferWithoutRelcache returns Buffer iff available in shared_buffers, + * otherwise returns InvalidBuffer. Buffer is pinned, if available. + * + * Special purpose routine only executed during recovery, so uses a cut-down + * execution path rather than complicate ReadBuffer/AllocBuffer. + */ +Buffer +GetBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum, + BlockNumber blockNum) +{ + BufferTag bufTag; /* identity of requested block */ + uint32 bufHash; /* hash value for newTag */ + LWLock *bufPartitionLock; /* buffer partition lock for it */ + int buf_id; + volatile BufferDesc *buf; + SMgrRelation smgr = smgropen(rnode, InvalidBackendId); + bool valid = false; + + Assert(InRecovery); + + /* Make sure we will have room to remember the buffer pin */ + ResourceOwnerEnlargeBuffers(CurrentResourceOwner); + + /* create a tag so we can lookup the buffer */ + INIT_BUFFERTAG(bufTag, smgr->smgr_rnode.node, forkNum, blockNum); + + /* determine its hash code and partition lock ID */ + bufHash = BufTableHashCode(&bufTag); + bufPartitionLock = BufMappingPartitionLock(bufHash); + + /* see if the block is in the buffer pool already */ + LWLockAcquire(bufPartitionLock, LW_SHARED); + buf_id = BufTableLookup(&bufTag, bufHash); + if (buf_id >= 0) + { + /* + * Found it. Now, try to pin the buffer if it's valid, but leave + * its usage count alone. + */ + buf = &BufferDescriptors[buf_id]; + + LockBufHdr(buf); + valid = (buf->flags & BM_VALID) != 0; + if (valid) + PinBuffer_Locked(buf); + else + UnlockBufHdr(buf); + } + LWLockRelease(bufPartitionLock); + + if (valid) + return BufferDescriptorGetBuffer(&BufferDescriptors[buf_id]); + + return InvalidBuffer; +} /* * ReadBuffer_common -- common logic for all ReadBuffer variants diff --git a/src/include/access/xlogutils.h b/src/include/access/xlogutils.h index 58f11d9..1ecbeb3 100644 --- a/src/include/access/xlogutils.h +++ b/src/include/access/xlogutils.h @@ -25,7 +25,9 @@ extern void XLogTruncateRelation(RelFileNode rnode, ForkNumber forkNum, extern Buffer XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init); extern Buffer XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, BlockNumber blkno, ReadBufferMode mode); - +extern void XLogLockBlockRangeForCleanup(RelFileNode rnode, ForkNumber forkNum, + BlockNumber startBlkno, + BlockNumber uptoBlkno); extern Relation CreateFakeRelcacheEntry(RelFileNode rnode); extern void FreeFakeRelcacheEntry(Relation fakerel); diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index 89447d0..b827be0 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -38,9 +38,7 @@ typedef enum RBM_NORMAL, /* Normal read */ RBM_ZERO, /* Don't read from disk, caller will * initialize */ - RBM_ZERO_ON_ERROR, /* Read, but return an all-zeros page on error */ - RBM_NORMAL_NO_LOG /* Don't log page as invalid during WAL - * replay; otherwise same as RBM_NORMAL */ + RBM_ZERO_ON_ERROR /* Read, but return an all-zeros page on error */ } ReadBufferMode; /* in globals.c ... this duplicates miscadmin.h */ @@ -170,6 +168,8 @@ extern Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, extern Buffer ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy); +extern Buffer GetBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum, + BlockNumber blockNum); extern void ReleaseBuffer(Buffer buffer); extern void UnlockReleaseBuffer(Buffer buffer); extern void MarkBufferDirty(Buffer buffer);
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers