Updated patch attached, thanks.

Amit: what's your conclusion from the review?

-- Abhijit
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
index 5f9fc49..dc90c02 100644
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -501,33 +501,9 @@ btree_xlog_vacuum(XLogRecPtr lsn, XLogRecord *record)
 	 * here during crash recovery.
 	 */
 	if (HotStandbyActiveInReplay())
-	{
-		BlockNumber blkno;
-
-		for (blkno = xlrec->lastBlockVacuumed + 1; blkno < xlrec->block; blkno++)
-		{
-			/*
-			 * We use RBM_NORMAL_NO_LOG mode because it's not an error
-			 * condition to see all-zero pages.  The original btvacuumpage
-			 * scan would have skipped over all-zero pages, noting them in FSM
-			 * but not bothering to initialize them just yet; so we mustn't
-			 * throw an error here.  (We could skip acquiring the cleanup lock
-			 * if PageIsNew, but it's probably not worth the cycles to test.)
-			 *
-			 * XXX we don't actually need to read the block, we just need to
-			 * confirm it is unpinned. If we had a special call into the
-			 * buffer manager we could optimise this so that if the block is
-			 * not in shared_buffers we confirm it as unpinned.
-			 */
-			buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, blkno,
-											RBM_NORMAL_NO_LOG);
-			if (BufferIsValid(buffer))
-			{
-				LockBufferForCleanup(buffer);
-				UnlockReleaseBuffer(buffer);
-			}
-		}
-	}
+		XLogLockBlockRangeForCleanup(xlrec->node, MAIN_FORKNUM,
+									 xlrec->lastBlockVacuumed + 1,
+									 xlrec->block);
 
 	/*
 	 * If we have a full-page image, restore it (using a cleanup lock) and
diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c
index b7829ff..d84f83a 100644
--- a/src/backend/access/transam/xlogutils.c
+++ b/src/backend/access/transam/xlogutils.c
@@ -287,10 +287,6 @@ XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init)
  *
  * In RBM_ZERO and RBM_ZERO_ON_ERROR modes, if the page doesn't exist, the
  * relation is extended with all-zeroes pages up to the given block number.
- *
- * In RBM_NORMAL_NO_LOG mode, we return InvalidBuffer if the page doesn't
- * exist, and we don't check for all-zeroes.  Thus, no log entry is made
- * to imply that the page should be dropped or truncated later.
  */
 Buffer
 XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
@@ -331,8 +327,6 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
 			log_invalid_page(rnode, forknum, blkno, false);
 			return InvalidBuffer;
 		}
-		if (mode == RBM_NORMAL_NO_LOG)
-			return InvalidBuffer;
 		/* OK to extend the file */
 		/* we do this in recovery only - no rel-extension lock needed */
 		Assert(InRecovery);
@@ -375,6 +369,62 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
 	return buffer;
 }
 
+/*
+ * XLogBlockRangeForCleanup is used in Hot Standby mode to emulate the
+ * locking effects imposed by VACUUM for nbtrees.
+ */
+void
+XLogLockBlockRangeForCleanup(RelFileNode rnode, ForkNumber forkNum,
+							 BlockNumber startBlkno,
+							 BlockNumber uptoBlkno)
+{
+	BlockNumber blkno;
+	BlockNumber lastblock;
+	BlockNumber	endingBlkno;
+	Buffer		buffer;
+	SMgrRelation smgr;
+
+	Assert(startBlkno != P_NEW);
+	Assert(uptoBlkno != P_NEW);
+
+	/* Open the relation at smgr level */
+	smgr = smgropen(rnode, InvalidBackendId);
+
+	/*
+	 * Create the target file if it doesn't already exist.  This lets us cope
+	 * if the replay sequence contains writes to a relation that is later
+	 * deleted.  (The original coding of this routine would instead suppress
+	 * the writes, but that seems like it risks losing valuable data if the
+	 * filesystem loses an inode during a crash.  Better to write the data
+	 * until we are actually told to delete the file.)
+	 */
+	smgrcreate(smgr, forkNum, true);
+
+	lastblock = smgrnblocks(smgr, forkNum);
+
+	endingBlkno = uptoBlkno;
+	if (lastblock < endingBlkno)
+		endingBlkno = lastblock;
+
+	for (blkno = startBlkno; blkno < endingBlkno; blkno++)
+	{
+		/*
+		 * All we need to do here is prove that we can lock each block
+		 * with a cleanup lock. It's not an error to see all-zero pages
+		 * here because the original btvacuumpage would not have thrown
+		 * an error either.
+		 *
+		 * We don't actually need to read the block, we just need to
+		 * confirm it is unpinned.
+		 */
+		buffer = GetBufferWithoutRelcache(rnode, forkNum, blkno);
+		if (BufferIsValid(buffer))
+		{
+			LockBufferForCleanup(buffer);
+			UnlockReleaseBuffer(buffer);
+		}
+	}
+}
 
 /*
  * Struct actually returned by XLogFakeRelcacheEntry, though the declared
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 07ea665..ce5ff70 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -222,8 +222,6 @@ ReadBuffer(Relation reln, BlockNumber blockNum)
  * current physical EOF; that is likely to cause problems in md.c when
  * the page is modified and written out. P_NEW is OK, though.
  *
- * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
- *
  * If strategy is not NULL, a nondefault buffer access strategy is used.
  * See buffer/README for details.
  */
@@ -284,6 +282,62 @@ ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum,
 							 mode, strategy, &hit);
 }
 
+/*
+ * GetBufferWithoutRelcache returns Buffer iff available in shared_buffers,
+ * otherwise returns InvalidBuffer. Buffer is pinned, if available.
+ *
+ * Special purpose routine only executed during recovery, so uses a cut-down
+ * execution path rather than complicate ReadBuffer/AllocBuffer.
+ */
+Buffer
+GetBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum,
+						 BlockNumber blockNum)
+{
+	BufferTag	bufTag;			/* identity of requested block */
+	uint32		bufHash;		/* hash value for newTag */
+	LWLock	   *bufPartitionLock;		/* buffer partition lock for it */
+	int			buf_id;
+	volatile BufferDesc *buf;
+	SMgrRelation smgr = smgropen(rnode, InvalidBackendId);
+	bool		valid = false;
+
+	Assert(InRecovery);
+
+	/* Make sure we will have room to remember the buffer pin */
+	ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+
+	/* create a tag so we can lookup the buffer */
+	INIT_BUFFERTAG(bufTag, smgr->smgr_rnode.node, forkNum, blockNum);
+
+	/* determine its hash code and partition lock ID */
+	bufHash = BufTableHashCode(&bufTag);
+	bufPartitionLock = BufMappingPartitionLock(bufHash);
+
+	/* see if the block is in the buffer pool already */
+	LWLockAcquire(bufPartitionLock, LW_SHARED);
+	buf_id = BufTableLookup(&bufTag, bufHash);
+	if (buf_id >= 0)
+	{
+		/*
+		 * Found it. Now, try to pin the buffer if it's valid, but leave
+		 * its usage count alone.
+		 */
+		buf = &BufferDescriptors[buf_id];
+
+		LockBufHdr(buf);
+		valid = (buf->flags & BM_VALID) != 0;
+		if (valid)
+			PinBuffer_Locked(buf);
+		else
+			UnlockBufHdr(buf);
+	}
+	LWLockRelease(bufPartitionLock);
+
+	if (valid)
+		return BufferDescriptorGetBuffer(&BufferDescriptors[buf_id]);
+
+	return InvalidBuffer;
+}
 
 /*
  * ReadBuffer_common -- common logic for all ReadBuffer variants
diff --git a/src/include/access/xlogutils.h b/src/include/access/xlogutils.h
index 58f11d9..1ecbeb3 100644
--- a/src/include/access/xlogutils.h
+++ b/src/include/access/xlogutils.h
@@ -25,7 +25,9 @@ extern void XLogTruncateRelation(RelFileNode rnode, ForkNumber forkNum,
 extern Buffer XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init);
 extern Buffer XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
 					   BlockNumber blkno, ReadBufferMode mode);
-
+extern void XLogLockBlockRangeForCleanup(RelFileNode rnode, ForkNumber forkNum,
+										 BlockNumber startBlkno,
+										 BlockNumber uptoBlkno);
 extern Relation CreateFakeRelcacheEntry(RelFileNode rnode);
 extern void FreeFakeRelcacheEntry(Relation fakerel);
 
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 89447d0..b827be0 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -38,9 +38,7 @@ typedef enum
 	RBM_NORMAL,					/* Normal read */
 	RBM_ZERO,					/* Don't read from disk, caller will
 								 * initialize */
-	RBM_ZERO_ON_ERROR,			/* Read, but return an all-zeros page on error */
-	RBM_NORMAL_NO_LOG			/* Don't log page as invalid during WAL
-								 * replay; otherwise same as RBM_NORMAL */
+	RBM_ZERO_ON_ERROR			/* Read, but return an all-zeros page on error */
 } ReadBufferMode;
 
 /* in globals.c ... this duplicates miscadmin.h */
@@ -170,6 +168,8 @@ extern Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum,
 extern Buffer ReadBufferWithoutRelcache(RelFileNode rnode,
 						  ForkNumber forkNum, BlockNumber blockNum,
 						  ReadBufferMode mode, BufferAccessStrategy strategy);
+extern Buffer GetBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum,
+									   BlockNumber blockNum);
 extern void ReleaseBuffer(Buffer buffer);
 extern void UnlockReleaseBuffer(Buffer buffer);
 extern void MarkBufferDirty(Buffer buffer);
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to