From 83d01f27df5f6128f5abadac501d76a282c4cd36 Mon Sep 17 00:00:00 2001
From: Kirk Jamison <k.jamison@jp.fujitsu.com>
Date: Fri, 11 Sep 2020 13:00:33 +0000
Subject: [PATCH] Speedup dropping of relation buffers during recovery
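
During recovery, smgrnblocks() can return a reliable cached value for
the number of blocks in a relation fork (see the comment in
smgrnblocks() in smgr.c).  Use that to speed up DropRelFileNodeBuffers()
when a relation is truncated: if the fork is smaller than
BUF_DROP_FULLSCAN_THRESHOLD, look up each to-be-dropped block directly
in the buffer mapping table instead of scanning the entire buffer pool.
Larger forks, and the non-recovery path, keep the full scan, which is
factored out into a new helper, DropRelFileNodeBuffersOfFork().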

---
 src/backend/storage/buffer/bufmgr.c | 173 ++++++++++++++++++++++++++++++------
 src/backend/storage/smgr/smgr.c     |   2 +-
 src/include/storage/bufmgr.h        |   2 +-
 3 files changed, 146 insertions(+), 31 deletions(-)

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index a2a963b..01ea4d5 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -70,6 +70,8 @@
 
 #define RELS_BSEARCH_THRESHOLD		20
 
+/*
+ * During recovery, a relation fork with fewer blocks than this threshold
+ * has its buffers dropped via per-block lookups in the buffer mapping
+ * table rather than by a full scan of the buffer pool.
+ */
+#define BUF_DROP_FULLSCAN_THRESHOLD		((uint32) (NBuffers / 500))
+
 typedef struct PrivateRefCountEntry
 {
 	Buffer		buffer;
@@ -473,6 +475,8 @@ static BufferDesc *BufferAlloc(SMgrRelation smgr,
 							   BufferAccessStrategy strategy,
 							   bool *foundPtr);
 static void FlushBuffer(BufferDesc *buf, SMgrRelation reln);
+static void DropRelFileNodeBuffersOfFork(RelFileNode rnode, ForkNumber forkNum,
+										 BlockNumber firstDelBlock);
 static void AtProcExit_Buffers(int code, Datum arg);
 static void CheckForBufferLeaks(void);
 static int	rnode_comparator(const void *p1, const void *p2);
@@ -2979,65 +2983,176 @@ BufferGetLSNAtomic(Buffer buffer)
  * --------------------------------------------------------------------
  */
 void
-DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber *forkNum,
+DropRelFileNodeBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum,
 					   int nforks, BlockNumber *firstDelBlock)
 {
-	int			i;
-	int			j;
+	int			buf_num;
+	int			fork_num;
+	RelFileNodeBackend	rnode;
+	BufferDesc	*bufHdr;
+	uint32		buf_state;
+
+	rnode = smgr_reln->smgr_rnode;
 
 	/* If it's a local relation, it's localbuf.c's problem. */
 	if (RelFileNodeBackendIsTemp(rnode))
 	{
 		if (rnode.backend == MyBackendId)
 		{
-			for (j = 0; j < nforks; j++)
-				DropRelFileNodeLocalBuffers(rnode.node, forkNum[j],
-											firstDelBlock[j]);
+			for (fork_num = 0; fork_num < nforks; fork_num++)
+				DropRelFileNodeLocalBuffers(rnode.node, forkNum[fork_num],
+											firstDelBlock[fork_num]);
 		}
 		return;
 	}
 
-	for (i = 0; i < NBuffers; i++)
+	/*
+	 * We apply this optimization only during recovery, because that is
+	 * the only time we are guaranteed a valid cached value for the number
+	 * of blocks in a relation fork; see the comment in smgrnblocks() in
+	 * smgr.c.  Otherwise, fall through to the usual buffer invalidation
+	 * process (a scan of the whole shared buffer pool).
+	 */
+	if (InRecovery)
 	{
-		BufferDesc *bufHdr = GetBufferDescriptor(i);
-		uint32		buf_state;
+		for (fork_num = 0; fork_num < nforks; fork_num++)
+		{
+			BlockNumber 	nblocks;
+
+			/* Get the number of blocks for the supplied relation's fork */
+			nblocks = smgrnblocks(smgr_reln, forkNum[fork_num]);
+			Assert(BlockNumberIsValid(nblocks));
+
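+			/*
+			 * If the fork is small relative to shared buffers, invalidate
+			 * its buffers with per-block lookups in the buffer mapping
+			 * table; otherwise fall back to scanning the whole pool.
+			 */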
+			if (nblocks < BUF_DROP_FULLSCAN_THRESHOLD)
+			{
+				BlockNumber		block_num;
+				for (block_num = firstDelBlock[fork_num];
+					 block_num < nblocks; block_num++)
+				{
+					uint32		newHash;		/* hash value for newTag */
+					BufferTag	newTag;			/* identity of requested block */
+					LWLock	   	*newPartitionLock;	/* buffer partition lock for it */
+					int			buf_id;
+
+					/* create a tag so we can lookup the buffer */
+					INIT_BUFFERTAG(newTag, rnode.node, forkNum[fork_num],
+								   block_num);
+
+					/* determine its hash code and partition lock ID */
+					newHash = BufTableHashCode(&newTag);
+					newPartitionLock = BufMappingPartitionLock(newHash);
+
+					/* Check whether the block is in the buffer pool; skip it if not */
+					LWLockAcquire(newPartitionLock, LW_SHARED);
+					buf_id = BufTableLookup(&newTag, newHash);
+
+					if (buf_id < 0)
+					{
+						LWLockRelease(newPartitionLock);
+						continue;
+					}
+					LWLockRelease(newPartitionLock);
+
+					bufHdr = GetBufferDescriptor(buf_id);
+
+					/*
+					 * We can make this a tad faster by prechecking the buffer tag before
+					 * we attempt to lock the buffer; this saves a lot of lock
+					 * acquisitions in typical cases.  It should be safe because the
+					 * caller must have AccessExclusiveLock on the relation, or some other
+					 * reason to be certain that no one is loading new pages of the rel
+					 * into the buffer pool.  (Otherwise we might well miss such pages
+					 * entirely.)  Therefore, while the tag might be changing while we
+					 * look at it, it can't be changing *to* a value we care about, only
+					 * *away* from such a value.  So false negatives are impossible, and
+					 * false positives are safe because we'll recheck after getting the
+					 * buffer lock.
+					 *
+					 * We could check forkNum and blockNum as well as the rnode, but the
+					 * incremental win from doing so seems small.
+					 */
+					if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
+						continue;
+
+					buf_state = LockBufHdr(bufHdr);
+
+					if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
+						bufHdr->tag.forkNum == forkNum[fork_num] &&
+						bufHdr->tag.blockNum >= firstDelBlock[fork_num])
+						InvalidateBuffer(bufHdr);	/* releases spinlock */
+					else
+						UnlockBufHdr(bufHdr, buf_state);
+				}
+			}
+			else
+			{
+				DropRelFileNodeBuffersOfFork(rnode.node, forkNum[fork_num],
+											 firstDelBlock[fork_num]);
+			}
+		}
+		return;
+	}
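+
+	/*
+	 * Outside recovery, we don't have a trustworthy cached relation size,
+	 * so scan the whole buffer pool.  The unlocked tag precheck below is
+	 * safe for the reasons explained in the comment above.
+	 */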
+	for (buf_num = 0; buf_num < NBuffers; buf_num++)
+	{
+		bufHdr = GetBufferDescriptor(buf_num);
 
-		/*
-		 * We can make this a tad faster by prechecking the buffer tag before
-		 * we attempt to lock the buffer; this saves a lot of lock
-		 * acquisitions in typical cases.  It should be safe because the
-		 * caller must have AccessExclusiveLock on the relation, or some other
-		 * reason to be certain that no one is loading new pages of the rel
-		 * into the buffer pool.  (Otherwise we might well miss such pages
-		 * entirely.)  Therefore, while the tag might be changing while we
-		 * look at it, it can't be changing *to* a value we care about, only
-		 * *away* from such a value.  So false negatives are impossible, and
-		 * false positives are safe because we'll recheck after getting the
-		 * buffer lock.
-		 *
-		 * We could check forkNum and blockNum as well as the rnode, but the
-		 * incremental win from doing so seems small.
-		 */
 		if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
 			continue;
 
 		buf_state = LockBufHdr(bufHdr);
 
-		for (j = 0; j < nforks; j++)
+		for (fork_num = 0; fork_num < nforks; fork_num++)
 		{
 			if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
-				bufHdr->tag.forkNum == forkNum[j] &&
-				bufHdr->tag.blockNum >= firstDelBlock[j])
+				bufHdr->tag.forkNum == forkNum[fork_num] &&
+				bufHdr->tag.blockNum >= firstDelBlock[fork_num])
 			{
 				InvalidateBuffer(bufHdr);	/* releases spinlock */
 				break;
 			}
 		}
-		if (j >= nforks)
+		if (fork_num >= nforks)
+			UnlockBufHdr(bufHdr, buf_state);
+	}
+}
+
+/* -----------------------------------------------------------------
+ *		DropRelFileNodeBuffersOfFork
+ *
+ *		Remove from the buffer pool all pages of the specified fork
+ *		of the relation whose block numbers are >= firstDelBlock,
+ *		by scanning the entire buffer pool.
+ * -----------------------------------------------------------------
+ */
+static void
+DropRelFileNodeBuffersOfFork(RelFileNode rnode, ForkNumber forkNum,
+							 BlockNumber firstDelBlock)
+{
+	int			buf_num;
+
+	for (buf_num = 0; buf_num < NBuffers; buf_num++)
+	{
+		BufferDesc	*bufHdr = GetBufferDescriptor(buf_num);
+		uint32		buf_state;
+
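+		/*
+		 * As in DropRelFileNodeBuffers() above, an unlocked precheck of
+		 * the buffer tag is safe because the caller must prevent new
+		 * pages of the relation from being loaded; false positives are
+		 * rechecked under the buffer header lock.
+		 */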
+		if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode))
+			continue;
+
+		buf_state = LockBufHdr(bufHdr);
+
+		if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
+			bufHdr->tag.forkNum == forkNum &&
+			bufHdr->tag.blockNum >= firstDelBlock)
+			InvalidateBuffer(bufHdr);	/* releases spinlock */
+		else
 			UnlockBufHdr(bufHdr, buf_state);
 	}
 }
 
 /* ---------------------------------------------------------------------
  *		DropRelFileNodesAllBuffers
  *
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index dcc09df..5238c6c 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -583,7 +583,7 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nb
 	 * Get rid of any buffers for the about-to-be-deleted blocks. bufmgr will
 	 * just drop them without bothering to write the contents.
 	 */
-	DropRelFileNodeBuffers(reln->smgr_rnode, forknum, nforks, nblocks);
+	DropRelFileNodeBuffers(reln, forknum, nforks, nblocks);
 
 	/*
 	 * Send a shared-inval message to force other backends to close any smgr
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index ee91b8f..056f65e 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -203,7 +203,7 @@ extern void FlushOneBuffer(Buffer buffer);
 extern void FlushRelationBuffers(Relation rel);
 extern void FlushRelationsAllBuffers(struct SMgrRelationData **smgrs, int nrels);
 extern void FlushDatabaseBuffers(Oid dbid);
-extern void DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber *forkNum,
+extern void DropRelFileNodeBuffers(struct SMgrRelationData *smgr_reln, ForkNumber *forkNum,
 								   int nforks, BlockNumber *firstDelBlock);
 extern void DropRelFileNodesAllBuffers(RelFileNodeBackend *rnodes, int nnodes);
 extern void DropDatabaseBuffers(Oid dbid);
-- 
1.8.3.1

