From 20b620d4548acc344d9ad065db42e68cc0ce5f14 Mon Sep 17 00:00:00 2001
From: Kirk Jamison <k.jamison@fujitsu.com>
Date: Thu, 27 Aug 2020 08:09:39 +0000
Subject: [PATCH] Speedup dropping of relation buffers during recovery

---
 src/backend/storage/buffer/bufmgr.c | 125 +++++++++++++++++++++++++++---------
 src/backend/storage/smgr/smgr.c     |   2 +-
 src/include/storage/bufmgr.h        |   5 +-
 3 files changed, 98 insertions(+), 34 deletions(-)

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index a2a963b..95b3c7d 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -70,6 +70,8 @@
 
 #define RELS_BSEARCH_THRESHOLD		20
 
+#define BUF_DROP_THRESHOLD		500	/* XXX: fixed cutoff; should scale with NBuffers (e.g. NBuffers / 2) */
+
 typedef struct PrivateRefCountEntry
 {
 	Buffer		buffer;
@@ -2979,11 +2981,14 @@ BufferGetLSNAtomic(Buffer buffer)
  * --------------------------------------------------------------------
  */
 void
-DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber *forkNum,
+DropRelFileNodeBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum,
 					   int nforks, BlockNumber *firstDelBlock)
 {
 	int			i;
 	int			j;
+	RelFileNodeBackend	rnode;
+
+	rnode = smgr_reln->smgr_rnode;
 
 	/* If it's a local relation, it's localbuf.c's problem. */
 	if (RelFileNodeBackendIsTemp(rnode))
@@ -2997,44 +3002,102 @@ DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber *forkNum,
 		return;
 	}
 
-	for (i = 0; i < NBuffers; i++)
+	/*
+	 * Proceed to the normal buffer invalidation process. We only speedup
+	 * this path during recovery, because that's the only timing when we
+	 * can get a valid cached value of blocks for relation. See comment
+	 * in smgrnblocks() in smgr.c.
+	 */
+	if (!InRecovery)
 	{
-		BufferDesc *bufHdr = GetBufferDescriptor(i);
-		uint32		buf_state;
+		for (i = 0; i < NBuffers; i++)
+		{
+			BufferDesc *bufHdr = GetBufferDescriptor(i);
+			uint32		buf_state;
 
-		/*
-		 * We can make this a tad faster by prechecking the buffer tag before
-		 * we attempt to lock the buffer; this saves a lot of lock
-		 * acquisitions in typical cases.  It should be safe because the
-		 * caller must have AccessExclusiveLock on the relation, or some other
-		 * reason to be certain that no one is loading new pages of the rel
-		 * into the buffer pool.  (Otherwise we might well miss such pages
-		 * entirely.)  Therefore, while the tag might be changing while we
-		 * look at it, it can't be changing *to* a value we care about, only
-		 * *away* from such a value.  So false negatives are impossible, and
-		 * false positives are safe because we'll recheck after getting the
-		 * buffer lock.
-		 *
-		 * We could check forkNum and blockNum as well as the rnode, but the
-		 * incremental win from doing so seems small.
-		 */
-		if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
-			continue;
+			/*
+			 * We can make this a tad faster by prechecking the buffer tag before
+			 * we attempt to lock the buffer; this saves a lot of lock
+			 * acquisitions in typical cases.  It should be safe because the
+			 * caller must have AccessExclusiveLock on the relation, or some other
+			 * reason to be certain that no one is loading new pages of the rel
+			 * into the buffer pool.  (Otherwise we might well miss such pages
+			 * entirely.)  Therefore, while the tag might be changing while we
+			 * look at it, it can't be changing *to* a value we care about, only
+			 * *away* from such a value.  So false negatives are impossible, and
+			 * false positives are safe because we'll recheck after getting the
+			 * buffer lock.
+			 *
+			 * We could check forkNum and blockNum as well as the rnode, but the
+			 * incremental win from doing so seems small.
+			 */
+			if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
+				continue;
 
-		buf_state = LockBufHdr(bufHdr);
+			buf_state = LockBufHdr(bufHdr);
 
-		for (j = 0; j < nforks; j++)
+			for (j = 0; j < nforks; j++)
+			{
+				if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
+					bufHdr->tag.forkNum == forkNum[j] &&
+					bufHdr->tag.blockNum >= firstDelBlock[j])
+				{
+					InvalidateBuffer(bufHdr);	/* releases spinlock */
+					break;
+				}
+			}
+			if (j >= nforks)
+				UnlockBufHdr(bufHdr, buf_state);
+		}
+	}
+	else
+	{
+		BufferTag	newTag;			/* identity of requested block */
+		uint32		newHash;		/* hash value for newTag */
+		LWLock	   	*newPartitionLock;	/* buffer partition lock for it */
+		BlockNumber 	reln_nblocks;
+
+		for (i = 0; i < nforks; i++)
 		{
-			if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
-				bufHdr->tag.forkNum == forkNum[j] &&
-				bufHdr->tag.blockNum >= firstDelBlock[j])
+			/* Get the number of blocks for the supplied relation fork */
+			reln_nblocks = smgrnblocks(smgr_reln, forkNum[i]);
+
+			/* create a tag so we can lookup the buffer */
+			INIT_BUFFERTAG(newTag, rnode.node, forkNum[i], reln_nblocks);
+
+			/* determine its hash code and partition lock ID */
+			newHash = BufTableHashCode(&newTag);
+			newPartitionLock = BufMappingPartitionLock(newHash);
+			/* FIXME: when reln_nblocks >= BUF_DROP_THRESHOLD (or smgrnblocks returns InvalidBlockNumber) nothing is invalidated at all; this path must fall back to the full NBuffers scan */
+			if (((int)reln_nblocks) < BUF_DROP_THRESHOLD)
 			{
-				InvalidateBuffer(bufHdr);	/* releases spinlock */
-				break;
+				for (j = firstDelBlock[i]; j < reln_nblocks; j++)
+				{
+					BufferDesc	*bufHdr;
+					uint32		buf_state;
+					int		buf_id;
+					/* Look up this specific block in the buffer pool */
+					INIT_BUFFERTAG(newTag, rnode.node, forkNum[i], j);
+					newHash = BufTableHashCode(&newTag);
+					newPartitionLock = BufMappingPartitionLock(newHash);
+					LWLockAcquire(newPartitionLock, LW_SHARED);
+					buf_id = BufTableLookup(&newTag, newHash);
+					LWLockRelease(newPartitionLock);
+					if (buf_id < 0)
+						continue;	/* block is not in the buffer pool */
+					bufHdr = GetBufferDescriptor(buf_id);
+					buf_state = LockBufHdr(bufHdr);
+
+					/* Recheck under the buffer header spinlock */
+					if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
+						bufHdr->tag.forkNum == forkNum[i] &&
+						bufHdr->tag.blockNum >= firstDelBlock[i])
+						InvalidateBuffer(bufHdr); /* releases spinlock */
+					else
+						UnlockBufHdr(bufHdr, buf_state);
+				}
 			}
 		}
-		if (j >= nforks)
-			UnlockBufHdr(bufHdr, buf_state);
 	}
 }
 
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index dcc09df..5238c6c 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -583,7 +583,7 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nb
 	 * Get rid of any buffers for the about-to-be-deleted blocks. bufmgr will
 	 * just drop them without bothering to write the contents.
 	 */
-	DropRelFileNodeBuffers(reln->smgr_rnode, forknum, nforks, nblocks);
+	DropRelFileNodeBuffers(reln, forknum, nforks, nblocks);
 
 	/*
 	 * Send a shared-inval message to force other backends to close any smgr
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index ee91b8f..da3104c 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -201,9 +201,10 @@ extern BlockNumber RelationGetNumberOfBlocksInFork(Relation relation,
 												   ForkNumber forkNum);
 extern void FlushOneBuffer(Buffer buffer);
 extern void FlushRelationBuffers(Relation rel);
-extern void FlushRelationsAllBuffers(struct SMgrRelationData **smgrs, int nrels);
+extern void FlushRelationsAllBuffers(struct SMgrRelationData **smgrs,
+									 int nrels);
 extern void FlushDatabaseBuffers(Oid dbid);
-extern void DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber *forkNum,
+extern void DropRelFileNodeBuffers(struct SMgrRelationData *smgr_reln, ForkNumber *forkNum,
 								   int nforks, BlockNumber *firstDelBlock);
 extern void DropRelFileNodesAllBuffers(RelFileNodeBackend *rnodes, int nnodes);
 extern void DropDatabaseBuffers(Oid dbid);
-- 
1.8.3.1

