From 25cac32ee1561c27a0a5eee5ff929d9c44bb1ffc Mon Sep 17 00:00:00 2001
From: Kirk Jamison <k.jamison@jp.fujitsu.com>
Date: Wed, 2 Sep 2020 02:57:26 +0000
Subject: [PATCH] Speedup dropping of relation buffers during recovery
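
During recovery, smgrnblocks() can return a cached value for the number of
blocks in a relation fork (see the comment in that function).  Use that in
DropRelFileNodeBuffers(): instead of scanning all of shared buffers, look up
each block past the truncation point in the buffer mapping table and
invalidate only the buffers that are found, as long as the fork is smaller
than BUF_DROP_FULLSCAN_THRESHOLD (NBuffers / 2).  Larger forks, and the
non-recovery case, keep using the full scan.

To give DropRelFileNodeBuffers() access to the cached size, pass it the
SMgrRelation instead of the RelFileNodeBackend.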

---
 src/backend/storage/buffer/bufmgr.c | 148 ++++++++++++++++++++++++++++--------
 src/backend/storage/smgr/smgr.c     |   2 +-
 src/include/storage/bufmgr.h        |   2 +-
 3 files changed, 119 insertions(+), 33 deletions(-)

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index a2a963b..2b3f08c 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -70,6 +70,8 @@
 
 #define RELS_BSEARCH_THRESHOLD		20
 
+#define BUF_DROP_FULLSCAN_THRESHOLD		(NBuffers / 2)	/* relation size at which to fall back to a full scan */
+
 typedef struct PrivateRefCountEntry
 {
 	Buffer		buffer;
@@ -2979,11 +2981,17 @@ BufferGetLSNAtomic(Buffer buffer)
  * --------------------------------------------------------------------
  */
 void
-DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber *forkNum,
+DropRelFileNodeBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum,
 					   int nforks, BlockNumber *firstDelBlock)
 {
 	int			i;
 	int			j;
+	RelFileNodeBackend rnode;
+	BufferDesc *bufHdr;
+	uint32		buf_state;
+
+	rnode = smgr_reln->smgr_rnode;
 
 	/* If it's a local relation, it's localbuf.c's problem. */
 	if (RelFileNodeBackendIsTemp(rnode))
@@ -2997,44 +3005,122 @@ DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber *forkNum,
 		return;
 	}
 
-	for (i = 0; i < NBuffers; i++)
+	/*
+	 * We optimize this path only during recovery, because that is the only
+	 * time we can rely on smgrnblocks() returning a valid cached value for
+	 * the relation's number of blocks.  See the comment in smgrnblocks()
+	 * in smgr.c.  Otherwise, proceed with the usual full scan of the
+	 * buffer pool.
+	 */
+	if (!InRecovery)
 	{
-		BufferDesc *bufHdr = GetBufferDescriptor(i);
-		uint32		buf_state;
+		for (i = 0; i < NBuffers; i++)
+		{
+			bufHdr = GetBufferDescriptor(i);
 
-		/*
-		 * We can make this a tad faster by prechecking the buffer tag before
-		 * we attempt to lock the buffer; this saves a lot of lock
-		 * acquisitions in typical cases.  It should be safe because the
-		 * caller must have AccessExclusiveLock on the relation, or some other
-		 * reason to be certain that no one is loading new pages of the rel
-		 * into the buffer pool.  (Otherwise we might well miss such pages
-		 * entirely.)  Therefore, while the tag might be changing while we
-		 * look at it, it can't be changing *to* a value we care about, only
-		 * *away* from such a value.  So false negatives are impossible, and
-		 * false positives are safe because we'll recheck after getting the
-		 * buffer lock.
-		 *
-		 * We could check forkNum and blockNum as well as the rnode, but the
-		 * incremental win from doing so seems small.
-		 */
-		if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
-			continue;
+			/*
+			 * We can make this a tad faster by prechecking the buffer tag before
+			 * we attempt to lock the buffer; this saves a lot of lock
+			 * acquisitions in typical cases.  It should be safe because the
+			 * caller must have AccessExclusiveLock on the relation, or some other
+			 * reason to be certain that no one is loading new pages of the rel
+			 * into the buffer pool.  (Otherwise we might well miss such pages
+			 * entirely.)  Therefore, while the tag might be changing while we
+			 * look at it, it can't be changing *to* a value we care about, only
+			 * *away* from such a value.  So false negatives are impossible, and
+			 * false positives are safe because we'll recheck after getting the
+			 * buffer lock.
+			 *
+			 * We could check forkNum and blockNum as well as the rnode, but the
+			 * incremental win from doing so seems small.
+			 */
+			if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
+				continue;
 
-		buf_state = LockBufHdr(bufHdr);
+			buf_state = LockBufHdr(bufHdr);
+
+			for (j = 0; j < nforks; j++)
+			{
+				if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
+					bufHdr->tag.forkNum == forkNum[j] &&
+					bufHdr->tag.blockNum >= firstDelBlock[j])
+				{
+					InvalidateBuffer(bufHdr);	/* releases spinlock */
+					break;
+				}
+			}
+			if (j >= nforks)
+				UnlockBufHdr(bufHdr, buf_state);
+		}
+	}
+	else
+	{
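+		/*
+		 * Look up each block to be dropped in the buffer mapping table and
+		 * invalidate just the buffers we find, rather than scanning all of
+		 * shared buffers.  For large forks a sequential scan of the buffer
+		 * pool is cheaper, so fall back to it at BUF_DROP_FULLSCAN_THRESHOLD.
+		 */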
+		BufferTag	newTag;			/* identity of requested block */
+		uint32		newHash;		/* hash value for newTag */
+		LWLock	   *newPartitionLock;	/* buffer partition lock for it */
+		BlockNumber	reln_nblocks;
+		BlockNumber	curBlock;
 
-		for (j = 0; j < nforks; j++)
+		for (i = 0; i < nforks; i++)
 		{
-			if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
-				bufHdr->tag.forkNum == forkNum[j] &&
-				bufHdr->tag.blockNum >= firstDelBlock[j])
+			/* Get the number of blocks for the supplied relation fork */
+			reln_nblocks = smgrnblocks(smgr_reln, forkNum[i]);
+
+			if ((int) reln_nblocks < BUF_DROP_FULLSCAN_THRESHOLD)
 			{
-				InvalidateBuffer(bufHdr);	/* releases spinlock */
-				break;
+				for (curBlock = firstDelBlock[i]; curBlock < reln_nblocks; curBlock++)
+				{
+					int		buf_id;
+
+					/* create a tag so we can lookup the buffer */
+					INIT_BUFFERTAG(newTag, rnode.node, forkNum[i], curBlock);
+
+					/* determine its hash code and partition lock ID */
+					newHash = BufTableHashCode(&newTag);
+					newPartitionLock = BufMappingPartitionLock(newHash);
+
+					/* Check whether the block is in the buffer pool */
+					LWLockAcquire(newPartitionLock, LW_SHARED);
+					buf_id = BufTableLookup(&newTag, newHash);
+					LWLockRelease(newPartitionLock);
+
+					/* block is not in shared buffers, nothing to do */
+					if (buf_id < 0)
+						continue;
+
+					bufHdr = GetBufferDescriptor(buf_id);
+
+					/*
+					 * Recheck the tag under the buffer header spinlock; the
+					 * buffer could have been evicted and reused for another
+					 * page after we released the partition lock.
+					 */
+					buf_state = LockBufHdr(bufHdr);
+
+					if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
+						bufHdr->tag.forkNum == forkNum[i] &&
+						bufHdr->tag.blockNum >= firstDelBlock[i])
+						InvalidateBuffer(bufHdr);	/* releases spinlock */
+					else
+						UnlockBufHdr(bufHdr, buf_state);
+				}
 			}
+			else
+			{
+				/*
+				 * Too many blocks to probe individually, so scan the whole
+				 * buffer pool for this fork instead.
+				 */
+				for (j = 0; j < NBuffers; j++)
+				{
+					bufHdr = GetBufferDescriptor(j);
+
+					/* unlocked precheck, see comment in the !InRecovery path */
+					if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
+						continue;
+
+					buf_state = LockBufHdr(bufHdr);
+
+					if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
+						bufHdr->tag.forkNum == forkNum[i] &&
+						bufHdr->tag.blockNum >= firstDelBlock[i])
+						InvalidateBuffer(bufHdr);	/* releases spinlock */
+					else
+						UnlockBufHdr(bufHdr, buf_state);
+				}
+			}
 		}
-		if (j >= nforks)
-			UnlockBufHdr(bufHdr, buf_state);
 	}
 }
 
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index dcc09df..5238c6c 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -583,7 +583,7 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nb
 	 * Get rid of any buffers for the about-to-be-deleted blocks. bufmgr will
 	 * just drop them without bothering to write the contents.
 	 */
-	DropRelFileNodeBuffers(reln->smgr_rnode, forknum, nforks, nblocks);
+	DropRelFileNodeBuffers(reln, forknum, nforks, nblocks);
 
 	/*
 	 * Send a shared-inval message to force other backends to close any smgr
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index ee91b8f..056f65e 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -203,7 +203,7 @@ extern void FlushOneBuffer(Buffer buffer);
 extern void FlushRelationBuffers(Relation rel);
 extern void FlushRelationsAllBuffers(struct SMgrRelationData **smgrs, int nrels);
 extern void FlushDatabaseBuffers(Oid dbid);
-extern void DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber *forkNum,
+extern void DropRelFileNodeBuffers(struct SMgrRelationData *smgr_reln, ForkNumber *forkNum,
 								   int nforks, BlockNumber *firstDelBlock);
 extern void DropRelFileNodesAllBuffers(RelFileNodeBackend *rnodes, int nnodes);
 extern void DropDatabaseBuffers(Oid dbid);
-- 
1.8.3.1
