From 957019074b42bcf0332a353634e8d37b52bd974a Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Mon, 19 May 2025 13:05:28 -0400
Subject: [PATCH v2] Avoid BufferGetLSNAtomic() calls during nbtree scans.

Avoid BufferGetLSNAtomic() calls during nbtree index-only scans, bitmap
index scans, and reads of pages that won't need to return items to the
scan via the btgettuple interface.
---
 src/include/access/nbtree.h                   |  3 +-
 src/backend/access/nbtree/nbtpreprocesskeys.c | 13 +++++++
 src/backend/access/nbtree/nbtree.c            |  4 ++
 src/backend/access/nbtree/nbtsearch.c         | 39 ++++++++++++-------
 src/backend/access/nbtree/nbtutils.c          | 12 +++---
 5 files changed, 50 insertions(+), 21 deletions(-)

diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index ebca02588..1b0f143cf 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -967,7 +967,7 @@ typedef struct BTScanPosData
 	BlockNumber currPage;		/* page referenced by items array */
 	BlockNumber prevPage;		/* currPage's left link */
 	BlockNumber nextPage;		/* currPage's right link */
-	XLogRecPtr	lsn;			/* currPage's LSN */
+	XLogRecPtr	lsn;			/* currPage's LSN (iff so->drop_pin) */
 
 	/* scan direction for the saved position's call to _bt_readpage */
 	ScanDirection dir;
@@ -1054,6 +1054,7 @@ typedef struct BTScanOpaqueData
 {
 	/* these fields are set by _bt_preprocess_keys(): */
 	bool		qual_ok;		/* false if qual can never be satisfied */
+	bool		drop_pin;		/* true if it's safe to drop leaf page pins */
 	int			numberOfKeys;	/* number of preprocessed scan keys */
 	ScanKey		keyData;		/* array of preprocessed scan keys */
 
diff --git a/src/backend/access/nbtree/nbtpreprocesskeys.c b/src/backend/access/nbtree/nbtpreprocesskeys.c
index a136e4bbf..ae48f49d7 100644
--- a/src/backend/access/nbtree/nbtpreprocesskeys.c
+++ b/src/backend/access/nbtree/nbtpreprocesskeys.c
@@ -205,6 +205,19 @@ _bt_preprocess_keys(IndexScanDesc scan)
 
 	/* initialize result variables */
 	so->qual_ok = true;
+
+	/*
+	 * We prefer to eagerly drop a leaf page pin to avoid blocking concurrent
+	 * heap TID recycling by VACUUM.  But we cannot safely drop leaf page pins
+	 * during index-only scans, nor scans of unlogged relations, where checking
+	 * if the page's LSN changed while the pin was dropped isn't sufficient.
+	 * (Setting so->drop_pin=true doesn't meaningfully affect the behavior of
+	 * bitmap index scans, so they always set it 'false' to avoid needlessly
+	 * calling BufferGetLSNAtomic from _bt_readpage.)
+	 */
+	so->drop_pin = (IsMVCCSnapshot(scan->xs_snapshot) &&
+					RelationNeedsWAL(scan->indexRelation) &&
+					!scan->xs_want_itup && scan->heapRelation);
 	so->numberOfKeys = 0;
 
 	if (numberOfKeys < 1)
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 765659887..aad896327 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -228,6 +228,8 @@ btgettuple(IndexScanDesc scan, ScanDirection dir)
 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
 	bool		res;
 
+	Assert(scan->heapRelation != NULL);
+
 	/* btree indexes are never lossy */
 	scan->xs_recheck = false;
 
@@ -289,6 +291,8 @@ btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
 	int64		ntids = 0;
 	ItemPointer heapTid;
 
+	Assert(scan->heapRelation == NULL);
+
 	/* Each loop iteration performs another primitive index scan */
 	do
 	{
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index fe9a38869..851825a64 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -25,7 +25,8 @@
 #include "utils/rel.h"
 
 
-static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
+static inline void _bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so,
+											   BTScanPos sp);
 static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key,
 							Buffer buf, bool forupdate, BTStack stack,
 							int access);
@@ -63,18 +64,26 @@ static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
  *
  * See nbtree/README section on making concurrent TID recycling safe.
  */
-static void
-_bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
+static inline void
+_bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so, BTScanPos sp)
 {
-	_bt_unlockbuf(scan->indexRelation, sp->buf);
-
-	if (IsMVCCSnapshot(scan->xs_snapshot) &&
-		RelationNeedsWAL(scan->indexRelation) &&
-		!scan->xs_want_itup)
+	if (!so->drop_pin)
 	{
-		ReleaseBuffer(sp->buf);
-		sp->buf = InvalidBuffer;
+		/* Just drop the lock (not the pin) */
+		_bt_unlockbuf(rel, sp->buf);
+		return;
 	}
+
+	/*
+	 * Drop the lock and the pin.
+	 *
+	 * Have to establish so->currPos.lsn to provide _bt_killitems with an
+	 * alternative mechanism to ensure that no unsafe concurrent heap TID
+	 * recycling has taken place.
+	 */
+	so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
+	_bt_relbuf(rel, so->currPos.buf);
+	sp->buf = InvalidBuffer;
 }
 
 /*
@@ -1610,6 +1619,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
 	so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);
 	so->currPos.prevPage = opaque->btpo_prev;
 	so->currPos.nextPage = opaque->btpo_next;
+	/* delay setting so->currPos.lsn until _bt_drop_lock_and_maybe_pin */
 
 	Assert(!P_IGNORE(opaque));
 	Assert(BTScanPosIsPinned(so->currPos));
@@ -1626,8 +1636,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
 								 so->currPos.currPage);
 	}
 
-	/* initialize remaining currPos fields related to current page */
-	so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
+	/* initialize most remaining currPos fields related to current page */
 	so->currPos.dir = dir;
 	so->currPos.nextTupleOffset = 0;
 	/* either moreLeft or moreRight should be set now (may be unset later) */
@@ -2251,12 +2260,14 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
 	 */
 	if (_bt_readpage(scan, dir, offnum, true))
 	{
+		Relation	rel = scan->indexRelation;
+
 		/*
 		 * _bt_readpage succeeded.  Drop the lock (and maybe the pin) on
 		 * so->currPos.buf in preparation for btgettuple returning tuples.
 		 */
 		Assert(BTScanPosIsPinned(so->currPos));
-		_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+		_bt_drop_lock_and_maybe_pin(rel, so, &so->currPos);
 		return true;
 	}
 
@@ -2413,7 +2424,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
 	 */
 	Assert(so->currPos.currPage == blkno);
 	Assert(BTScanPosIsPinned(so->currPos));
-	_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+	_bt_drop_lock_and_maybe_pin(rel, so, &so->currPos);
 
 	return true;
 }
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 1a15dfcb7..8b02f2c7f 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -3364,9 +3364,9 @@ _bt_killitems(IndexScanDesc scan)
 	int			i;
 	int			numKilled = so->numKilled;
 	bool		killedsomething = false;
-	bool		droppedpin PG_USED_FOR_ASSERTS_ONLY;
 
 	Assert(BTScanPosIsValid(so->currPos));
+	Assert(scan->heapRelation != NULL); /* can't be a bitmap index scan */
 
 	/*
 	 * Always reset the scan state, so we don't look for same items on other
@@ -3374,7 +3374,7 @@ _bt_killitems(IndexScanDesc scan)
 	 */
 	so->numKilled = 0;
 
-	if (BTScanPosIsPinned(so->currPos))
+	if (!so->drop_pin)
 	{
 		/*
 		 * We have held the pin on this page since we read the index tuples,
@@ -3382,7 +3382,7 @@ _bt_killitems(IndexScanDesc scan)
 		 * re-use of any TID on the page, so there is no need to check the
 		 * LSN.
 		 */
-		droppedpin = false;
+		Assert(BTScanPosIsPinned(so->currPos));
 		_bt_lockbuf(scan->indexRelation, so->currPos.buf, BT_READ);
 
 		page = BufferGetPage(so->currPos.buf);
@@ -3391,8 +3391,8 @@ _bt_killitems(IndexScanDesc scan)
 	{
 		Buffer		buf;
 
-		droppedpin = true;
 		/* Attempt to re-read the buffer, getting pin and lock. */
+		Assert(!BTScanPosIsPinned(so->currPos));
 		buf = _bt_getbuf(scan->indexRelation, so->currPos.currPage, BT_READ);
 
 		page = BufferGetPage(buf);
@@ -3442,7 +3442,7 @@ _bt_killitems(IndexScanDesc scan)
 				 * correctness.
 				 *
 				 * Note that the page may have been modified in almost any way
-				 * since we first read it (in the !droppedpin case), so it's
+				 * since we first read it (in the !so->drop_pin case), so it's
 				 * possible that this posting list tuple wasn't a posting list
 				 * tuple when we first encountered its heap TIDs.
 				 */
@@ -3458,7 +3458,7 @@ _bt_killitems(IndexScanDesc scan)
 					 * though only in the common case where the page can't
 					 * have been concurrently modified
 					 */
-					Assert(kitem->indexOffset == offnum || !droppedpin);
+					Assert(kitem->indexOffset == offnum || !so->drop_pin);
 
 					/*
 					 * Read-ahead to later kitems here.
-- 
2.49.0

