From 0e1b9d8af5ffd7f26d2b3b8e8304b300f73120ef Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Mon, 19 May 2025 13:05:28 -0400
Subject: [PATCH v3] Avoid BufferGetLSNAtomic() calls during nbtree scans.

Avoid (or at least delay) BufferGetLSNAtomic() calls during nbtree
scans.  Avoid them entirely during both index-only scans and bitmap
index scans.  Selectively avoid them during plain index scans by
limiting the calls to cases where they're truly necessary: only call
BufferGetLSNAtomic() when we finish reading a page that actually
contains items that btgettuple will return to the executor proper.
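
In outline (this mirrors the new _bt_drop_lock_and_maybe_pin() in the
nbtsearch.c hunk below), the page LSN is now only read on the path that
actually drops the leaf page pin:

    if (!so->drop_pin)
    {
        /* Just drop the lock (not the pin); no LSN read is needed */
        _bt_unlockbuf(rel, so->currPos.buf);
        return;
    }

    /* Dropping the pin: save the page LSN for _bt_killitems' TID recycling check */
    so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
    _bt_relbuf(rel, so->currPos.buf);
    so->currPos.buf = InvalidBuffer;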

Currently, when checksums (or wal_log_hints) are enabled, acquiring a
page's LSN in BufferGetLSNAtomic() requires locking the buffer header,
which means taking a spinlock.  Testing has shown that enabling
page-level checksums can cause large regressions with certain workloads,
especially on large multi-socket systems.  The underlying issue wasn't
caused by any Postgres 18 commit.  However, Postgres 18 commit 04bec894
made initdb use checksums by default, so on balance it seems prudent to
address the problem now, as an open item for Postgres 18.
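
For reference, the buffer-header locking happens inside bufmgr.c's
BufferGetLSNAtomic(); its logic is approximately as follows (a paraphrase
for illustration, not part of this patch):

    XLogRecPtr
    BufferGetLSNAtomic(Buffer buffer)
    {
        Page        page = BufferGetPage(buffer);
        BufferDesc *bufHdr;
        XLogRecPtr  lsn;
        uint32      buf_state;

        /* Without checksums/wal_log_hints, an unlocked read suffices */
        if (!XLogHintBitIsNeeded() || BufferIsLocal(buffer))
            return PageGetLSN(page);

        /* Otherwise the buffer header spinlock is taken to read the LSN */
        bufHdr = GetBufferDescriptor(buffer - 1);
        buf_state = LockBufHdr(bufHdr);
        lsn = PageGetLSN(page);
        UnlockBufHdr(bufHdr, buf_state);

        return lsn;
    }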

Author: Peter Geoghegan <pg@bowt.ie>
Reviewed-By: Tomas Vondra <tomas@vondra.me>
Discussion: https://postgr.es/m/941f0190-e3c6-4622-9ac7-c04e936e5fdb@vondra.me
Discussion: https://postgr.es/m/CAH2-Wzk-Dg5XWs_jDuiHt4_7ryrSY+n=vxmHY51EVqPDFsKXmg@mail.gmail.com
---
 src/include/access/nbtree.h           |  3 +-
 src/backend/access/nbtree/nbtree.c    | 30 +++++++++++++++
 src/backend/access/nbtree/nbtsearch.c | 55 +++++++++++++++------------
 src/backend/access/nbtree/nbtutils.c  | 19 +++++----
 4 files changed, 74 insertions(+), 33 deletions(-)

diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index ebca02588..a6c4ed7fd 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -967,7 +967,7 @@ typedef struct BTScanPosData
 	BlockNumber currPage;		/* page referenced by items array */
 	BlockNumber prevPage;		/* currPage's left link */
 	BlockNumber nextPage;		/* currPage's right link */
-	XLogRecPtr	lsn;			/* currPage's LSN */
+	XLogRecPtr	lsn;			/* currPage's LSN (when so.drop_pin) */
 
 	/* scan direction for the saved position's call to _bt_readpage */
 	ScanDirection dir;
@@ -1054,6 +1054,7 @@ typedef struct BTScanOpaqueData
 {
 	/* these fields are set by _bt_preprocess_keys(): */
 	bool		qual_ok;		/* false if qual can never be satisfied */
+	bool		drop_pin;		/* drop leaf pin before btgettuple returns? */
 	int			numberOfKeys;	/* number of preprocessed scan keys */
 	ScanKey		keyData;		/* array of preprocessed scan keys */
 
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 765659887..e4b94f08b 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -228,6 +228,8 @@ btgettuple(IndexScanDesc scan, ScanDirection dir)
 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
 	bool		res;
 
+	Assert(scan->heapRelation != NULL);
+
 	/* btree indexes are never lossy */
 	scan->xs_recheck = false;
 
@@ -289,6 +291,8 @@ btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
 	int64		ntids = 0;
 	ItemPointer heapTid;
 
+	Assert(scan->heapRelation == NULL);
+
 	/* Each loop iteration performs another primitive index scan */
 	do
 	{
@@ -393,6 +397,32 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 		BTScanPosInvalidate(so->currPos);
 	}
 
+	/*
+	 * We prefer to eagerly drop leaf page pins before btgettuple returns.
+	 * This avoids making VACUUM wait to acquire a cleanup lock on the page.
+	 *
+	 * We cannot safely drop leaf page pins during index-only scans due to a
+	 * race condition involving VACUUM setting pages all-visible in the VM.
+	 * It's also unsafe for plain index scans that use a non-MVCC snapshot.
+	 *
+	 * When we drop pins eagerly, the mechanism that marks so->killedItems[]
+	 * index tuples LP_DEAD has to deal with concurrent TID recycling races.
+	 * The scheme used to detect unsafe TID recycling won't work when scanning
+	 * unlogged relations (since it involves saving an affected page's LSN).
+	 * Opt out of eager pin dropping during unlogged relation scans for now
+	 * (this is preferable to opting out of kill_prior_tuple LP_DEAD setting).
+	 *
+	 * Also opt out of dropping leaf page pins eagerly during bitmap scans.
+	 * Pins cannot be held for more than an instant during bitmap scans either
+	 * way, so we might as well avoid wasting cycles on acquiring page LSNs.
+	 *
+	 * See nbtree/README section on making concurrent TID recycling safe.
+	 */
+	so->drop_pin = (!scan->xs_want_itup &&
+					IsMVCCSnapshot(scan->xs_snapshot) &&
+					RelationNeedsWAL(scan->indexRelation) &&
+					scan->heapRelation != NULL);
+
 	so->markItemIndex = -1;
 	so->needPrimScan = false;
 	so->scanBehind = false;
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index fe9a38869..1af65ad1f 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -25,7 +25,7 @@
 #include "utils/rel.h"
 
 
-static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
+static inline void _bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so);
 static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key,
 							Buffer buf, bool forupdate, BTStack stack,
 							int access);
@@ -57,24 +57,29 @@ static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
 /*
  *	_bt_drop_lock_and_maybe_pin()
  *
- * Unlock the buffer; and if it is safe to release the pin, do that, too.
- * This will prevent vacuum from stalling in a blocked state trying to read a
- * page when a cursor is sitting on it.
- *
- * See nbtree/README section on making concurrent TID recycling safe.
+ * Unlock so->currPos.buf.  If so->drop_pin has been set, drop the pin, too.
+ * Dropping the pin prevents VACUUM from blocking on acquiring a cleanup lock.
  */
-static void
-_bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
+static inline void
+_bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so)
 {
-	_bt_unlockbuf(scan->indexRelation, sp->buf);
-
-	if (IsMVCCSnapshot(scan->xs_snapshot) &&
-		RelationNeedsWAL(scan->indexRelation) &&
-		!scan->xs_want_itup)
+	if (!so->drop_pin)
 	{
-		ReleaseBuffer(sp->buf);
-		sp->buf = InvalidBuffer;
+		/* Just drop the lock (not the pin) */
+		_bt_unlockbuf(rel, so->currPos.buf);
+		return;
 	}
+
+	/*
+	 * Drop both the lock and the pin.
+	 *
+	 * Have to set so->currPos.lsn so that _bt_killitems has a way to detect
+	 * when concurrent heap TID recycling by VACUUM might have taken place.
+	 */
+	Assert(RelationNeedsWAL(rel));
+	so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
+	_bt_relbuf(rel, so->currPos.buf);
+	so->currPos.buf = InvalidBuffer;
 }
 
 /*
@@ -1610,7 +1615,13 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
 	so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);
 	so->currPos.prevPage = opaque->btpo_prev;
 	so->currPos.nextPage = opaque->btpo_next;
+	/* delay setting so->currPos.lsn until _bt_drop_lock_and_maybe_pin */
+	so->currPos.dir = dir;
+	so->currPos.nextTupleOffset = 0;
 
+	/* either moreRight or moreLeft should be set now (may be unset later) */
+	Assert(ScanDirectionIsForward(dir) ? so->currPos.moreRight :
+		   so->currPos.moreLeft);
 	Assert(!P_IGNORE(opaque));
 	Assert(BTScanPosIsPinned(so->currPos));
 	Assert(!so->needPrimScan);
@@ -1626,14 +1637,6 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
 								 so->currPos.currPage);
 	}
 
-	/* initialize remaining currPos fields related to current page */
-	so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
-	so->currPos.dir = dir;
-	so->currPos.nextTupleOffset = 0;
-	/* either moreLeft or moreRight should be set now (may be unset later) */
-	Assert(ScanDirectionIsForward(dir) ? so->currPos.moreRight :
-		   so->currPos.moreLeft);
-
 	PredicateLockPage(rel, so->currPos.currPage, scan->xs_snapshot);
 
 	/* initialize local variables */
@@ -2251,12 +2254,14 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
 	 */
 	if (_bt_readpage(scan, dir, offnum, true))
 	{
+		Relation	rel = scan->indexRelation;
+
 		/*
 		 * _bt_readpage succeeded.  Drop the lock (and maybe the pin) on
 		 * so->currPos.buf in preparation for btgettuple returning tuples.
 		 */
 		Assert(BTScanPosIsPinned(so->currPos));
-		_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+		_bt_drop_lock_and_maybe_pin(rel, so);
 		return true;
 	}
 
@@ -2413,7 +2418,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
 	 */
 	Assert(so->currPos.currPage == blkno);
 	Assert(BTScanPosIsPinned(so->currPos));
-	_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+	_bt_drop_lock_and_maybe_pin(rel, so);
 
 	return true;
 }
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 1a15dfcb7..b63d1fc57 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -3364,9 +3364,10 @@ _bt_killitems(IndexScanDesc scan)
 	int			i;
 	int			numKilled = so->numKilled;
 	bool		killedsomething = false;
-	bool		droppedpin PG_USED_FOR_ASSERTS_ONLY;
 
+	Assert(numKilled > 0);
 	Assert(BTScanPosIsValid(so->currPos));
+	Assert(scan->heapRelation != NULL); /* can't be a bitmap index scan */
 
 	/*
 	 * Always reset the scan state, so we don't look for same items on other
@@ -3374,7 +3375,7 @@ _bt_killitems(IndexScanDesc scan)
 	 */
 	so->numKilled = 0;
 
-	if (BTScanPosIsPinned(so->currPos))
+	if (!so->drop_pin)
 	{
 		/*
 		 * We have held the pin on this page since we read the index tuples,
@@ -3382,7 +3383,7 @@ _bt_killitems(IndexScanDesc scan)
 		 * re-use of any TID on the page, so there is no need to check the
 		 * LSN.
 		 */
-		droppedpin = false;
+		Assert(BTScanPosIsPinned(so->currPos));
 		_bt_lockbuf(scan->indexRelation, so->currPos.buf, BT_READ);
 
 		page = BufferGetPage(so->currPos.buf);
@@ -3390,13 +3391,17 @@ _bt_killitems(IndexScanDesc scan)
 	else
 	{
 		Buffer		buf;
+		XLogRecPtr	latestlsn;
 
-		droppedpin = true;
 		/* Attempt to re-read the buffer, getting pin and lock. */
+		Assert(!BTScanPosIsPinned(so->currPos));
 		buf = _bt_getbuf(scan->indexRelation, so->currPos.currPage, BT_READ);
 
 		page = BufferGetPage(buf);
-		if (BufferGetLSNAtomic(buf) == so->currPos.lsn)
+		latestlsn = BufferGetLSNAtomic(buf);
+		Assert(!XLogRecPtrIsInvalid(so->currPos.lsn));
+		Assert(so->currPos.lsn <= latestlsn);
+		if (so->currPos.lsn == latestlsn)
 			so->currPos.buf = buf;
 		else
 		{
@@ -3442,7 +3447,7 @@ _bt_killitems(IndexScanDesc scan)
 				 * correctness.
 				 *
 				 * Note that the page may have been modified in almost any way
-				 * since we first read it (in the !droppedpin case), so it's
+				 * since we first read it (in the !so->drop_pin case), so it's
 				 * possible that this posting list tuple wasn't a posting list
 				 * tuple when we first encountered its heap TIDs.
 				 */
@@ -3458,7 +3463,7 @@ _bt_killitems(IndexScanDesc scan)
 					 * though only in the common case where the page can't
 					 * have been concurrently modified
 					 */
-					Assert(kitem->indexOffset == offnum || !droppedpin);
+					Assert(kitem->indexOffset == offnum || !so->drop_pin);
 
 					/*
 					 * Read-ahead to later kitems here.
-- 
2.49.0

