From c8aec56cfadcffac78a61568cf5903426c82b97d Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Mon, 9 Jun 2025 09:57:30 -0400
Subject: [PATCH v2] Fix issue with mark/restore nbtree pins.

---
 src/include/access/nbtree.h           |  5 --
 src/backend/access/nbtree/nbtree.c    | 75 +++++++++++++++++----------
 src/backend/access/nbtree/nbtsearch.c | 72 ++++++++++++++++++-------
 src/backend/access/nbtree/nbtutils.c  | 25 +++++----
 4 files changed, 115 insertions(+), 62 deletions(-)

diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index e709d2e0a..fe0b805d4 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -1012,11 +1012,6 @@ typedef BTScanPosData *BTScanPos;
 		ReleaseBuffer((scanpos).buf); \
 		(scanpos).buf = InvalidBuffer; \
 	} while (0)
-#define BTScanPosUnpinIfPinned(scanpos) \
-	do { \
-		if (BTScanPosIsPinned(scanpos)) \
-			BTScanPosUnpin(scanpos); \
-	} while (0)
 
 #define BTScanPosIsValid(scanpos) \
 ( \
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 03a1d7b02..04dab0703 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -387,16 +387,6 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 {
 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
 
-	/* we aren't holding any read locks, but gotta drop the pins */
-	if (BTScanPosIsValid(so->currPos))
-	{
-		/* Before leaving current page, deal with any killed items */
-		if (so->numKilled > 0)
-			_bt_killitems(scan);
-		BTScanPosUnpinIfPinned(so->currPos);
-		BTScanPosInvalidate(so->currPos);
-	}
-
 	/*
 	 * We prefer to eagerly drop leaf page pins before btgettuple returns.
 	 * This avoids making VACUUM wait to acquire a cleanup lock on the page.
@@ -416,6 +406,8 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 	 * Pins cannot be held for more than an instant during bitmap scans either
 	 * way, so we might as well avoid wasting cycles on acquiring page LSNs.
 	 *
+	 * Note: so->dropPin should never change across rescans.
+	 *
 	 * See nbtree/README section on making concurrent TID recycling safe.
 	 */
 	so->dropPin = (!scan->xs_want_itup &&
@@ -423,12 +415,27 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 				   RelationNeedsWAL(scan->indexRelation) &&
 				   scan->heapRelation != NULL);
 
-	so->markItemIndex = -1;
+	/* we aren't holding any read locks, but gotta drop the pins */
+	if (BTScanPosIsValid(so->currPos))
+	{
+		/* Before leaving current page, deal with any killed items */
+		if (so->numKilled > 0)
+			_bt_killitems(scan);
+		if (!so->dropPin)
+			BTScanPosUnpin(so->currPos);
+		BTScanPosInvalidate(so->currPos);
+	}
+
 	so->needPrimScan = false;
 	so->scanBehind = false;
 	so->oppositeDirCheck = false;
-	BTScanPosUnpinIfPinned(so->markPos);
-	BTScanPosInvalidate(so->markPos);
+	so->markItemIndex = -1;
+	if (BTScanPosIsValid(so->markPos))
+	{
+		if (!so->dropPin)
+			BTScanPosUnpin(so->markPos);
+		BTScanPosInvalidate(so->markPos);
+	}
 
 	/*
 	 * Allocate tuple workspace arrays, if needed for an index-only scan and
@@ -475,11 +482,16 @@ btendscan(IndexScanDesc scan)
 		/* Before leaving current page, deal with any killed items */
 		if (so->numKilled > 0)
 			_bt_killitems(scan);
-		BTScanPosUnpinIfPinned(so->currPos);
+		if (!so->dropPin)
+			BTScanPosUnpin(so->currPos);
 	}
 
 	so->markItemIndex = -1;
-	BTScanPosUnpinIfPinned(so->markPos);
+	if (BTScanPosIsValid(so->markPos))
+	{
+		if (!so->dropPin)
+			BTScanPosUnpin(so->markPos);
+	}
 
 	/* No need to invalidate positions, the RAM is about to be freed. */
 
@@ -505,8 +517,14 @@ btmarkpos(IndexScanDesc scan)
 {
 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
 
-	/* There may be an old mark with a pin (but no lock). */
-	BTScanPosUnpinIfPinned(so->markPos);
+	/* Always invalidate any existing mark */
+	if (BTScanPosIsValid(so->markPos))
+	{
+		if (!so->dropPin)
+			BTScanPosUnpin(so->markPos);
+		BTScanPosInvalidate(so->markPos);
+	}
+	so->markItemIndex = -1;
 
 	/*
 	 * Just record the current itemIndex.  If we later step to next page
@@ -516,11 +534,6 @@ btmarkpos(IndexScanDesc scan)
 	 */
 	if (BTScanPosIsValid(so->currPos))
 		so->markItemIndex = so->currPos.itemIndex;
-	else
-	{
-		BTScanPosInvalidate(so->markPos);
-		so->markItemIndex = -1;
-	}
 }
 
 /*
@@ -555,14 +568,13 @@ btrestrpos(IndexScanDesc scan)
 			/* Before leaving current page, deal with any killed items */
 			if (so->numKilled > 0)
 				_bt_killitems(scan);
-			BTScanPosUnpinIfPinned(so->currPos);
+			if (!so->dropPin)
+				BTScanPosUnpin(so->currPos);
+			BTScanPosInvalidate(so->currPos);
 		}
 
 		if (BTScanPosIsValid(so->markPos))
 		{
-			/* bump pin on mark buffer for assignment to current buffer */
-			if (BTScanPosIsPinned(so->markPos))
-				IncrBufferRefCount(so->markPos.buf);
 			memcpy(&so->currPos, &so->markPos,
 				   offsetof(BTScanPosData, items[1]) +
 				   so->markPos.lastItem * sizeof(BTScanPosItem));
@@ -575,9 +587,16 @@ btrestrpos(IndexScanDesc scan)
 				_bt_start_array_keys(scan, so->currPos.dir);
 				so->needPrimScan = false;
 			}
+
+			/*
+			 * We need to keep the mark around, in case we need to restore it
+			 * again.  But we don't need to use the so->markPos representation
+			 * (the so->markItemIndex representation suffices, now that the
+			 * page that so->currPos references is this mark's page).
+			 */
+			so->markItemIndex = so->currPos.itemIndex;
+			BTScanPosInvalidate(so->markPos);
 		}
-		else
-			BTScanPosInvalidate(so->currPos);
 	}
 }
 
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 070f14c8b..d443b1abd 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -2109,6 +2109,7 @@ _bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
  *	_bt_steppage() -- Step to next page containing valid data for scan
  *
  * Wrapper on _bt_readnextpage that performs final steps for the current page.
+ * Only called when _bt_drop_lock_and_maybe_pin was called for so->currPos.
  *
  * On entry, so->currPos must be valid.  Its buffer will be pinned, though
  * never locked. (Actually, when so->dropPin there won't even be a pin held,
@@ -2122,6 +2123,10 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
 				lastcurrblkno;
 
 	Assert(BTScanPosIsValid(so->currPos));
+	if (!so->dropPin)
+		Assert(BTScanPosIsPinned(so->currPos));
+	else
+		Assert(!BTScanPosIsPinned(so->currPos));
 
 	/* Before leaving current page, deal with any killed items */
 	if (so->numKilled > 0)
@@ -2133,9 +2138,6 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
 	 */
 	if (so->markItemIndex >= 0)
 	{
-		/* bump pin on current buffer for assignment to mark buffer */
-		if (BTScanPosIsPinned(so->currPos))
-			IncrBufferRefCount(so->currPos.buf);
 		memcpy(&so->markPos, &so->currPos,
 			   offsetof(BTScanPosData, items[1]) +
 			   so->currPos.lastItem * sizeof(BTScanPosItem));
@@ -2145,6 +2147,16 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
 		so->markPos.itemIndex = so->markItemIndex;
 		so->markItemIndex = -1;
 
+		/*
+		 * If there's a pin held on so->currPos, we transfer the pin over to
+		 * so->markPos (without actually releasing it)
+		 */
+		Assert(BTScanPosIsValid(so->markPos));
+		if (!so->dropPin)
+			Assert(BTScanPosIsPinned(so->markPos));
+		else
+			Assert(!BTScanPosIsPinned(so->markPos));
+
 		/*
 		 * If we're just about to start the next primitive index scan
 		 * (possible with a scan that has arrays keys, and needs to skip to
@@ -2172,15 +2184,14 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
 		/* mark/restore not supported by parallel scans */
 		Assert(!scan->parallel_scan);
 	}
-
-	BTScanPosUnpinIfPinned(so->currPos);
-
-	/* Walk to the next page with data */
-	if (ScanDirectionIsForward(dir))
-		blkno = so->currPos.nextPage;
-	else
-		blkno = so->currPos.prevPage;
-	lastcurrblkno = so->currPos.currPage;
+	else if (!so->dropPin)
+	{
+		/*
+		 * Not saving so->currPos position in so->markPos, so release the
+		 * position's pin for ourselves
+		 */
+		BTScanPosUnpin(so->currPos);
+	}
 
 	/*
 	 * Cancel primitive index scans that were scheduled when the call to
@@ -2191,6 +2202,19 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
 	if (so->currPos.dir != dir)
 		so->needPrimScan = false;
 
+	/*
+	 * so->currPos no longer holds a buffer pin (independent of whether we
+	 * released the pin, or just transferred it over to so->markPos)
+	 */
+	so->currPos.buf = InvalidBuffer;
+
+	/* Walk to the next page with data */
+	if (ScanDirectionIsForward(dir))
+		blkno = so->currPos.nextPage;
+	else
+		blkno = so->currPos.prevPage;
+	lastcurrblkno = so->currPos.currPage;
+
 	return _bt_readnextpage(scan, blkno, lastcurrblkno, dir, false);
 }
 
@@ -2220,7 +2244,10 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
 static bool
 _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
 {
+	Relation	rel = scan->indexRelation;
 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
+	BlockNumber blkno,
+				lastcurrblkno;
 
 	so->numKilled = 0;			/* just paranoia */
 	so->markItemIndex = -1;		/* ditto */
@@ -2253,8 +2280,6 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
 	 */
 	if (_bt_readpage(scan, dir, offnum, true))
 	{
-		Relation	rel = scan->indexRelation;
-
 		/*
 		 * _bt_readpage succeeded.  Drop the lock (and maybe the pin) on
 		 * so->currPos.buf in preparation for btgettuple returning tuples.
@@ -2265,14 +2290,20 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
 	}
 
 	/* There's no actually-matching data on the page in so->currPos.buf */
-	_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
+	Assert(so->numKilled == 0);
+	Assert(so->markItemIndex < 0);
 
-	/* Call _bt_readnextpage using its _bt_steppage wrapper function */
-	if (!_bt_steppage(scan, dir))
-		return false;
+	_bt_relbuf(rel, so->currPos.buf);
+	so->currPos.buf = InvalidBuffer;
 
-	/* _bt_readpage for a later page (now in so->currPos) succeeded */
-	return true;
+	/* Walk to the next page with data */
+	if (ScanDirectionIsForward(dir))
+		blkno = so->currPos.nextPage;
+	else
+		blkno = so->currPos.prevPage;
+	lastcurrblkno = so->currPos.currPage;
+
+	return _bt_readnextpage(scan, blkno, lastcurrblkno, dir, false);
 }
 
 /*
@@ -2408,6 +2439,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
 
 		/* no matching tuples on this page */
 		_bt_relbuf(rel, so->currPos.buf);
+		so->currPos.buf = InvalidBuffer;
 		seized = false;			/* released by _bt_readpage (or by us) */
 	}
 
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 29f0dca1b..731916765 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -3323,9 +3323,9 @@ _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate,
  * current page and killed tuples thereon (generally, this should only be
  * called if so->numKilled > 0).
  *
- * The caller does not have a lock on the page and may or may not have the
- * page pinned in a buffer.  Note that read-lock is sufficient for setting
- * LP_DEAD status (which is only a hint).
+ * Caller should not have a lock on the so->currPos page, but must hold a
+ * buffer pin when !so->dropPin.  When we return, it still won't be locked.
+ * It'll continue to hold the same pins that were held when we were called.
  *
  * We match items by heap TID before assuming they are the right ones to
  * delete.  We cope with cases where items have moved right due to insertions.
@@ -3353,6 +3353,7 @@ _bt_killitems(IndexScanDesc scan)
 	OffsetNumber maxoff;
 	int			numKilled = so->numKilled;
 	bool		killedsomething = false;
+	Buffer		buf;
 
 	Assert(numKilled > 0);
 	Assert(BTScanPosIsValid(so->currPos));
@@ -3369,11 +3370,11 @@ _bt_killitems(IndexScanDesc scan)
 		 * concurrent VACUUMs from recycling any of the TIDs on the page.
 		 */
 		Assert(BTScanPosIsPinned(so->currPos));
-		_bt_lockbuf(rel, so->currPos.buf, BT_READ);
+		buf = so->currPos.buf;
+		_bt_lockbuf(rel, buf, BT_READ);
 	}
 	else
 	{
-		Buffer		buf;
 		XLogRecPtr	latestlsn;
 
 		Assert(!BTScanPosIsPinned(so->currPos));
@@ -3391,10 +3392,9 @@ _bt_killitems(IndexScanDesc scan)
 		}
 
 		/* Unmodified, hinting is safe */
-		so->currPos.buf = buf;
 	}
 
-	page = BufferGetPage(so->currPos.buf);
+	page = BufferGetPage(buf);
 	opaque = BTPageGetOpaque(page);
 	minoff = P_FIRSTDATAKEY(opaque);
 	maxoff = PageGetMaxOffsetNumber(page);
@@ -3511,10 +3511,17 @@ _bt_killitems(IndexScanDesc scan)
 	if (killedsomething)
 	{
 		opaque->btpo_flags |= BTP_HAS_GARBAGE;
-		MarkBufferDirtyHint(so->currPos.buf, true);
+		MarkBufferDirtyHint(buf, true);
 	}
 
-	_bt_unlockbuf(rel, so->currPos.buf);
+	/*
+	 * so->currPos retains no locks, but does retain whatever pins were held
+	 * just before we were called
+	 */
+	if (!so->dropPin)
+		_bt_unlockbuf(rel, buf);
+	else
+		_bt_relbuf(rel, buf);
 }
 
 
-- 
2.49.0

