From 7f2a4c11f3c1a46c3713ef337b7a4a05be5adfd6 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Tue, 10 Jun 2025 12:07:01 -0400
Subject: [PATCH v3 2/2] Stop conditioning nbtree actions on pin being held.

Commit 2ed5b87f, which taught nbtree to avoid holding onto a leaf page
buffer pin across the btgettuple calls for certain index scans, made it
possible for a scan position (either so->currPos or so->markPos) to be a
valid position without having a buffer pin held/a valid value in 'buf'.
The same commit introduced an idiom around buffer pin management: at the
point when a scan position was invalidated, a call to the function-like
macro BTScanPosUnpinIfPinned() released the buffer pin iff the relevant
position's 'buf' field was set to a non-InvalidBuffer value.  Recent
experience with bugfix commit XXX shows this idiom to be rather brittle.

Finish off the work started by recent commit e6eed40e: fully move over
to conditioning releasing buffer pins on whether the scan in question is
a so->dropPin scan.  If it is, then _bt_drop_lock_and_maybe_pin must
have already dropped the buffer pin earlier on.  If it's not, then
_bt_drop_lock_and_maybe_pin certainly won't have dropped the pin when
called, so it is the responsibility of the pos-invalidating code.
BTScanPosUnpinIfPinned() is no longer needed, so get rid of it.  Keep
BTScanPosIsPinned(), but only call it from assertions going forward.

Now that the rules around buffer pins are a lot clearer, we can remove
all use of IncrBufferRefCount() from nbtree.  In the case of the
_bt_steppage call site, we were incremented and then immediately
decrementing the backend-local pin count in cases where so->currPos was
to be saved in so->markPos; now we'll do neither in such cases.  The
second IncrBufferRefCount() call site (which was in btrestrpos) now
takes this same approach, though in reverse:  instead of keeping around
the original so->markPos, we can invalidate it (but not unpin it) right
away, and store the new so->currPos.itemIndex in so->markItemIndex.
That way there is never any need to hold more than a single buffer pin
on the same page during mark and restore (so->currPos and so->markPos
might each hold their own buffer pins, but never on the same page).

Master-only follow-up to bugfix commit XXXXXXXX.

Author: Peter Geoghegan <pg@bowt.ie>
Discussion: https://postgr.es/m/CAH2-Wzm3a8i31aBXi6oQt9uG7m1-xpX-MXjMMYoDJH=sBj1jnA@mail.gmail.com
---
 src/include/access/nbtree.h           |  17 ++--
 src/backend/access/nbtree/nbtree.c    | 117 ++++++++++++++++----------
 src/backend/access/nbtree/nbtsearch.c |  70 ++++++++++-----
 3 files changed, 128 insertions(+), 76 deletions(-)

diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index e709d2e0a..a448c2c9f 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -1007,23 +1007,18 @@ typedef BTScanPosData *BTScanPos;
 				!BufferIsValid((scanpos).buf)), \
 	BufferIsValid((scanpos).buf) \
 )
-#define BTScanPosUnpin(scanpos) \
-	do { \
-		ReleaseBuffer((scanpos).buf); \
-		(scanpos).buf = InvalidBuffer; \
-	} while (0)
-#define BTScanPosUnpinIfPinned(scanpos) \
-	do { \
-		if (BTScanPosIsPinned(scanpos)) \
-			BTScanPosUnpin(scanpos); \
-	} while (0)
-
 #define BTScanPosIsValid(scanpos) \
 ( \
 	AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
 				!BufferIsValid((scanpos).buf)), \
 	BlockNumberIsValid((scanpos).currPage) \
 )
+
+#define BTScanPosUnpin(scanpos) \
+	do { \
+		ReleaseBuffer((scanpos).buf); \
+		(scanpos).buf = InvalidBuffer; \
+	} while (0)
 #define BTScanPosInvalidate(scanpos) \
 	do { \
 		(scanpos).buf = InvalidBuffer; \
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index fdff960c1..446a2f918 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -387,16 +387,6 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 {
 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
 
-	/* we aren't holding any read locks, but gotta drop the pins */
-	if (BTScanPosIsValid(so->currPos))
-	{
-		/* Before leaving current page, deal with any killed items */
-		if (so->numKilled > 0)
-			_bt_killitems(scan);
-		BTScanPosUnpinIfPinned(so->currPos);
-		BTScanPosInvalidate(so->currPos);
-	}
-
 	/*
 	 * We prefer to eagerly drop leaf page pins before btgettuple returns.
 	 * This avoids making VACUUM wait to acquire a cleanup lock on the page.
@@ -425,12 +415,27 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 				   RelationNeedsWAL(scan->indexRelation) &&
 				   scan->heapRelation != NULL);
 
+	/*
+	 * Before leaving the current position, perform final steps, since we'll
+	 * never call _bt_steppage for its page
+	 */
+	if (BTScanPosIsValid(so->currPos))
+	{
+		if (so->numKilled > 0)
+			_bt_killitems(scan);
+		if (!so->dropPin)
+			BTScanPosUnpin(so->currPos);
+		BTScanPosInvalidate(so->currPos);
+	}
+
+	/* Always invalidate any existing mark */
 	so->markItemIndex = -1;
-	so->needPrimScan = false;
-	so->scanBehind = false;
-	so->oppositeDirCheck = false;
-	BTScanPosUnpinIfPinned(so->markPos);
-	BTScanPosInvalidate(so->markPos);
+	if (BTScanPosIsValid(so->markPos))
+	{
+		if (!so->dropPin)
+			BTScanPosUnpin(so->markPos);
+		BTScanPosInvalidate(so->markPos);
+	}
 
 	/*
 	 * Allocate tuple workspace arrays, if needed for an index-only scan and
@@ -459,6 +464,9 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 	 */
 	if (scankey && scan->numberOfKeys > 0)
 		memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
+	so->needPrimScan = false;
+	so->scanBehind = false;
+	so->oppositeDirCheck = false;
 	so->numberOfKeys = 0;		/* until _bt_preprocess_keys sets it */
 	so->numArrayKeys = 0;		/* ditto */
 }
@@ -471,19 +479,27 @@ btendscan(IndexScanDesc scan)
 {
 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
 
-	/* we aren't holding any read locks, but gotta drop the pins */
+	/*
+	 * Before leaving the current position, perform final steps, since we'll
+	 * never call _bt_steppage for its page
+	 */
 	if (BTScanPosIsValid(so->currPos))
 	{
-		/* Before leaving current page, deal with any killed items */
 		if (so->numKilled > 0)
 			_bt_killitems(scan);
-		BTScanPosUnpinIfPinned(so->currPos);
+		if (!so->dropPin)
+			BTScanPosUnpin(so->currPos);
+		BTScanPosInvalidate(so->currPos);	/* unnecessary, but be consistent */
 	}
 
-	so->markItemIndex = -1;
-	BTScanPosUnpinIfPinned(so->markPos);
-
-	/* No need to invalidate positions, the RAM is about to be freed. */
+	/* Always invalidate any existing mark */
+	so->markItemIndex = -1;		/* unnecessary, but be consistent */
+	if (BTScanPosIsValid(so->markPos))
+	{
+		if (!so->dropPin)
+			BTScanPosUnpin(so->markPos);
+		BTScanPosInvalidate(so->markPos);	/* unnecessary, but be consistent */
+	}
 
 	/* Release storage */
 	if (so->keyData != NULL)
@@ -507,8 +523,14 @@ btmarkpos(IndexScanDesc scan)
 {
 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
 
-	/* There may be an old mark with a pin (but no lock). */
-	BTScanPosUnpinIfPinned(so->markPos);
+	/* Always invalidate any existing mark */
+	so->markItemIndex = -1;
+	if (BTScanPosIsValid(so->markPos))
+	{
+		if (!so->dropPin)
+			BTScanPosUnpin(so->markPos);
+		BTScanPosInvalidate(so->markPos);
+	}
 
 	/*
 	 * Just record the current itemIndex.  If we later step to next page
@@ -518,11 +540,6 @@ btmarkpos(IndexScanDesc scan)
 	 */
 	if (BTScanPosIsValid(so->currPos))
 		so->markItemIndex = so->currPos.itemIndex;
-	else
-	{
-		BTScanPosInvalidate(so->markPos);
-		so->markItemIndex = -1;
-	}
 }
 
 /*
@@ -536,41 +553,55 @@ btrestrpos(IndexScanDesc scan)
 	if (so->markItemIndex >= 0)
 	{
 		/*
-		 * The scan has never moved to a new page since the last mark.  Just
+		 * The mark position is on the same page we are currently on.  Just
 		 * restore the itemIndex.
-		 *
-		 * NB: In this case we can't count on anything in so->markPos to be
-		 * accurate.
 		 */
 		so->currPos.itemIndex = so->markItemIndex;
 	}
 	else
 	{
 		/*
-		 * The scan moved to a new page after last mark or restore, and we are
-		 * now restoring to the marked page.  We aren't holding any read
-		 * locks, but if we're still holding the pin for the current position,
-		 * we must drop it.
+		 * The scan moved to a page beyond that of the last mark we took.
+		 *
+		 * Before leaving the current position, perform final steps, since
+		 * we'll never call _bt_steppage for its page.
 		 */
 		if (BTScanPosIsValid(so->currPos))
 		{
-			/* Before leaving current page, deal with any killed items */
 			if (so->numKilled > 0)
 				_bt_killitems(scan);
-			BTScanPosUnpinIfPinned(so->currPos);
+			if (!so->dropPin)
+				BTScanPosUnpin(so->currPos);
+			BTScanPosInvalidate(so->currPos);
 		}
 
+		/*
+		 * We're completely done with our old so->currPos.  Copy so->markPos
+		 * into so->currPos to actually restore the mark.
+		 */
 		if (BTScanPosIsValid(so->markPos))
 		{
-			/* bump pin on mark buffer for assignment to current buffer */
-			if (BTScanPosIsPinned(so->markPos))
-				IncrBufferRefCount(so->markPos.buf);
 			memcpy(&so->currPos, &so->markPos,
 				   offsetof(BTScanPosData, items[1]) +
 				   so->markPos.lastItem * sizeof(BTScanPosItem));
 			if (so->currTuples)
 				memcpy(so->currTuples, so->markTuples,
 					   so->markPos.nextTupleOffset);
+
+			/*
+			 * We need to keep the mark around, in case it gets restored
+			 * again.  We use the so->markItemIndex representation for this.
+			 * We generally only need to keep around a separate BTScanPosData
+			 * (which will have its own buffer pin when !so->dropPin) for a
+			 * mark whose page doesn't match so->currPos.currPage.
+			 *
+			 * See also: _bt_steppage, which converts the so->markItemIndex
+			 * representation into the so->markPos representation, in order to
+			 * safely keep around a mark after so->currPos.currPage changes.
+			 */
+			BTScanPosInvalidate(so->markPos);
+			so->markItemIndex = so->currPos.itemIndex;
+
 			/* Reset the scan's array keys (see _bt_steppage for why) */
 			if (so->numArrayKeys)
 			{
@@ -578,8 +609,6 @@ btrestrpos(IndexScanDesc scan)
 				so->needPrimScan = false;
 			}
 		}
-		else
-			BTScanPosInvalidate(so->currPos);
 	}
 }
 
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 070f14c8b..596fc14b9 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -2096,6 +2096,7 @@ _bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
 
 	/* Most recent _bt_readpage must have succeeded */
 	Assert(BTScanPosIsValid(so->currPos));
+	Assert(BTScanPosIsPinned(so->currPos) == !so->dropPin);
 	Assert(so->currPos.itemIndex >= so->currPos.firstItem);
 	Assert(so->currPos.itemIndex <= so->currPos.lastItem);
 
@@ -2109,6 +2110,8 @@ _bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
  *	_bt_steppage() -- Step to next page containing valid data for scan
  *
  * Wrapper on _bt_readnextpage that performs final steps for the current page.
+ * Call here when leaving a so->currPos that _bt_readpage returned 'true' for
+ * (just call _bt_readnextpage directly when _bt_readpage returned 'false').
  *
  * On entry, so->currPos must be valid.  Its buffer will be pinned, though
  * never locked. (Actually, when so->dropPin there won't even be a pin held,
@@ -2121,7 +2124,12 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
 	BlockNumber blkno,
 				lastcurrblkno;
 
+	/* Assert that _bt_readpage returned true for so->currPos */
+	Assert(so->currPos.firstItem <= so->currPos.lastItem);
+
+	/* Assert that _bt_drop_lock_and_maybe_pin left so->currPos as expected */
 	Assert(BTScanPosIsValid(so->currPos));
+	Assert(BTScanPosIsPinned(so->currPos) == !so->dropPin);
 
 	/* Before leaving current page, deal with any killed items */
 	if (so->numKilled > 0)
@@ -2133,9 +2141,9 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
 	 */
 	if (so->markItemIndex >= 0)
 	{
-		/* bump pin on current buffer for assignment to mark buffer */
-		if (BTScanPosIsPinned(so->currPos))
-			IncrBufferRefCount(so->currPos.buf);
+		/* mark/restore not supported by parallel scans */
+		Assert(!scan->parallel_scan);
+
 		memcpy(&so->markPos, &so->currPos,
 			   offsetof(BTScanPosData, items[1]) +
 			   so->currPos.lastItem * sizeof(BTScanPosItem));
@@ -2169,18 +2177,23 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
 				so->markPos.moreLeft = true;
 		}
 
-		/* mark/restore not supported by parallel scans */
-		Assert(!scan->parallel_scan);
+		/*
+		 * Saving so->currPos into so->markPos, so just unset its 'buf' field
+		 * (avoid unpinning the buffer, since it is now owned by so->markPos).
+		 * _bt_readnextpage expects so->currPos to always look like this.
+		 */
+		so->currPos.buf = InvalidBuffer;
 	}
-
-	BTScanPosUnpinIfPinned(so->currPos);
-
-	/* Walk to the next page with data */
-	if (ScanDirectionIsForward(dir))
-		blkno = so->currPos.nextPage;
 	else
-		blkno = so->currPos.prevPage;
-	lastcurrblkno = so->currPos.currPage;
+	{
+		/*
+		 * Not saving so->currPos into so->markPos, so release its buffer pin
+		 * (except in so->dropPin case, where _bt_drop_lock_and_maybe_pin must
+		 * have done so earlier)
+		 */
+		if (!so->dropPin)
+			BTScanPosUnpin(so->currPos);
+	}
 
 	/*
 	 * Cancel primitive index scans that were scheduled when the call to
@@ -2191,6 +2204,13 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
 	if (so->currPos.dir != dir)
 		so->needPrimScan = false;
 
+	/* Walk to the next page with data */
+	if (ScanDirectionIsForward(dir))
+		blkno = so->currPos.nextPage;
+	else
+		blkno = so->currPos.prevPage;
+	lastcurrblkno = so->currPos.currPage;
+
 	return _bt_readnextpage(scan, blkno, lastcurrblkno, dir, false);
 }
 
@@ -2220,7 +2240,10 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
 static bool
 _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
 {
+	Relation	rel = scan->indexRelation;
 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
+	BlockNumber blkno,
+				lastcurrblkno;
 
 	so->numKilled = 0;			/* just paranoia */
 	so->markItemIndex = -1;		/* ditto */
@@ -2253,8 +2276,6 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
 	 */
 	if (_bt_readpage(scan, dir, offnum, true))
 	{
-		Relation	rel = scan->indexRelation;
-
 		/*
 		 * _bt_readpage succeeded.  Drop the lock (and maybe the pin) on
 		 * so->currPos.buf in preparation for btgettuple returning tuples.
@@ -2265,14 +2286,20 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
 	}
 
 	/* There's no actually-matching data on the page in so->currPos.buf */
-	_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
+	Assert(so->numKilled == 0);
+	Assert(so->markItemIndex < 0);
 
-	/* Call _bt_readnextpage using its _bt_steppage wrapper function */
-	if (!_bt_steppage(scan, dir))
-		return false;
+	_bt_relbuf(rel, so->currPos.buf);
+	so->currPos.buf = InvalidBuffer;
 
-	/* _bt_readpage for a later page (now in so->currPos) succeeded */
-	return true;
+	/* Walk to the next page with data */
+	if (ScanDirectionIsForward(dir))
+		blkno = so->currPos.nextPage;
+	else
+		blkno = so->currPos.prevPage;
+	lastcurrblkno = so->currPos.currPage;
+
+	return _bt_readnextpage(scan, blkno, lastcurrblkno, dir, false);
 }
 
 /*
@@ -2408,6 +2435,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
 
 		/* no matching tuples on this page */
 		_bt_relbuf(rel, so->currPos.buf);
+		so->currPos.buf = InvalidBuffer;
 		seized = false;			/* released by _bt_readpage (or by us) */
 	}
 
-- 
2.49.0

