From b3d73c79df9daaaead4dfdde8802d6b5e9317a4a Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Tue, 10 Jun 2025 12:07:01 -0400
Subject: [PATCH v1] Stop conditioning nbtree actions on pin being held.

Commit 2ed5b87f, which taught nbtree to avoid holding onto a leaf page
buffer pin across the btgettuple calls for certain index scans, made it
possible for a scan position (either so->currPos or so->markPos) to be a
valid position without having a buffer pin held/a valid value in 'buf'.
The same commit introduced an idiom around buffer pin management: at the
point when a scan position was invalidated, a call to the function-like
macro BTScanPosUnpinIfPinned() released the buffer pin iff the relevant
position's 'buf' field was set to a non-InvalidBuffer value.  Experience
with bugfix commit 7c319f54 suggests that this idiom is rather brittle.

Finish off the work started by recent commit e6eed40e: fully move over
to conditioning releasing buffer pins on whether the scan in question is
a so->dropPin scan.  If it is, then _bt_drop_lock_and_maybe_pin must
have already dropped the buffer pin earlier on.  If it's not, then
_bt_drop_lock_and_maybe_pin certainly won't have dropped the pin when
called, so doing so is the responsibility of the pos-invalidating code.
BTScanPosUnpinIfPinned() is no longer needed, so get rid of it.  Keep
BTScanPosIsPinned(), but only call it from assertions going forward.

Now that the rules around buffer pins are a lot clearer, we can remove
all IncrBufferRefCount calls from nbtree.  They aren't truly necessary;
both of the IncrBufferRefCount call sites effectively just incremented
and then immediately decremented the backend-local pin count.  We can
safely avoid changing the buffer refcount by explicitly "transferring"
ownership of the pin from the old obsolescent position to the new one
that replaces it. (Actually, the btrestrpos caller _did_ keep around a
so->markPos/second pin, even after it returned, because the mark had to
be kept around in case it needed to be restored again some time later.
But we can safely keep such a mark around by setting so->markItemIndex,
and then invalidating the mark's original so->markPos representation.
This approach makes btrestrpos symmetrical with the approach taken by
_bt_steppage, which is the other former IncrBufferRefCount caller.)

This hardening/refactoring is a follow-up to bugfix commit 7c319f54.

Author: Peter Geoghegan <pg@bowt.ie>
Discussion: https://postgr.es/m/CAH2-Wzm3a8i31aBXi6oQt9uG7m1-xpX-MXjMMYoDJH=sBj1jnA@mail.gmail.com
---
 src/include/access/nbtree.h           |  17 ++--
 src/backend/access/nbtree/nbtree.c    | 122 +++++++++++++++++---------
 src/backend/access/nbtree/nbtsearch.c |  72 ++++++++++-----
 3 files changed, 133 insertions(+), 78 deletions(-)

diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index e709d2e0a..a448c2c9f 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -1007,23 +1007,18 @@ typedef BTScanPosData *BTScanPos;
 				!BufferIsValid((scanpos).buf)), \
 	BufferIsValid((scanpos).buf) \
 )
-#define BTScanPosUnpin(scanpos) \
-	do { \
-		ReleaseBuffer((scanpos).buf); \
-		(scanpos).buf = InvalidBuffer; \
-	} while (0)
-#define BTScanPosUnpinIfPinned(scanpos) \
-	do { \
-		if (BTScanPosIsPinned(scanpos)) \
-			BTScanPosUnpin(scanpos); \
-	} while (0)
-
 #define BTScanPosIsValid(scanpos) \
 ( \
 	AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
 				!BufferIsValid((scanpos).buf)), \
 	BlockNumberIsValid((scanpos).currPage) \
 )
+
+#define BTScanPosUnpin(scanpos) \
+	do { \
+		ReleaseBuffer((scanpos).buf); \
+		(scanpos).buf = InvalidBuffer; \
+	} while (0)
 #define BTScanPosInvalidate(scanpos) \
 	do { \
 		(scanpos).buf = InvalidBuffer; \
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index fdff960c1..c110567b0 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -387,16 +387,6 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 {
 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
 
-	/* we aren't holding any read locks, but gotta drop the pins */
-	if (BTScanPosIsValid(so->currPos))
-	{
-		/* Before leaving current page, deal with any killed items */
-		if (so->numKilled > 0)
-			_bt_killitems(scan);
-		BTScanPosUnpinIfPinned(so->currPos);
-		BTScanPosInvalidate(so->currPos);
-	}
-
 	/*
 	 * We prefer to eagerly drop leaf page pins before btgettuple returns.
 	 * This avoids making VACUUM wait to acquire a cleanup lock on the page.
@@ -425,12 +415,27 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 				   RelationNeedsWAL(scan->indexRelation) &&
 				   scan->heapRelation != NULL);
 
+	/*
+	 * Before leaving the current position, perform final steps, since we'll
+	 * never call _bt_steppage for its page
+	 */
+	if (BTScanPosIsValid(so->currPos))
+	{
+		if (so->numKilled > 0)
+			_bt_killitems(scan);
+		if (!so->dropPin)
+			BTScanPosUnpin(so->currPos);
+		BTScanPosInvalidate(so->currPos);
+	}
+
+	/* Always invalidate any existing mark */
 	so->markItemIndex = -1;
-	so->needPrimScan = false;
-	so->scanBehind = false;
-	so->oppositeDirCheck = false;
-	BTScanPosUnpinIfPinned(so->markPos);
-	BTScanPosInvalidate(so->markPos);
+	if (BTScanPosIsValid(so->markPos))
+	{
+		if (!so->dropPin)
+			BTScanPosUnpin(so->markPos);
+		BTScanPosInvalidate(so->markPos);
+	}
 
 	/*
 	 * Allocate tuple workspace arrays, if needed for an index-only scan and
@@ -459,6 +464,9 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 	 */
 	if (scankey && scan->numberOfKeys > 0)
 		memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
+	so->needPrimScan = false;
+	so->scanBehind = false;
+	so->oppositeDirCheck = false;
 	so->numberOfKeys = 0;		/* until _bt_preprocess_keys sets it */
 	so->numArrayKeys = 0;		/* ditto */
 }
@@ -471,19 +479,27 @@ btendscan(IndexScanDesc scan)
 {
 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
 
-	/* we aren't holding any read locks, but gotta drop the pins */
+	/*
+	 * Before leaving the current position, perform final steps, since we'll
+	 * never call _bt_steppage for its page
+	 */
 	if (BTScanPosIsValid(so->currPos))
 	{
-		/* Before leaving current page, deal with any killed items */
 		if (so->numKilled > 0)
 			_bt_killitems(scan);
-		BTScanPosUnpinIfPinned(so->currPos);
+		if (!so->dropPin)
+			BTScanPosUnpin(so->currPos);
+		BTScanPosInvalidate(so->currPos);	/* unnecessary, but be consistent */
 	}
 
-	so->markItemIndex = -1;
-	BTScanPosUnpinIfPinned(so->markPos);
-
-	/* No need to invalidate positions, the RAM is about to be freed. */
+	/* Always invalidate any existing mark */
+	so->markItemIndex = -1;		/* unnecessary, but be consistent */
+	if (BTScanPosIsValid(so->markPos))
+	{
+		if (!so->dropPin)
+			BTScanPosUnpin(so->markPos);
+		BTScanPosInvalidate(so->markPos);	/* unnecessary, but be consistent */
+	}
 
 	/* Release storage */
 	if (so->keyData != NULL)
@@ -507,8 +523,14 @@ btmarkpos(IndexScanDesc scan)
 {
 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
 
-	/* There may be an old mark with a pin (but no lock). */
-	BTScanPosUnpinIfPinned(so->markPos);
+	/* Always invalidate any existing mark */
+	so->markItemIndex = -1;
+	if (BTScanPosIsValid(so->markPos))
+	{
+		if (!so->dropPin)
+			BTScanPosUnpin(so->markPos);
+		BTScanPosInvalidate(so->markPos);
+	}
 
 	/*
 	 * Just record the current itemIndex.  If we later step to next page
@@ -518,11 +540,6 @@ btmarkpos(IndexScanDesc scan)
 	 */
 	if (BTScanPosIsValid(so->currPos))
 		so->markItemIndex = so->currPos.itemIndex;
-	else
-	{
-		BTScanPosInvalidate(so->markPos);
-		so->markItemIndex = -1;
-	}
 }
 
 /*
@@ -536,41 +553,62 @@ btrestrpos(IndexScanDesc scan)
 	if (so->markItemIndex >= 0)
 	{
 		/*
-		 * The scan has never moved to a new page since the last mark.  Just
+		 * The mark position is on the same page we are currently on.  Just
 		 * restore the itemIndex.
 		 *
-		 * NB: In this case we can't count on anything in so->markPos to be
-		 * accurate.
+		 * We generally only need to keep around a separate BTScanPosData
+		 * (which will have its own buffer pin when !so->dropPin) for a mark
+		 * whose page doesn't match so->currPos.currPage.  That requires care
+		 * later on in this function, and in _bt_steppage.
 		 */
+		Assert(!BTScanPosIsValid(so->markPos)); /* can't also be valid */
 		so->currPos.itemIndex = so->markItemIndex;
 	}
 	else
 	{
 		/*
-		 * The scan moved to a new page after last mark or restore, and we are
-		 * now restoring to the marked page.  We aren't holding any read
-		 * locks, but if we're still holding the pin for the current position,
-		 * we must drop it.
+		 * The scan moved to a page beyond that of the last mark we took.
+		 *
+		 * Before leaving the current position, perform final steps, since
+		 * we'll never call _bt_steppage for its page.
 		 */
 		if (BTScanPosIsValid(so->currPos))
 		{
-			/* Before leaving current page, deal with any killed items */
 			if (so->numKilled > 0)
 				_bt_killitems(scan);
-			BTScanPosUnpinIfPinned(so->currPos);
+			if (!so->dropPin)
+				BTScanPosUnpin(so->currPos);
+			BTScanPosInvalidate(so->currPos);
 		}
 
+		/*
+		 * We're completely done with our old so->currPos.  Copy so->markPos
+		 * into so->currPos to actually restore the mark.
+		 */
 		if (BTScanPosIsValid(so->markPos))
 		{
-			/* bump pin on mark buffer for assignment to current buffer */
-			if (BTScanPosIsPinned(so->markPos))
-				IncrBufferRefCount(so->markPos.buf);
 			memcpy(&so->currPos, &so->markPos,
 				   offsetof(BTScanPosData, items[1]) +
 				   so->markPos.lastItem * sizeof(BTScanPosItem));
 			if (so->currTuples)
 				memcpy(so->currTuples, so->markTuples,
 					   so->markPos.nextTupleOffset);
+
+			/*
+			 * We need to keep the mark around, in case it gets restored
+			 * again.  We use the so->markItemIndex representation for this.
+			 * Converting to that representation involves invalidating the
+			 * original so->markPos representation. (This is the opposite of
+			 * what happened back when _bt_steppage created this so->markPos
+			 * from the scan's then-obsolescent so->currPos.)
+			 *
+			 * Note: We deliberately avoid a "BTScanPosUnpin(so->markPos)"
+			 * when invalidating so->markPos, since so->markPos's original pin
+			 * is "transferred" to the new so->currPos.
+			 */
+			BTScanPosInvalidate(so->markPos);
+			so->markItemIndex = so->currPos.itemIndex;	/* keep the mark */
+
 			/* Reset the scan's array keys (see _bt_steppage for why) */
 			if (so->numArrayKeys)
 			{
@@ -578,8 +616,6 @@ btrestrpos(IndexScanDesc scan)
 				so->needPrimScan = false;
 			}
 		}
-		else
-			BTScanPosInvalidate(so->currPos);
 	}
 }
 
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 070f14c8b..be79f60c4 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -1623,6 +1623,8 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
 	Assert(ScanDirectionIsForward(dir) ? so->currPos.moreRight :
 		   so->currPos.moreLeft);
 	Assert(!P_IGNORE(opaque));
+	Assert(so->numKilled == 0);
+	Assert(so->markItemIndex < 0);
 	Assert(BTScanPosIsPinned(so->currPos));
 	Assert(!so->needPrimScan);
 
@@ -2096,6 +2098,7 @@ _bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
 
 	/* Most recent _bt_readpage must have succeeded */
 	Assert(BTScanPosIsValid(so->currPos));
+	Assert(BTScanPosIsPinned(so->currPos) == !so->dropPin);
 	Assert(so->currPos.itemIndex >= so->currPos.firstItem);
 	Assert(so->currPos.itemIndex <= so->currPos.lastItem);
 
@@ -2109,6 +2112,8 @@ _bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
  *	_bt_steppage() -- Step to next page containing valid data for scan
  *
  * Wrapper on _bt_readnextpage that performs final steps for the current page.
+ * Call here when leaving a so->currPos that _bt_readpage returned 'true' for
+ * (otherwise just call _bt_readnextpage directly).
  *
  * On entry, so->currPos must be valid.  Its buffer will be pinned, though
  * never locked. (Actually, when so->dropPin there won't even be a pin held,
@@ -2121,7 +2126,12 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
 	BlockNumber blkno,
 				lastcurrblkno;
 
+	/* Assert that _bt_readpage returned true for so->currPos */
+	Assert(so->currPos.firstItem <= so->currPos.lastItem);
+
+	/* Assert that _bt_drop_lock_and_maybe_pin left so->currPos as expected */
 	Assert(BTScanPosIsValid(so->currPos));
+	Assert(BTScanPosIsPinned(so->currPos) == !so->dropPin);
 
 	/* Before leaving current page, deal with any killed items */
 	if (so->numKilled > 0)
@@ -2133,9 +2143,9 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
 	 */
 	if (so->markItemIndex >= 0)
 	{
-		/* bump pin on current buffer for assignment to mark buffer */
-		if (BTScanPosIsPinned(so->currPos))
-			IncrBufferRefCount(so->currPos.buf);
+		/* mark/restore not supported by parallel scans */
+		Assert(!scan->parallel_scan);
+
 		memcpy(&so->markPos, &so->currPos,
 			   offsetof(BTScanPosData, items[1]) +
 			   so->currPos.lastItem * sizeof(BTScanPosItem));
@@ -2169,18 +2179,23 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
 				so->markPos.moreLeft = true;
 		}
 
-		/* mark/restore not supported by parallel scans */
-		Assert(!scan->parallel_scan);
+		/*
+		 * Saving so->currPos into so->markPos, so just unset its 'buf' field
+		 * (avoid unpinning the buffer, since it is now owned by so->markPos).
+		 * _bt_readnextpage expects so->currPos to always look like this.
+		 */
+		so->currPos.buf = InvalidBuffer;	/* transferred to so->markPos */
 	}
-
-	BTScanPosUnpinIfPinned(so->currPos);
-
-	/* Walk to the next page with data */
-	if (ScanDirectionIsForward(dir))
-		blkno = so->currPos.nextPage;
 	else
-		blkno = so->currPos.prevPage;
-	lastcurrblkno = so->currPos.currPage;
+	{
+		/*
+		 * Not saving so->currPos into so->markPos, so release its buffer pin
+		 * (except in so->dropPin case, where _bt_drop_lock_and_maybe_pin must
+		 * have done so earlier)
+		 */
+		if (!so->dropPin)
+			BTScanPosUnpin(so->currPos);
+	}
 
 	/*
 	 * Cancel primitive index scans that were scheduled when the call to
@@ -2191,6 +2206,13 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
 	if (so->currPos.dir != dir)
 		so->needPrimScan = false;
 
+	/* Walk to the next page with data */
+	if (ScanDirectionIsForward(dir))
+		blkno = so->currPos.nextPage;
+	else
+		blkno = so->currPos.prevPage;
+	lastcurrblkno = so->currPos.currPage;
+
 	return _bt_readnextpage(scan, blkno, lastcurrblkno, dir, false);
 }
 
@@ -2220,10 +2242,10 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
 static bool
 _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
 {
+	Relation	rel = scan->indexRelation;
 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
-
-	so->numKilled = 0;			/* just paranoia */
-	so->markItemIndex = -1;		/* ditto */
+	BlockNumber blkno,
+				lastcurrblkno;
 
 	/* Initialize so->currPos for the first page (page in so->currPos.buf) */
 	if (so->needPrimScan)
@@ -2253,8 +2275,6 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
 	 */
 	if (_bt_readpage(scan, dir, offnum, true))
 	{
-		Relation	rel = scan->indexRelation;
-
 		/*
 		 * _bt_readpage succeeded.  Drop the lock (and maybe the pin) on
 		 * so->currPos.buf in preparation for btgettuple returning tuples.
@@ -2265,14 +2285,17 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
 	}
 
 	/* There's no actually-matching data on the page in so->currPos.buf */
-	_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
+	_bt_relbuf(rel, so->currPos.buf);
+	so->currPos.buf = InvalidBuffer;
 
-	/* Call _bt_readnextpage using its _bt_steppage wrapper function */
-	if (!_bt_steppage(scan, dir))
-		return false;
+	/* Walk to the next page with data */
+	if (ScanDirectionIsForward(dir))
+		blkno = so->currPos.nextPage;
+	else
+		blkno = so->currPos.prevPage;
+	lastcurrblkno = so->currPos.currPage;
 
-	/* _bt_readpage for a later page (now in so->currPos) succeeded */
-	return true;
+	return _bt_readnextpage(scan, blkno, lastcurrblkno, dir, false);
 }
 
 /*
@@ -2408,6 +2431,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
 
 		/* no matching tuples on this page */
 		_bt_relbuf(rel, so->currPos.buf);
+		so->currPos.buf = InvalidBuffer;
 		seized = false;			/* released by _bt_readpage (or by us) */
 	}
 
-- 
2.49.0

