From f5d8a31a8a1eaf3761d02b79ce721d399c0860ac Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Mon, 9 Jun 2025 09:57:30 -0400
Subject: [PATCH v3 1/2] Make _bt_killitems drop pins it acquired itself.

Teach nbtree's _bt_killitems to leave the so->currPos page that it sets
LP_DEAD items on in whatever state it was in when _bt_killitems was
called.  In particular, make sure that so->dropPin scans don't acquire a
pin whose reference is saved in so->currPos.buf.

Allowing _bt_killitems to change so->currPos.buf like this is wrong.
The immediate consequence of allowing it is that code in _bt_steppage
(that copies so->currPos into so->markPos) will behave as if the scan is
a !so->dropPin scan.  so->markPos will therefore retain the buffer pin
indefinitely, even though _bt_killitems only needs to acquire a pin
(along with a lock) for long enough to mark known-dead items LP_DEAD.

This issue came to light following a report of a failure of an assertion
from recent commit e6eed40e.  The test case in question involves the use
of mark and restore.  An initial call to _bt_killitems takes place that
leaves so->currPos.buf in a state that is inconsistent with the scan
being so->dropPin.  A subsequent call to _bt_killitems for the same
position (following so->currPos being saved in so->markPos, and then
restored as so->currPos) resulted in the failure of an assertion that
tests that so->currPos.buf is InvalidBuffer when the scan is so->dropPin
(on non-assert builds, we get a buffer pin leak instead).

The same problem exists on earlier releases, though the problem is far
more subtle there.  Recent commit e6eed40e introduced the so->dropPin
field as a partial replacement for testing so->currPos.buf directly.
Earlier releases won't get an assertion failure (or buffer pin leak),
but they will allow the second _bt_killitems call from the test case to
behave as if a buffer pin was consistently held since the original call
to _bt_readpage.  This is wrong; there will have been an initial window
during which no pin was held on the so->currPos page, and yet the second
confused _bt_killitems call neglects to check that so->currPos.lsn still
matches the current page LSN.  As a result of all this, it's just about
possible that _bt_killitems will mark the wrong index tuples LP_DEAD.
This could only happen with merge joins (which are the only user of
nbtree's mark/restore support), and only when a concurrently inserted
index tuple happened to use a recently-recycled TID, and happened to be
inserted onto the same leaf page as a concurrently-VACUUM'd dead tuple.

A follow-up commit will make nbtree completely stop conditioning whether
or not a position's pin needs to be dropped on whether the 'buf' field
is set.  All call sites that might need to drop a still-held pin should
be taught to rely on the scan-level so->dropPin field introduced by
recent commit e6eed40e.  That approach would make bugs like this one
impossible (or, at worst, make them much easier to detect).

Author: Peter Geoghegan <pg@bowt.ie>
Reported-By: Alexander Lakhin <exclusion@gmail.com>
Discussion: https://postgr.es/m/545be1e5-3786-439a-9257-a90d30f8b849@gmail.com
Backpatch-through: 13
---
 src/backend/access/nbtree/nbtree.c   |  2 ++
 src/backend/access/nbtree/nbtutils.c | 21 ++++++++++++---------
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 03a1d7b02..fdff960c1 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -417,6 +417,8 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 	 * way, so we might as well avoid wasting cycles on acquiring page LSNs.
 	 *
 	 * See nbtree/README section on making concurrent TID recycling safe.
+	 *
+	 * Note: so->dropPin should never change across rescans.
 	 */
 	so->dropPin = (!scan->xs_want_itup &&
 				   IsMVCCSnapshot(scan->xs_snapshot) &&
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 29f0dca1b..f3d0556fc 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -3323,9 +3323,9 @@ _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate,
  * current page and killed tuples thereon (generally, this should only be
  * called if so->numKilled > 0).
  *
- * The caller does not have a lock on the page and may or may not have the
- * page pinned in a buffer.  Note that read-lock is sufficient for setting
- * LP_DEAD status (which is only a hint).
+ * Caller should not have a lock on the so->currPos page, but must hold a
+ * buffer pin when !so->dropPin.  When we return, it still won't be locked.
+ * It'll continue to hold whatever pins were held before calling here.
  *
  * We match items by heap TID before assuming they are the right ones to
  * delete.  We cope with cases where items have moved right due to insertions.
@@ -3353,6 +3353,7 @@ _bt_killitems(IndexScanDesc scan)
 	OffsetNumber maxoff;
 	int			numKilled = so->numKilled;
 	bool		killedsomething = false;
+	Buffer		buf;
 
 	Assert(numKilled > 0);
 	Assert(BTScanPosIsValid(so->currPos));
@@ -3369,11 +3370,11 @@ _bt_killitems(IndexScanDesc scan)
 		 * concurrent VACUUMs from recycling any of the TIDs on the page.
 		 */
 		Assert(BTScanPosIsPinned(so->currPos));
-		_bt_lockbuf(rel, so->currPos.buf, BT_READ);
+		buf = so->currPos.buf;
+		_bt_lockbuf(rel, buf, BT_READ);
 	}
 	else
 	{
-		Buffer		buf;
 		XLogRecPtr	latestlsn;
 
 		Assert(!BTScanPosIsPinned(so->currPos));
@@ -3391,10 +3392,9 @@ _bt_killitems(IndexScanDesc scan)
 		}
 
 		/* Unmodified, hinting is safe */
-		so->currPos.buf = buf;
 	}
 
-	page = BufferGetPage(so->currPos.buf);
+	page = BufferGetPage(buf);
 	opaque = BTPageGetOpaque(page);
 	minoff = P_FIRSTDATAKEY(opaque);
 	maxoff = PageGetMaxOffsetNumber(page);
@@ -3511,10 +3511,13 @@ _bt_killitems(IndexScanDesc scan)
 	if (killedsomething)
 	{
 		opaque->btpo_flags |= BTP_HAS_GARBAGE;
-		MarkBufferDirtyHint(so->currPos.buf, true);
+		MarkBufferDirtyHint(buf, true);
 	}
 
-	_bt_unlockbuf(rel, so->currPos.buf);
+	if (!so->dropPin)
+		_bt_unlockbuf(rel, buf);
+	else
+		_bt_relbuf(rel, buf);
 }
 
 
-- 
2.49.0

