From 318822f0d2848fb3a3400dd2ea5f2c0400f9505b Mon Sep 17 00:00:00 2001
From: nkey <michail.nikolaev@gmail.com>
Date: Wed, 12 Mar 2025 00:53:02 +0100
Subject: [PATCH v6 2/2] Fix btree index scan concurrency issues with dirty
 snapshots

This patch addresses an issue where non-MVCC index scans using SnapshotDirty
or SnapshotSelf could miss tuples due to concurrent modifications. The fix
retains the read lock on a page for these special snapshot types until the
scan is done with all of the page's tuples, so that concurrent modifications
cannot cause inconsistent results.
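
To make the affected case concrete, here is a minimal sketch of a btree
lookup under a dirty snapshot (illustrative only: heapRel, indexRel, and the
int4 equality key are assumptions, and the index AM calls follow the
PG 17-era signatures):

    SnapshotData    snap;
    ScanKeyData     skey;
    IndexScanDesc   scan;
    TupleTableSlot *slot;

    /* non-MVCC snapshot of the kind this fix applies to */
    InitDirtySnapshot(snap);

    /* equality qual on the first index column (assumed int4) */
    ScanKeyInit(&skey, 1, BTEqualStrategyNumber, F_INT4EQ, Int32GetDatum(42));

    slot = table_slot_create(heapRel, NULL);
    scan = index_beginscan(heapRel, indexRel, &snap, 1, 0);
    index_rescan(scan, &skey, 1, NULL, 0);

    /*
     * Without the fix, a concurrent DELETE + INSERT landing on an
     * already-visited leaf page could leave this loop seeing neither the
     * old nor the new version of the row.
     */
    while (index_getnext_slot(scan, ForwardScanDirection, slot))
    {
        /* process the matching tuple */
    }

    index_endscan(scan);
    ExecDropSingleTupleTableSlot(slot);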

Updated README to document this special case in the btree locking mechanism.
---
 src/backend/access/nbtree/README      | 21 ++++++++++++++++++++-
 src/backend/access/nbtree/nbtree.c    | 18 ++++++++++++++++++
 src/backend/access/nbtree/nbtsearch.c | 35 +++++++++++++++++++++++----
 src/backend/access/nbtree/nbtutils.c  |  8 +++++-
 src/include/access/nbtree.h           |  5 +++++
 5 files changed, 80 insertions(+), 7 deletions(-)

diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README
index 53d4a61dc3f..a9280415633 100644
--- a/src/backend/access/nbtree/README
+++ b/src/backend/access/nbtree/README
@@ -85,7 +85,8 @@ move right until we find a page whose right-link matches the page we
 came from.  (Actually, it's even harder than that; see page deletion
 discussion below.)
 
-Page read locks are held only for as long as a scan is examining a page.
+Page read locks are held only for as long as a scan is examining a page
+(with an exception for SnapshotDirty and SnapshotSelf scans; see below).
 To minimize lock/unlock traffic, an index scan always searches a leaf page
 to identify all the matching items at once, copying their heap tuple IDs
 into backend-local storage.  The heap tuple IDs are then processed while
@@ -103,6 +104,24 @@ We also remember the left-link, and follow it when the scan moves backwards
 (though this requires extra handling to account for concurrent splits of
 the left sibling; see detailed move-left algorithm below).
 
+Despite these mechanisms, non-MVCC scans (SnapshotDirty and SnapshotSelf)
+may still return inconsistent results.  The problem arises when a concurrent
+transaction deletes a tuple and inserts a replacement with a new TID on the
+same page.  If the scan has already visited the page and cached its contents
+in backend-local storage, it will skip the old tuple because it was deleted,
+and it will miss the new tuple because the page is not re-read.  To address
+this, SnapshotDirty and SnapshotSelf scans retain the read lock on the page
+until the scan is completely done processing the page's tuples, preventing
+concurrent modifications that could lead to inconsistent results.
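+
+An illustrative interleaving (hypothetical TIDs t1 and t2):
+
+	scanner: reads the page, caches all matching items, releases the lock
+	writer:  deletes the matching tuple t1, inserts a replacement tuple
+	         with new TID t2 on the same page, and commits
+	scanner: works through its cached items; t1 fails the snapshot check,
+	         and t2 was never cached, so the replacement tuple is missed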
+
 In most cases we release our lock and pin on a page before attempting
 to acquire pin and lock on the page we are moving to.  In a few places
 it is necessary to lock the next page before releasing the current one.
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 765659887af..8239076c518 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -389,6 +389,12 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 		/* Before leaving current page, deal with any killed items */
 		if (so->numKilled > 0)
 			_bt_killitems(scan);
+		/* Release any extended lock held for SnapshotDirty/Self scans */
+		if (so->currPos.extra_unlock)
+		{
+			_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
+			so->currPos.extra_unlock = false;
+		}
 		BTScanPosUnpinIfPinned(so->currPos);
 		BTScanPosInvalidate(so->currPos);
 	}
@@ -445,6 +451,12 @@ btendscan(IndexScanDesc scan)
 		/* Before leaving current page, deal with any killed items */
 		if (so->numKilled > 0)
 			_bt_killitems(scan);
+		/* Release any extended lock held for SnapshotDirty/Self scans */
+		if (so->currPos.extra_unlock)
+		{
+			_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
+			so->currPos.extra_unlock = false;
+		}
 		BTScanPosUnpinIfPinned(so->currPos);
 	}
 
@@ -525,6 +537,12 @@ btrestrpos(IndexScanDesc scan)
 			/* Before leaving current page, deal with any killed items */
 			if (so->numKilled > 0)
 				_bt_killitems(scan);
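+			/* Release any extended lock held for SnapshotDirty/Self scans */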
+			if (so->currPos.extra_unlock)
+			{
+				_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
+				so->currPos.extra_unlock = false;
+			}
 			BTScanPosUnpinIfPinned(so->currPos);
 		}
 
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index fe9a3886913..2c5a34cacfe 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -61,11 +61,22 @@ static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
  * This will prevent vacuum from stalling in a blocked state trying to read a
  * page when a cursor is sitting on it.
  *
+ * For SnapshotDirty and SnapshotSelf scans, we don't actually unlock the buffer
+ * here, but instead set extra_unlock to indicate that the lock should be held
+ * until we're completely done with this page. This prevents concurrent
+ * modifications from causing inconsistent results during non-MVCC scans.
+ *
  * See nbtree/README section on making concurrent TID recycling safe.
  */
 static void
 _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
 {
+	if (scan->xs_snapshot->snapshot_type == SNAPSHOT_DIRTY ||
+		scan->xs_snapshot->snapshot_type == SNAPSHOT_SELF)
+	{
+		sp->extra_unlock = true;
+		return;
+	}
 	_bt_unlockbuf(scan->indexRelation, sp->buf);
 
 	if (IsMVCCSnapshot(scan->xs_snapshot) &&
@@ -1527,7 +1538,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
  *	_bt_next() -- Get the next item in a scan.
  *
  *		On entry, so->currPos describes the current page, which may be pinned
- *		but is not locked, and so->currPos.itemIndex identifies which item was
- *		previously returned.
+ *		but is not locked (except for SnapshotDirty and SnapshotSelf scans,
+ *		where the page remains locked), and so->currPos.itemIndex identifies
+ *		which item was previously returned.
  *
  *		On success exit, so->currPos is updated as needed, and _bt_returnitem
@@ -2107,10 +2119,11 @@ _bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
  *
  * Wrapper on _bt_readnextpage that performs final steps for the current page.
  *
- * On entry, if so->currPos.buf is valid the buffer is pinned but not locked.
- * If there's no pin held, it's because _bt_drop_lock_and_maybe_pin dropped
- * the pin eagerly earlier on.  The scan must have so->currPos.currPage set to
- * a valid block, in any case.
+ * On entry, if so->currPos.buf is valid the buffer is pinned but not locked,
+ * except for SnapshotDirty and SnapshotSelf scans, where the buffer remains
+ * locked until we're done with all tuples from the page.  If there's no pin
+ * held, _bt_drop_lock_and_maybe_pin dropped the pin eagerly earlier on.  The
+ * scan must have so->currPos.currPage set to a valid block, in any case.
  */
 static bool
 _bt_steppage(IndexScanDesc scan, ScanDirection dir)
@@ -2169,8 +2182,20 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
 
 		/* mark/restore not supported by parallel scans */
 		Assert(!scan->parallel_scan);
+		Assert(scan->xs_snapshot->snapshot_type != SNAPSHOT_DIRTY);
+		Assert(scan->xs_snapshot->snapshot_type != SNAPSHOT_SELF);
 	}
 
+	/*
+	 * For SnapshotDirty/Self scans, we kept the read lock after processing
+	 * the page's tuples (see _bt_drop_lock_and_maybe_pin).  Now that we're
+	 * moving to another page, we need to explicitly release that lock.
+	 */
+	if (so->currPos.extra_unlock)
+	{
+		_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
+		so->currPos.extra_unlock = false;
+	}
 	BTScanPosUnpinIfPinned(so->currPos);
 
 	/* Walk to the next page with data */
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 1a15dfcb7d3..61008a36b5d 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -3383,13 +3383,17 @@ _bt_killitems(IndexScanDesc scan)
 		 * LSN.
 		 */
 		droppedpin = false;
-		_bt_lockbuf(scan->indexRelation, so->currPos.buf, BT_READ);
+		/* For SnapshotDirty/Self scans, the buffer is already locked */
+		if (!so->currPos.extra_unlock)
+			_bt_lockbuf(scan->indexRelation, so->currPos.buf, BT_READ);
 
 		page = BufferGetPage(so->currPos.buf);
 	}
 	else
 	{
 		Buffer		buf;
+		/* extra_unlock should never be set without a valid buffer pin */
+		Assert(!so->currPos.extra_unlock);
 
 		droppedpin = true;
 		/* Attempt to re-read the buffer, getting pin and lock. */
@@ -3526,6 +3530,8 @@ _bt_killitems(IndexScanDesc scan)
 	}
 
 	_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
+	/* Reset the extra_unlock flag since we've now released the lock */
+	so->currPos.extra_unlock = false;
 }
 
 
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index ebca02588d3..2c2485f34bd 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -962,6 +962,8 @@ typedef struct BTScanPosItem	/* what we remember about each match */
 typedef struct BTScanPosData
 {
 	Buffer		buf;			/* currPage buf (invalid means unpinned) */
+	bool		extra_unlock;	/* SnapshotDirty/Self scans keep the read
+							 * lock after _bt_drop_lock_and_maybe_pin */
 
 	/* page details as of the saved position's call to _bt_readpage */
 	BlockNumber currPage;		/* page referenced by items array */
@@ -1009,6 +1010,8 @@ typedef BTScanPosData *BTScanPos;
 )
 #define BTScanPosUnpin(scanpos) \
 	do { \
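+		/* the extended SnapshotDirty/Self lock must be released first */ \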
+		Assert(!(scanpos).extra_unlock); \
 		ReleaseBuffer((scanpos).buf); \
 		(scanpos).buf = InvalidBuffer; \
 	} while (0)
@@ -1028,6 +1030,7 @@ typedef BTScanPosData *BTScanPos;
 	do { \
 		(scanpos).buf = InvalidBuffer; \
 		(scanpos).currPage = InvalidBlockNumber; \
+		(scanpos).extra_unlock = false; \
 	} while (0)
 
 /* We need one of these for each equality-type SK_SEARCHARRAY scan key */
-- 
2.43.0

