From 0474c4028e644fa99fa4dc1cf2479220782e7a30 Mon Sep 17 00:00:00 2001
From: Matthias van de Meent <boekewurm+postgres@gmail.com>
Date: Thu, 20 Mar 2025 23:12:25 +0100
Subject: [PATCH v10 4/5] NBTree: Reduce Index-Only Scan pinning requirements

Previously, we would keep a pin on every leaf page while we were returning
tuples to the scan.  With this patch, we utilize the newly introduced
table_index_vischeck_tuples API to pre-check visibility of all TIDs, and
thus can unpin the page well before we would otherwise have finished returning
and processing all index tuple results.  This reduces the time VACUUM may
have to wait for a pin, and can increase performance with reduced redundant
VM checks.
---
 src/include/access/nbtree.h           |   4 ++
 src/backend/access/nbtree/nbtree.c    |   5 ++
 src/backend/access/nbtree/nbtsearch.c | 100 ++++++++++++++++++++++++--
 3 files changed, 103 insertions(+), 6 deletions(-)

diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 0c43767f8c3..2423ddf7bfd 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -951,6 +951,7 @@ typedef struct BTScanPosItem	/* what we remember about each match */
 	ItemPointerData heapTid;	/* TID of referenced heap item */
 	OffsetNumber indexOffset;	/* index item's location within page */
 	LocationIndex tupleOffset;	/* IndexTuple's offset in workspace, if any */
+	uint8		visrecheck;		/* visibility recheck status, if any */
 } BTScanPosItem;
 
 typedef struct BTScanPosData
@@ -1053,6 +1054,9 @@ typedef struct BTScanOpaqueData
 	int		   *killedItems;	/* currPos.items indexes of killed items */
 	int			numKilled;		/* number of currently stored items */
 
+	/* buffer used for index-only scan visibility checks */
+	Buffer		vmbuf;
+
 	/*
 	 * If we are doing an index-only scan, these are the tuple storage
 	 * workspaces for the currPos and markPos respectively.  Each is of size
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index c0a8833e068..e7b7f7cfa4c 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -345,6 +345,8 @@ btbeginscan(Relation rel, int nkeys, int norderbys)
 	so->killedItems = NULL;		/* until needed */
 	so->numKilled = 0;
 
+	so->vmbuf = InvalidBuffer;
+
 	/*
 	 * We don't know yet whether the scan will be index-only, so we do not
 	 * allocate the tuple workspace arrays until btrescan.  However, we set up
@@ -436,6 +438,9 @@ btendscan(IndexScanDesc scan)
 	so->markItemIndex = -1;
 	BTScanPosUnpinIfPinned(so->markPos);
 
+	if (BufferIsValid(so->vmbuf))
+		ReleaseBuffer(so->vmbuf);
+
 	/* No need to invalidate positions, the RAM is about to be freed. */
 
 	/* Release storage */
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 22b27d01d00..7445ee9daeb 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -25,7 +25,7 @@
 #include "utils/rel.h"
 
 
-static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
+static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp, Buffer *vmbuf);
 static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key,
 							Buffer buf, bool forupdate, BTStack stack,
 							int access);
@@ -64,13 +64,88 @@ static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
  * See nbtree/README section on making concurrent TID recycling safe.
  */
 static void
-_bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
+_bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp, Buffer *vmbuf)
 {
 	_bt_unlockbuf(scan->indexRelation, sp->buf);
 
+	/*
+	 * Do some visibility checks if this is an index-only scan, allowing us to
+	 * drop the pin on this page before we have returned all tuples from this
+	 * IOS to the executor.
+	 */
+	if (scan->xs_want_itup)
+	{
+		TM_IndexVisibilityCheckOp visCheck;
+		int		offset = sp->firstItem;
+
+		visCheck.nchecktids = 1 + sp->lastItem - offset;
+		visCheck.checktids = palloc_array(TM_VisCheck,
+										  visCheck.nchecktids);
+		visCheck.vmbuf = vmbuf;
+
+		for (int i = 0; i < visCheck.nchecktids; i++)
+		{
+			int		itemidx = offset + i;
+
+			Assert(sp->items[itemidx].visrecheck == TMVC_Unchecked);
+			Assert(ItemPointerIsValid(&sp->items[itemidx].heapTid));
+
+			visCheck.checktids[i].tid = sp->items[itemidx].heapTid;
+			visCheck.checktids[i].idxoffnum = itemidx;
+			visCheck.checktids[i].vischeckresult = TMVC_Unchecked;
+		}
+
+		table_index_vischeck_tuples(scan->heapRelation, &visCheck);
+
+		for (int i = 0; i < visCheck.nchecktids; i++)
+		{
+			TM_VisCheck *check = &visCheck.checktids[i];
+			BTScanPosItem *item = &sp->items[check->idxoffnum];
+
+			/* We must have a valid visibility check result */
+			Assert(check->vischeckresult != TMVC_Unchecked);
+			/* The offset number should still indicate the right item */
+			Assert(ItemPointerEquals(&check->tid, &item->heapTid));
+
+			/* Store the visibility check result */
+			item->visrecheck = check->vischeckresult;
+		}
+
+		/* release temporary resources */
+		pfree(visCheck.checktids);
+	}
+
+	/*
+	 * We may need to hold a pin on the page for one of several reasons:
+	 *
+	 * 1.) To safely apply kill_prior_tuple, we need to know that the tuples
+	 * were not removed from the page (and subsequently re-inserted).
+	 * A page's LSN can also allow us to detect modifications on the page,
+	 * which then allows us to bail out of setting the hint bits, but that
+	 * requires the index to be WAL-logged; so unless the index is WAL-logged
+	 * we must hold a pin on the page to apply the kill_prior_tuple
+	 * optimization.
+	 *
+	 * 2.) Non-MVCC scans need pin coupling to make sure the scan covers
+	 * exactly the whole index keyspace.
+	 *
+	 * 3.) For Index-Only Scans, the scan needs to check the visibility of the
+	 * table tuple while the relevant index tuple is guaranteed to still be
+	 * contained in the index (so that vacuum hasn't yet marked any pages that
+	 * could contain the value as ALL_VISIBLE after reclaiming a dead tuple
+	 * that might be buffered in the scan).  A pin must therefore be held
+	 * at least while the basic visibility of the page's tuples is being
+	 * checked.
+	 *
+	 * For cases 1 and 2, we must hold the pin after we've finished processing
+	 * the index page.
+	 *
+	 * For case 3, we can release the pin if we first do the visibility checks
+	 * of to-be-returned tuples using table_index_vischeck_tuples, which we've
+	 * done just above.
+	 */
 	if (IsMVCCSnapshot(scan->xs_snapshot) &&
-		RelationNeedsWAL(scan->indexRelation) &&
-		!scan->xs_want_itup)
+		RelationNeedsWAL(scan->indexRelation))
 	{
 		ReleaseBuffer(sp->buf);
 		sp->buf = InvalidBuffer;
@@ -1904,6 +1979,8 @@ _bt_saveitem(BTScanOpaque so, int itemIndex,
 
 	currItem->heapTid = itup->t_tid;
 	currItem->indexOffset = offnum;
+	currItem->visrecheck = TMVC_Unchecked;
+
 	if (so->currTuples)
 	{
 		Size		itupsz = IndexTupleSize(itup);
@@ -1934,6 +2011,8 @@ _bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
 
 	currItem->heapTid = *heapTid;
 	currItem->indexOffset = offnum;
+	currItem->visrecheck = TMVC_Unchecked;
+
 	if (so->currTuples)
 	{
 		/* Save base IndexTuple (truncate posting list) */
@@ -1970,6 +2049,7 @@ _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
 
 	currItem->heapTid = *heapTid;
 	currItem->indexOffset = offnum;
+	currItem->visrecheck = TMVC_Unchecked;
 
 	/*
 	 * Have index-only scans return the same base IndexTuple for every TID
@@ -1995,6 +2075,14 @@ _bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
 
 	/* Return next item, per amgettuple contract */
 	scan->xs_heaptid = currItem->heapTid;
+
+	if (scan->xs_want_itup)
+	{
+		scan->xs_visrecheck = currItem->visrecheck;
+		Assert(currItem->visrecheck != TMVC_Unchecked ||
+			   BufferIsValid(so->currPos.buf));
+	}
+
 	if (so->currTuples)
 		scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
 }
@@ -2153,7 +2241,7 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
 		 * so->currPos.buf in preparation for btgettuple returning tuples.
 		 */
 		Assert(BTScanPosIsPinned(so->currPos));
-		_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+		_bt_drop_lock_and_maybe_pin(scan, &so->currPos, &so->vmbuf);
 		return true;
 	}
 
@@ -2310,7 +2398,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
 	 */
 	Assert(so->currPos.currPage == blkno);
 	Assert(BTScanPosIsPinned(so->currPos));
-	_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+	_bt_drop_lock_and_maybe_pin(scan, &so->currPos, &so->vmbuf);
 
 	return true;
 }
-- 
2.45.2

