From 045942933860b18a26e4c8239b3df350a23caab9 Mon Sep 17 00:00:00 2001
From: Matthias van de Meent <boekewurm+postgres@gmail.com>
Date: Thu, 20 Mar 2025 23:12:25 +0100
Subject: [PATCH v12 4/5] NBTree: Reduce Index-Only Scan pin duration

Previously, an index-only scan would keep a pin on every leaf page
while returning that page's tuples to the executor.  With this patch,
we use the newly introduced table_index_vischeck_tuples API to
pre-check the visibility of all TIDs on the page, which lets us unpin
the page well before all of its index tuples have been returned and
processed.  This reduces the time VACUUM may have to wait for a pin,
and can improve performance by removing redundant VM checks.
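
In rough outline, the new index-only scan path in
_bt_drop_lock_and_maybe_pin does the following (simplified sketch;
firstItem stands for sp->firstItem, and the growth of the scan-local
check array, assertions, and the non-MVCC and unlogged-index cases
are elided):

    TM_IndexVisibilityCheckOp visCheck;

    /* describe the page's matching TIDs to the table AM */
    visCheck.checkntids = ntids;
    visCheck.checktids = so->vischecksbuf;
    visCheck.vmbuf = &so->vmbuf;    /* VM buffer cached in the scan */

    for (int i = 0; i < ntids; i++)
        PopulateTMVischeck(&so->vischecksbuf[i],
                           &so->currPos.items[firstItem + i].heapTid,
                           firstItem + i);

    /* one call resolves all TIDs against the visibility map */
    table_index_vischeck_tuples(scan->heapRelation, &visCheck);

    /* copy results back into the scan's item array... */
    for (int i = 0; i < ntids; i++)
        so->currPos.items[so->vischecksbuf[i].idxoffnum].visrecheck =
            so->vischecksbuf[i].vischeckresult;

    /* ...after which the leaf page pin can be released early */
    ReleaseBuffer(so->currPos.buf);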
---
 src/backend/access/nbtree/nbtree.c    |  21 +++++
 src/backend/access/nbtree/nbtsearch.c | 125 ++++++++++++++++++++++++--
 src/include/access/nbtree.h           |   6 ++
 3 files changed, 147 insertions(+), 5 deletions(-)
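
For reviewers: with the pre-checks in place, the early-unpin test at
the bottom of _bt_drop_lock_and_maybe_pin is equivalent to this
predicate (illustrative helper only; the patch keeps the test inline):

    static inline bool
    _bt_can_unpin_early(IndexScanDesc scan)
    {
        /* MVCC snapshot: no pin coupling needed for keyspace coverage */
        return IsMVCCSnapshot(scan->xs_snapshot) &&
        /* WAL-logged index: the page LSN can detect concurrent changes */
            RelationNeedsWAL(scan->indexRelation) &&
        /* plain scan, or IOS whose visibility results are already cached */
            (!scan->xs_want_itup || DEBUG_IOS_VISCHECKS_ENABLED);
    }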

diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index accc7fe8bbe..1f44d17e3b4 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -360,6 +360,10 @@ btbeginscan(Relation rel, int nkeys, int norderbys)
 	so->killedItems = NULL;		/* until needed */
 	so->numKilled = 0;
 
+	so->vmbuf = InvalidBuffer;
+	so->vischeckcap = 0;
+	so->vischecksbuf = NULL;
+
 	/*
 	 * We don't know yet whether the scan will be index-only, so we do not
 	 * allocate the tuple workspace arrays until btrescan.  However, we set up
@@ -400,6 +404,12 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 	BTScanPosUnpinIfPinned(so->markPos);
 	BTScanPosInvalidate(so->markPos);
 
+	if (BufferIsValid(so->vmbuf))
+	{
+		ReleaseBuffer(so->vmbuf);
+		so->vmbuf = InvalidBuffer;
+	}
+
 	/*
 	 * Allocate tuple workspace arrays, if needed for an index-only scan and
 	 * not already done in a previous rescan call.  To save on palloc
@@ -451,6 +461,17 @@ btendscan(IndexScanDesc scan)
 	so->markItemIndex = -1;
 	BTScanPosUnpinIfPinned(so->markPos);
 
+	if (so->vischecksbuf)
+		pfree(so->vischecksbuf);
+	so->vischecksbuf = NULL;
+	so->vischeckcap = 0;
+
+	if (BufferIsValid(so->vmbuf))
+	{
+		ReleaseBuffer(so->vmbuf);
+		so->vmbuf = InvalidBuffer;
+	}
+
 	/* No need to invalidate positions, the RAM is about to be freed. */
 
 	/* Release storage */
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 77264ddeecb..ed173bf7246 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -25,7 +25,7 @@
 #include "utils/rel.h"
 
 
-static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
+static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp, BTScanOpaque so);
 static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key,
 							Buffer buf, bool forupdate, BTStack stack,
 							int access);
@@ -54,6 +54,12 @@ static Buffer _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno,
 static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
 
 
+/*
+ * Perform visibility checks at the index AM level?
+ * Enabled by default; allows the pre-check code path to be disabled easily.
+ */
+#define DEBUG_IOS_VISCHECKS_ENABLED true
+
 /*
  *	_bt_drop_lock_and_maybe_pin()
  *
@@ -64,13 +70,109 @@ static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
  * See nbtree/README section on making concurrent TID recycling safe.
  */
 static void
-_bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
+_bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp, BTScanOpaque so)
 {
 	_bt_unlockbuf(scan->indexRelation, sp->buf);
 
+	/*
+	 * If this is an index-only scan, do the visibility checks for this
+	 * page's tuples now, allowing us to drop the pin on the page before
+	 * we have returned all of its tuples to the executor.
+	 */
+	if (scan->xs_want_itup && DEBUG_IOS_VISCHECKS_ENABLED)
+	{
+		int		initOffset = sp->firstItem;
+		int		ntids = 1 + sp->lastItem - initOffset;
+
+		if (ntids > 0)
+		{
+			TM_IndexVisibilityCheckOp visCheck;
+			Relation	heaprel = scan->heapRelation;
+			TM_VisCheck *check;
+			BTScanPosItem *item;
+
+			visCheck.checkntids = ntids;
+
+			if (so->vischeckcap == 0)
+			{
+				so->vischecksbuf = palloc_array(TM_VisCheck, ntids);
+				so->vischeckcap = ntids;
+			}
+			else if (so->vischeckcap < ntids)
+			{
+				so->vischecksbuf = repalloc_array(so->vischecksbuf,
+												  TM_VisCheck, ntids);
+				so->vischeckcap = ntids;
+			}
+
+			visCheck.checktids = so->vischecksbuf;
+			visCheck.vmbuf = &so->vmbuf;
+
+			check = so->vischecksbuf;
+			item = &so->currPos.items[initOffset];
+
+			for (int i = 0; i < visCheck.checkntids; i++)
+			{
+				Assert(item->visrecheck == TMVC_Unchecked);
+				Assert(ItemPointerIsValid(&item->heapTid));
+
+				PopulateTMVischeck(check, &item->heapTid, initOffset + i);
+
+				item++;
+				check++;
+			}
+
+			table_index_vischeck_tuples(heaprel, &visCheck);
+			check = so->vischecksbuf;
+
+			for (int i = 0; i < visCheck.checkntids; i++)
+			{
+				item = &so->currPos.items[check->idxoffnum];
+				/* We must have a valid visibility check result */
+				Assert(check->vischeckresult != TMVC_Unchecked);
+				/* The offset number should still indicate the right item */
+				Assert(check->tidblkno == ItemPointerGetBlockNumberNoCheck(&item->heapTid));
+				Assert(check->tidoffset == ItemPointerGetOffsetNumberNoCheck(&item->heapTid));
+
+				/* Store the visibility check result */
+				item->visrecheck = check->vischeckresult;
+				check++;
+			}
+		}
+	}
+
+	/*
+	 * We may need to hold a pin on the page for one of several reasons:
+	 *
+	 * 1. To safely apply kill_prior_tuple, we need to know that the tuples
+	 * were not removed from the page (and subsequently re-inserted).  A
+	 * page's LSN can also allow us to detect modifications to the page,
+	 * which then allows us to bail out of setting the hint bits, but that
+	 * requires the index to be WAL-logged; so unless the index is WAL-logged
+	 * we must hold a pin on the page to apply the kill_prior_tuple
+	 * optimization.
+	 *
+	 * 2. Non-MVCC scans need pin coupling to make sure the scan covers
+	 * exactly the whole index keyspace.
+	 *
+	 * 3. For index-only scans, the scan needs to check the visibility of the
+	 * table tuple while the relevant index tuple is guaranteed to still be
+	 * contained in the index (so that vacuum hasn't yet marked any pages
+	 * that could contain the value as ALL_VISIBLE after reclaiming a dead
+	 * tuple that might still be buffered in the scan).  A pin must
+	 * therefore be held at least while the basic visibility of the page's
+	 * tuples is being checked.
+	 *
+	 * For cases 1 and 2, we must keep holding the pin after we've finished
+	 * processing the index page.
+	 *
+	 * For case 3, we can release the pin early if we first execute the
+	 * visibility checks of the to-be-returned tuples using
+	 * table_index_vischeck_tuples, which we've done just above.
+	 */
 	if (IsMVCCSnapshot(scan->xs_snapshot) &&
 		RelationNeedsWAL(scan->indexRelation) &&
-		!scan->xs_want_itup)
+		(!scan->xs_want_itup || DEBUG_IOS_VISCHECKS_ENABLED))
 	{
 		ReleaseBuffer(sp->buf);
 		sp->buf = InvalidBuffer;
@@ -2001,6 +2103,8 @@ _bt_saveitem(BTScanOpaque so, int itemIndex,
 
 	currItem->heapTid = itup->t_tid;
 	currItem->indexOffset = offnum;
+	currItem->visrecheck = TMVC_Unchecked;
+
 	if (so->currTuples)
 	{
 		Size		itupsz = IndexTupleSize(itup);
@@ -2031,6 +2135,8 @@ _bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
 
 	currItem->heapTid = *heapTid;
 	currItem->indexOffset = offnum;
+	currItem->visrecheck = TMVC_Unchecked;
+
 	if (so->currTuples)
 	{
 		/* Save base IndexTuple (truncate posting list) */
@@ -2067,6 +2173,7 @@ _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
 
 	currItem->heapTid = *heapTid;
 	currItem->indexOffset = offnum;
+	currItem->visrecheck = TMVC_Unchecked;
 
 	/*
 	 * Have index-only scans return the same base IndexTuple for every TID
@@ -2092,6 +2199,14 @@ _bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
 
 	/* Return next item, per amgettuple contract */
 	scan->xs_heaptid = currItem->heapTid;
+
+	if (scan->xs_want_itup)
+	{
+		scan->xs_visrecheck = currItem->visrecheck;
+		Assert(currItem->visrecheck != TMVC_Unchecked ||
+			   BufferIsValid(so->currPos.buf));
+	}
+
 	if (so->currTuples)
 		scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
 }
@@ -2250,7 +2365,7 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
 		 * so->currPos.buf in preparation for btgettuple returning tuples.
 		 */
 		Assert(BTScanPosIsPinned(so->currPos));
-		_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+		_bt_drop_lock_and_maybe_pin(scan, &so->currPos, so);
 		return true;
 	}
 
@@ -2407,7 +2522,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
 	 */
 	Assert(so->currPos.currPage == blkno);
 	Assert(BTScanPosIsPinned(so->currPos));
-	_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+	_bt_drop_lock_and_maybe_pin(scan, &so->currPos, so);
 
 	return true;
 }
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index ebca02588d3..f9ced4a1f0b 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -957,6 +957,7 @@ typedef struct BTScanPosItem	/* what we remember about each match */
 	ItemPointerData heapTid;	/* TID of referenced heap item */
 	OffsetNumber indexOffset;	/* index item's location within page */
 	LocationIndex tupleOffset;	/* IndexTuple's offset in workspace, if any */
+	uint8		visrecheck;		/* visibility recheck status, if any */
 } BTScanPosItem;
 
 typedef struct BTScanPosData
@@ -1071,6 +1072,11 @@ typedef struct BTScanOpaqueData
 	int		   *killedItems;	/* currPos.items indexes of killed items */
 	int			numKilled;		/* number of currently stored items */
 
+	/* used for index-only scan visibility prechecks */
+	Buffer		vmbuf;			/* reusable visibility map buffer */
+	int			vischeckcap;	/* capacity of vischecksbuf */
+	TM_VisCheck *vischecksbuf;	/* single allocation to save on alloc overhead */
+
 	/*
 	 * If we are doing an index-only scan, these are the tuple storage
 	 * workspaces for the currPos and markPos respectively.  Each is of size
-- 
2.48.1

