From 6f33846eae0417e46772f4cafe493c3f558c4ad1 Mon Sep 17 00:00:00 2001
From: Matthias van de Meent <boekewurm+postgres@gmail.com>
Date: Fri, 25 Apr 2025 14:57:06 +0200
Subject: [PATCH v01 1/4] Expose visibility checking shim for index usage

This will be the backbone of a fix for visibility issues in gist and
sp-gist's index-only scan code in later commits.

The issue to be solved is index scans caching tuples in memory that
are being removed by a concurrently running vacuum; to the point that
the VACUUM process has removed the tuple and marked its page as
ALL_VISIBLE by the time the IOS code returns that tuple through
amgettuple.

While the structure of IndexScanDesc is updated, this new field is
located in an alignment gap, thus making this new field safe to use.
---
 src/include/access/relscan.h             |  1 +
 src/include/access/tableam.h             | 53 +++++++++++++++
 src/backend/access/index/indexam.c       |  6 ++
 src/backend/executor/nodeIndexonlyscan.c | 83 ++++++++++++++++--------
 src/backend/utils/adt/selfuncs.c         | 36 +++++++---
 5 files changed, 143 insertions(+), 36 deletions(-)

diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h
index 521043304ab..ba694ac4d1c 100644
--- a/src/include/access/relscan.h
+++ b/src/include/access/relscan.h
@@ -150,6 +150,7 @@ typedef struct IndexScanDescData
 	IndexFetchTableData *xs_heapfetch;
 
 	bool		xs_recheck;		/* T means scan keys must be rechecked */
+	uint8		xs_visrecheck;	/* TMVC_Result; see tableam.h */
 
 	/*
 	 * When fetching with an ordering operator, the values of the ORDER BY
diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h
index 7be7887b4a8..000779ac984 100644
--- a/src/include/access/tableam.h
+++ b/src/include/access/tableam.h
@@ -24,6 +24,7 @@
 #include "storage/read_stream.h"
 #include "utils/rel.h"
 #include "utils/snapshot.h"
+#include "visibilitymap.h"
 
 
 #define DEFAULT_TABLE_ACCESS_METHOD	"heap"
@@ -256,6 +257,28 @@ typedef struct TM_IndexDeleteOp
 	TM_IndexStatus *status;
 } TM_IndexDeleteOp;
 
+/*
+ * Output of table_index_vischeck_tuple()
+ *
+ * Index-only scans need to know the visibility of the associated table tuples
+ * before they can return the index tuple.  If the index tuple is known to be
+ * visible with a cheap check, we can return it directly without requesting
+ * the visibility info from the table AM itself.
+ *
+ * This AM API exposes a cheap visibility checking API to indexes, allowing
+ * these indexes to check multiple tuples' worth of visibility info at once,
+ * and allowing the AM to store these checks, improving the pinning ergonomics
+ * of index AMs by allowing a scan to cache index tuples in memory without
+ * holding pins on those index tuples' pages until the index tuples are
+ * returned.
+ */
+typedef enum TMVC_Result
+{
+	TMVC_Unchecked,
+	TMVC_MaybeVisible,
+	TMVC_Visible,
+} TMVC_Result;
+
 /* "options" flag bits for table_tuple_insert */
 /* TABLE_INSERT_SKIP_WAL was 0x0001; RelationNeedsWAL() now governs */
 #define TABLE_INSERT_SKIP_FSM		0x0002
@@ -1359,6 +1382,36 @@ table_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate)
 	return rel->rd_tableam->index_delete_tuples(rel, delstate);
 }
 
+/*
+ * Determine rough visibility information of an index tuple based on its TID.
+ *
+ * Returns TMVC_Visible when the table tuple this TID points to is certainly
+ * visible to all transactions, or TMVC_MaybeVisible when a visit to the
+ * table is still required to decide visibility, at low IO overhead.  For
+ * the heap AM, the implementation is effectively a wrapper around
+ * VM_ALL_VISIBLE.
+ *
+ * The caller-supplied *vmbuffer caches the visibility map page pinned by
+ * the check; the caller is responsible for releasing it when done.
+ *
+ * Note that there is no value for "definitely dead" tuples, as the Heap AM
+ * doesn't have an efficient method to determine that a tuple is dead to all
+ * users, as it would have to go into the heap.  If and when AMs are built
+ * that would support VM checks with an equivalent to VM_ALL_DEAD this
+ * decision can be reconsidered.
+ */
+static inline TMVC_Result
+table_index_vischeck_tuple(Relation rel, Buffer *vmbuffer, ItemPointer tid)
+{
+	BlockNumber	blkno = ItemPointerGetBlockNumberNoCheck(tid);
+	TMVC_Result	res;
+
+	if (VM_ALL_VISIBLE(rel, blkno, vmbuffer))
+		res = TMVC_Visible;
+	else
+		res = TMVC_MaybeVisible;
+
+	return res;
+}
 
 /* ----------------------------------------------------------------------------
  *  Functions for manipulations of physical tuples.
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index dcd04b813d8..2e7e484066b 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -581,6 +581,12 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction)
 	/* XXX: we should assert that a snapshot is pushed or registered */
 	Assert(TransactionIdIsValid(RecentXmin));
 
+	/*
+	 * Reset xs_visrecheck, so we don't confuse the next tuple's visibility
+	 * state with that of the previous.
+	 */
+	scan->xs_visrecheck = TMVC_Unchecked;
+
 	/*
 	 * The AM's amgettuple proc finds the next index entry matching the scan
 	 * keys, and puts the TID into scan->xs_heaptid.  It should also set
diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c
index b49194c0167..18818a58f92 100644
--- a/src/backend/executor/nodeIndexonlyscan.c
+++ b/src/backend/executor/nodeIndexonlyscan.c
@@ -120,6 +120,7 @@ IndexOnlyNext(IndexOnlyScanState *node)
 	while ((tid = index_getnext_tid(scandesc, direction)) != NULL)
 	{
 		bool		tuple_from_heap = false;
+		TMVC_Result	vischeck = scandesc->xs_visrecheck;
 
 		CHECK_FOR_INTERRUPTS();
 
@@ -127,6 +128,9 @@ IndexOnlyNext(IndexOnlyScanState *node)
 		 * We can skip the heap fetch if the TID references a heap page on
 		 * which all tuples are known visible to everybody.  In any case,
 		 * we'll use the index tuple not the heap tuple as the data source.
+		 * The index may have already pre-checked the visibility of the tuple
+		 * for us, and stored the result in xs_visrecheck, in which case we
+		 * can skip the call.
 		 *
 		 * Note on Memory Ordering Effects: visibilitymap_get_status does not
 		 * lock the visibility map buffer, and therefore the result we read
@@ -156,37 +160,60 @@ IndexOnlyNext(IndexOnlyScanState *node)
 		 *
 		 * It's worth going through this complexity to avoid needing to lock
 		 * the VM buffer, which could cause significant contention.
+		 *
+		 * The index doing these checks for us doesn't materially change these
+		 * considerations.
 		 */
-		if (!VM_ALL_VISIBLE(scandesc->heapRelation,
-							ItemPointerGetBlockNumber(tid),
-							&node->ioss_VMBuffer))
-		{
-			/*
-			 * Rats, we have to visit the heap to check visibility.
-			 */
-			InstrCountTuples2(node, 1);
-			if (!index_fetch_heap(scandesc, node->ioss_TableSlot))
-				continue;		/* no visible tuple, try next index entry */
+		if (vischeck == TMVC_Unchecked)
+			vischeck = table_index_vischeck_tuple(scandesc->heapRelation,
+												  &node->ioss_VMBuffer,
+												  tid);
 
-			ExecClearTuple(node->ioss_TableSlot);
-
-			/*
-			 * Only MVCC snapshots are supported here, so there should be no
-			 * need to keep following the HOT chain once a visible entry has
-			 * been found.  If we did want to allow that, we'd need to keep
-			 * more state to remember not to call index_getnext_tid next time.
-			 */
-			if (scandesc->xs_heap_continue)
-				elog(ERROR, "non-MVCC snapshots are not supported in index-only scans");
+		Assert(vischeck != TMVC_Unchecked);
 
-			/*
-			 * Note: at this point we are holding a pin on the heap page, as
-			 * recorded in scandesc->xs_cbuf.  We could release that pin now,
-			 * but it's not clear whether it's a win to do so.  The next index
-			 * entry might require a visit to the same heap page.
-			 */
-
-			tuple_from_heap = true;
+		switch (vischeck)
+		{
+			case TMVC_Unchecked:
+				elog(ERROR, "Failed to check visibility for tuple");
+				/*
+				 * In case of compilers that don't understand that elog(ERROR)
+				 * doesn't return, and which have -Wimplicit-fallthrough:
+				 */
+				/* fallthrough */
+			case TMVC_MaybeVisible:
+			{
+				/*
+				 * Rats, we have to visit the heap to check visibility.
+				 */
+				InstrCountTuples2(node, 1);
+				if (!index_fetch_heap(scandesc, node->ioss_TableSlot))
+					continue;	/* no visible tuple, try next index entry */
+
+				ExecClearTuple(node->ioss_TableSlot);
+
+				/*
+				 * Only MVCC snapshots are supported here, so there should be
+				 * no need to keep following the HOT chain once a visible
+				 * entry has been found.  If we did want to allow that, we'd
+				 * need to keep more state to remember not to call
+				 * index_getnext_tid next time.
+				 */
+				if (scandesc->xs_heap_continue)
+					elog(ERROR, "non-MVCC snapshots are not supported in index-only scans");
+
+				/*
+				 * Note: at this point we are holding a pin on the heap page,
+				 * as recorded in scandesc->xs_cbuf.  We could release that
+				 * pin now, but it's not clear whether it's a win to do so.
+				 * The next index entry might require a visit to the same heap
+				 * page.
+				 */
+
+				tuple_from_heap = true;
+				break;
+			}
+			case TMVC_Visible:
+				break;
 		}
 
 		/*
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index f420517961f..e608b6a9309 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -6364,15 +6364,29 @@ get_actual_variable_endpoint(Relation heapRel,
 	/* Set it up for index-only scan */
 	index_scan->xs_want_itup = true;
 	index_rescan(index_scan, scankeys, 1, NULL, 0);
+	index_scan->xs_visrecheck = TMVC_Unchecked;
 
 	/* Fetch first/next tuple in specified direction */
 	while ((tid = index_getnext_tid(index_scan, indexscandir)) != NULL)
 	{
 		BlockNumber block = ItemPointerGetBlockNumber(tid);
+		TMVC_Result visres = index_scan->xs_visrecheck;
 
-		if (!VM_ALL_VISIBLE(heapRel,
-							block,
-							&vmbuffer))
+		if (visres == TMVC_Unchecked)
+			visres = table_index_vischeck_tuple(heapRel, &vmbuffer, tid);
+
+		Assert(visres != TMVC_Unchecked);
+
+		switch (visres)
+		{
+		case TMVC_Unchecked:
+			elog(ERROR, "Failed to check visibility for tuple");
+			/*
+			 * In case of compilers that don't understand that elog(ERROR)
+			 * doesn't return, and which have -Wimplicit-fallthrough:
+			 */
+			/* fallthrough */
+		case TMVC_MaybeVisible:
 		{
 			/* Rats, we have to visit the heap to check visibility */
 			if (!index_fetch_heap(index_scan, tableslot))
@@ -6382,10 +6396,11 @@ get_actual_variable_endpoint(Relation heapRel,
 				 * advance to the next entry.  Before doing so, count heap
 				 * page fetches and give up if we've done too many.
 				 *
-				 * We don't charge a page fetch if this is the same heap page
-				 * as the previous tuple.  This is on the conservative side,
-				 * since other recently-accessed pages are probably still in
-				 * buffers too; but it's good enough for this heuristic.
+				 * We don't charge a page fetch if this is the same heap
+				 * page as the previous tuple.  This is on the
+				 * conservative side, since other recently-accessed pages
+				 * are probably still in buffers too; but it's good enough
+				 * for this heuristic.
 				 */
 #define VISITED_PAGES_LIMIT 100
 
@@ -6394,7 +6409,7 @@ get_actual_variable_endpoint(Relation heapRel,
 					last_heap_block = block;
 					n_visited_heap_pages++;
 					if (n_visited_heap_pages > VISITED_PAGES_LIMIT)
-						break;
+						goto exit_loop;
 				}
 
 				continue;		/* no visible tuple, try next index entry */
@@ -6407,6 +6422,10 @@ get_actual_variable_endpoint(Relation heapRel,
 			 * We don't care whether there's more than one visible tuple in
 			 * the HOT chain; if any are visible, that's good enough.
 			 */
+			break;
+		}
+		case TMVC_Visible:
+			break;
 		}
 
 		/*
@@ -6435,6 +6454,7 @@ get_actual_variable_endpoint(Relation heapRel,
 		have_data = true;
 		break;
 	}
+exit_loop:
 
 	if (vmbuffer != InvalidBuffer)
 		ReleaseBuffer(vmbuffer);
-- 
2.45.2

