From c113d0c4f7d4bd7209c06fa73f2153140d8afb9c Mon Sep 17 00:00:00 2001
From: Matthias van de Meent <boekewurm+postgres@gmail.com>
Date: Fri, 7 Mar 2025 17:39:23 +0100
Subject: [PATCH v10 1/5] IOS/TableAM: Support AM-specific fast visibility
 tests

Previously, we assumed VM_ALL_VISIBLE is universal across all AMs. This
is probably not the case, so we introduce a new table method called
"table_index_vischeck_tuples" which allows anyone to ask the AM whether
a tuple is definitely visible to everyone or might be invisible to
someone.

The API is intended to replace direct calls to VM_ALL_VISIBLE and as such
doesn't include "definitely dead to everyone", as the Heap AM's VM doesn't
support *definitely dead* as output for its lookups; and thus it would be
too expensive for the Heap AM to produce such results.

A future commit will use this inside GIST and SP-GIST to fix a race
condition between IOS and VACUUM, which causes a bug with tuple
visibility, and a further patch will add support for this to nbtree.
---
 src/include/access/heapam.h              |  2 +
 src/include/access/relscan.h             |  5 ++
 src/include/access/tableam.h             | 73 +++++++++++++++++++++
 src/backend/access/heap/heapam.c         | 64 ++++++++++++++++++
 src/backend/access/heap/heapam_handler.c |  1 +
 src/backend/access/index/indexam.c       |  6 ++
 src/backend/access/table/tableamapi.c    |  1 +
 src/backend/executor/nodeIndexonlyscan.c | 83 ++++++++++++++++--------
 src/backend/utils/adt/selfuncs.c         | 76 +++++++++++++---------
 9 files changed, 254 insertions(+), 57 deletions(-)

diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index 1640d9c32f7..a820f150509 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -378,6 +378,8 @@ extern void simple_heap_update(Relation relation, ItemPointer otid,
 
 extern TransactionId heap_index_delete_tuples(Relation rel,
 											  TM_IndexDeleteOp *delstate);
+extern void heap_index_vischeck_tuples(Relation rel,
+									   TM_IndexVisibilityCheckOp *checkop);
 
 /* in heap/pruneheap.c */
 struct GlobalVisState;
diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h
index b5e0fb386c0..93a6f65ab0e 100644
--- a/src/include/access/relscan.h
+++ b/src/include/access/relscan.h
@@ -26,6 +26,9 @@
 
 struct ParallelTableScanDescData;
 
+enum TMVC_Result;
+
+
 /*
  * Generic descriptor for table scans. This is the base-class for table scans,
  * which needs to be embedded in the scans of individual AMs.
@@ -176,6 +179,8 @@ typedef struct IndexScanDescData
 
 	bool		xs_recheck;		/* T means scan keys must be rechecked */
 
+	int			xs_visrecheck;	/* TMVC_Result from tableam.h */
+
 	/*
 	 * When fetching with an ordering operator, the values of the ORDER BY
 	 * expressions of the last returned tuple, according to the index.  If
diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h
index b8cb1e744ad..2dbdb9287f1 100644
--- a/src/include/access/tableam.h
+++ b/src/include/access/tableam.h
@@ -255,6 +255,49 @@ typedef struct TM_IndexDeleteOp
 	TM_IndexStatus *status;
 } TM_IndexDeleteOp;
 
+/*
+ * State used when calling table_index_vischeck_tuples()
+ *
+ * Index-only scans need to know the visibility of the associated table tuples
+ * before they can return the index tuple.  If the index tuple is known to be
+ * visible with a cheap check, we can return it directly without requesting
+ * the visibility info from the table AM.
+ *
+ * This AM API exposes a cheap visibility checking API to indexes, allowing
+ * these indexes to check multiple tuples worth of visibility info at once,
+ * and allowing the AM to store these checks, improving the pinning ergonomics
+ * of index AMs by allowing a scan to cache index tuples in memory without
+ * holding pins on index tuples' pages until the index tuples are returned.
+ *
+ * The AM is called with a list of TIDs, and its output will indicate the
+ * visibility state of each tuple: Unchecked, MaybeVisible, or Visible.
+ *
+ * HeapAM's implementation of visibility maps only allows for cheap checks of
+ * *definitely visible*; all other results are *maybe visible*. A result for
+ * *definitely not visible* aka dead is currently not included, because no
+ * table AM supports such visibility lookups cheaply.
+ */
+typedef enum TMVC_Result
+{
+	TMVC_Unchecked,
+	TMVC_MaybeVisible,
+	TMVC_Visible,
+} TMVC_Result;
+
+typedef struct TM_VisCheck
+{
+	ItemPointerData	tid;			/* table TID from index tuple */
+	OffsetNumber	idxoffnum;		/* identifier for the TID in this call */
+	TMVC_Result		vischeckresult;	/* output of the visibilitycheck */
+} TM_VisCheck;
+
+typedef struct TM_IndexVisibilityCheckOp
+{
+	int			nchecktids;			/* number of TIDs to check */
+	Buffer	   *vmbuf;				/* pointer to VM buffer to reuse across calls */
+	TM_VisCheck *checktids;			/* the checks to execute */
+} TM_IndexVisibilityCheckOp;
+
 /* "options" flag bits for table_tuple_insert */
 /* TABLE_INSERT_SKIP_WAL was 0x0001; RelationNeedsWAL() now governs */
 #define TABLE_INSERT_SKIP_FSM		0x0002
@@ -501,6 +544,10 @@ typedef struct TableAmRoutine
 	TransactionId (*index_delete_tuples) (Relation rel,
 										  TM_IndexDeleteOp *delstate);
 
+	/* see table_index_vischeck_tuples() */
+	void		(*index_vischeck_tuples) (Relation rel,
+										  TM_IndexVisibilityCheckOp *checkop);
+
 
 	/* ------------------------------------------------------------------------
 	 * Manipulations of physical tuples.
@@ -1328,6 +1375,32 @@ table_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate)
 	return rel->rd_tableam->index_delete_tuples(rel, delstate);
 }
 
+static inline void
+table_index_vischeck_tuples(Relation rel, TM_IndexVisibilityCheckOp *checkop)
+{
+	return rel->rd_tableam->index_vischeck_tuples(rel, checkop);
+}
+
+static inline TMVC_Result
+table_index_vischeck_tuple(Relation rel, Buffer *vmbuffer, ItemPointer tid)
+{
+	TM_IndexVisibilityCheckOp checkOp;
+	TM_VisCheck		op;
+
+	op.idxoffnum = 0;
+	op.tid = *tid;
+	op.vischeckresult = TMVC_Unchecked;
+	checkOp.checktids = &op;
+	checkOp.nchecktids = 1;
+	checkOp.vmbuf = vmbuffer;
+
+	rel->rd_tableam->index_vischeck_tuples(rel, &checkOp);
+
+	Assert(op.vischeckresult != TMVC_Unchecked);
+
+	return op.vischeckresult;
+}
+
 
 /* ----------------------------------------------------------------------------
  *  Functions for manipulations of physical tuples.
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index b12b583c4d9..a58f19761aa 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -102,6 +102,7 @@ static bool ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status
 									   bool logLockFailure);
 static void index_delete_sort(TM_IndexDeleteOp *delstate);
 static int	bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate);
+static int	heap_cmp_index_vischeck(const void *a, const void *b);
 static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup);
 static HeapTuple ExtractReplicaIdentity(Relation relation, HeapTuple tp, bool key_required,
 										bool *copy);
@@ -8775,6 +8776,69 @@ bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate)
 	return nblocksfavorable;
 }
 
+/*
+ * heapam implementation of tableam's index_vischeck_tuples interface.
+ *
+ * This helper function is called by index AMs during index-only scans,
+ * to do VM-based visibility checks on individual tuples, so that the AM
+ * can hold the tuple in memory (e.g. for reordering) for extended periods
+ * of time without holding thousands of pins that would conflict with VACUUM.
+ *
+ * It's possible for this to generate a fair amount of I/O, since we may be
+ * checking hundreds of tuples from a single index block, but that is
+ * preferred over holding thousands of pins.
+ */
+void
+heap_index_vischeck_tuples(Relation rel, TM_IndexVisibilityCheckOp *checkop)
+{
+	BlockNumber		prevBlk = InvalidBlockNumber;
+	TMVC_Result		lastResult = TMVC_Unchecked;
+	Buffer		   *vmbuf = checkop->vmbuf;
+	TM_VisCheck	   *checkTids = checkop->checktids;
+
+	/*
+	 * Order the TIDs to heap order, so that we will only need to visit every
+	 * VM page at most once.
+	 */
+	if (checkop->nchecktids > 1)
+		qsort(checkTids, checkop->nchecktids, sizeof(TM_VisCheck),
+			  heap_cmp_index_vischeck);
+
+	for (int i = 0; i < checkop->nchecktids; i++)
+	{
+		TM_VisCheck *check = &checkop->checktids[i];
+		ItemPointer	tid = &check->tid;
+		BlockNumber blkno = ItemPointerGetBlockNumber(tid);
+
+		/* Visibility should be checked just once per tuple. */
+		Assert(check->vischeckresult == TMVC_Unchecked);
+
+		if (blkno != prevBlk)
+		{
+			if (VM_ALL_VISIBLE(rel, blkno, vmbuf))
+				lastResult = TMVC_Visible;
+			else
+				lastResult = TMVC_MaybeVisible;
+
+			prevBlk = blkno;
+		}
+
+		check->vischeckresult = lastResult;
+	}
+}
+
+/*
+ * Compare TM_VisChecks for an efficient ordering.
+ */
+static int
+heap_cmp_index_vischeck(const void *a, const void *b)
+{
+	const TM_VisCheck *visa = (const TM_VisCheck *) a;
+	const TM_VisCheck *visb = (const TM_VisCheck *) b;
+	return ItemPointerCompare(unconstify(ItemPointerData *, &visa->tid),
+							  unconstify(ItemPointerData *, &visb->tid));
+}
+
 /*
  * Perform XLogInsert for a heap-visible operation.  'block' is the block
  * being marked all-visible, and vm_buffer is the buffer containing the
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index 4da4dc84580..65c23ff6658 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -2676,6 +2676,7 @@ static const TableAmRoutine heapam_methods = {
 	.tuple_tid_valid = heapam_tuple_tid_valid,
 	.tuple_satisfies_snapshot = heapam_tuple_satisfies_snapshot,
 	.index_delete_tuples = heap_index_delete_tuples,
+	.index_vischeck_tuples = heap_index_vischeck_tuples,
 
 	.relation_set_new_filelocator = heapam_relation_set_new_filelocator,
 	.relation_nontransactional_truncate = heapam_relation_nontransactional_truncate,
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index 55ec4c10352..370e442e24e 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -627,6 +627,12 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction)
 	/* XXX: we should assert that a snapshot is pushed or registered */
 	Assert(TransactionIdIsValid(RecentXmin));
 
+	/*
+	 * Reset xs_visrecheck, so we don't confuse the next tuple's visibility
+	 * state with that of the previous.
+	 */
+	scan->xs_visrecheck = TMVC_Unchecked;
+
 	/*
 	 * The AM's amgettuple proc finds the next index entry matching the scan
 	 * keys, and puts the TID into scan->xs_heaptid.  It should also set
diff --git a/src/backend/access/table/tableamapi.c b/src/backend/access/table/tableamapi.c
index 476663b66aa..b3ce90ceaea 100644
--- a/src/backend/access/table/tableamapi.c
+++ b/src/backend/access/table/tableamapi.c
@@ -61,6 +61,7 @@ GetTableAmRoutine(Oid amhandler)
 	Assert(routine->tuple_get_latest_tid != NULL);
 	Assert(routine->tuple_satisfies_snapshot != NULL);
 	Assert(routine->index_delete_tuples != NULL);
+	Assert(routine->index_vischeck_tuples != NULL);
 
 	Assert(routine->tuple_insert != NULL);
 
diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c
index f464cca9507..e02fc1652ff 100644
--- a/src/backend/executor/nodeIndexonlyscan.c
+++ b/src/backend/executor/nodeIndexonlyscan.c
@@ -121,6 +121,7 @@ IndexOnlyNext(IndexOnlyScanState *node)
 	while ((tid = index_getnext_tid(scandesc, direction)) != NULL)
 	{
 		bool		tuple_from_heap = false;
+		TMVC_Result	vischeck = scandesc->xs_visrecheck;
 
 		CHECK_FOR_INTERRUPTS();
 
@@ -128,6 +129,9 @@ IndexOnlyNext(IndexOnlyScanState *node)
 		 * We can skip the heap fetch if the TID references a heap page on
 		 * which all tuples are known visible to everybody.  In any case,
 		 * we'll use the index tuple not the heap tuple as the data source.
+		 * The index may have already pre-checked the visibility of the tuple
+		 * for us, and stored the result in xs_visrecheck, in which case we
+		 * can skip the call.
 		 *
 		 * Note on Memory Ordering Effects: visibilitymap_get_status does not
 		 * lock the visibility map buffer, and therefore the result we read
@@ -157,37 +161,60 @@ IndexOnlyNext(IndexOnlyScanState *node)
 		 *
 		 * It's worth going through this complexity to avoid needing to lock
 		 * the VM buffer, which could cause significant contention.
+		 *
+		 * The index doing these checks for us doesn't materially change these
+		 * considerations.
 		 */
-		if (!VM_ALL_VISIBLE(scandesc->heapRelation,
-							ItemPointerGetBlockNumber(tid),
-							&node->ioss_VMBuffer))
-		{
-			/*
-			 * Rats, we have to visit the heap to check visibility.
-			 */
-			InstrCountTuples2(node, 1);
-			if (!index_fetch_heap(scandesc, node->ioss_TableSlot))
-				continue;		/* no visible tuple, try next index entry */
+		if (vischeck == TMVC_Unchecked)
+			vischeck = table_index_vischeck_tuple(scandesc->heapRelation,
+												  &node->ioss_VMBuffer,
+												  tid);
 
-			ExecClearTuple(node->ioss_TableSlot);
-
-			/*
-			 * Only MVCC snapshots are supported here, so there should be no
-			 * need to keep following the HOT chain once a visible entry has
-			 * been found.  If we did want to allow that, we'd need to keep
-			 * more state to remember not to call index_getnext_tid next time.
-			 */
-			if (scandesc->xs_heap_continue)
-				elog(ERROR, "non-MVCC snapshots are not supported in index-only scans");
+		Assert(vischeck != TMVC_Unchecked);
 
-			/*
-			 * Note: at this point we are holding a pin on the heap page, as
-			 * recorded in scandesc->xs_cbuf.  We could release that pin now,
-			 * but it's not clear whether it's a win to do so.  The next index
-			 * entry might require a visit to the same heap page.
-			 */
-
-			tuple_from_heap = true;
+		switch (vischeck)
+		{
+			case TMVC_Unchecked:
+				elog(ERROR, "Failed to check visibility for tuple");
+				/*
+				 * In case of compilers that don't understand that elog(ERROR)
+				 * doesn't return, and which have -Wimplicit-fallthrough:
+				 */
+				/* fallthrough */
+			case TMVC_MaybeVisible:
+			{
+				/*
+				 * Rats, we have to visit the heap to check visibility.
+				 */
+				InstrCountTuples2(node, 1);
+				if (!index_fetch_heap(scandesc, node->ioss_TableSlot))
+					continue;	/* no visible tuple, try next index entry */
+
+				ExecClearTuple(node->ioss_TableSlot);
+
+				/*
+				 * Only MVCC snapshots are supported here, so there should be
+				 * no need to keep following the HOT chain once a visible
+				 * entry has been found.  If we did want to allow that, we'd
+				 * need to keep more state to remember not to call
+				 * index_getnext_tid next time.
+				 */
+				if (scandesc->xs_heap_continue)
+					elog(ERROR, "non-MVCC snapshots are not supported in index-only scans");
+
+				/*
+				 * Note: at this point we are holding a pin on the heap page,
+				 * as recorded in scandesc->xs_cbuf.  We could release that
+				 * pin now, but it's not clear whether it's a win to do so.
+				 * The next index entry might require a visit to the same heap
+				 * page.
+				 */
+
+				tuple_from_heap = true;
+				break;
+			}
+			case TMVC_Visible:
+				break;
 		}
 
 		/*
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 5b35debc8ff..7e9ca616a67 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -6561,44 +6561,62 @@ get_actual_variable_endpoint(Relation heapRel,
 	while ((tid = index_getnext_tid(index_scan, indexscandir)) != NULL)
 	{
 		BlockNumber block = ItemPointerGetBlockNumber(tid);
+		TMVC_Result visres = index_scan->xs_visrecheck;
 
-		if (!VM_ALL_VISIBLE(heapRel,
-							block,
-							&vmbuffer))
+		if (visres == TMVC_Unchecked)
+			visres = table_index_vischeck_tuple(heapRel, &vmbuffer, tid);
+
+		Assert(visres != TMVC_Unchecked);
+
+		switch (visres)
 		{
-			/* Rats, we have to visit the heap to check visibility */
-			if (!index_fetch_heap(index_scan, tableslot))
-			{
+			case TMVC_Unchecked:
+				elog(ERROR, "Failed to check visibility for tuple");
 				/*
-				 * No visible tuple for this index entry, so we need to
-				 * advance to the next entry.  Before doing so, count heap
-				 * page fetches and give up if we've done too many.
-				 *
-				 * We don't charge a page fetch if this is the same heap page
-				 * as the previous tuple.  This is on the conservative side,
-				 * since other recently-accessed pages are probably still in
-				 * buffers too; but it's good enough for this heuristic.
+				 * In case of compilers that don't understand that elog(ERROR)
+				 * doesn't return, and which have -Wimplicit-fallthrough:
 				 */
+				/* fallthrough */
+			case TMVC_MaybeVisible:
+			{
+				/* Rats, we have to visit the heap to check visibility */
+				if (!index_fetch_heap(index_scan, tableslot))
+				{
+					/*
+					 * No visible tuple for this index entry, so we need to
+					 * advance to the next entry.  Before doing so, count heap
+					 * page fetches and give up if we've done too many.
+					 *
+					 * We don't charge a page fetch if this is the same heap
+					 * page as the previous tuple.  This is on the
+					 * conservative side, since other recently-accessed pages
+					 * are probably still in buffers too; but it's good enough
+					 * for this heuristic.
+					 */
 #define VISITED_PAGES_LIMIT 100
 
-				if (block != last_heap_block)
-				{
-					last_heap_block = block;
-					n_visited_heap_pages++;
-					if (n_visited_heap_pages > VISITED_PAGES_LIMIT)
-						break;
-				}
+					if (block != last_heap_block)
+					{
+						last_heap_block = block;
+						n_visited_heap_pages++;
+						if (n_visited_heap_pages > VISITED_PAGES_LIMIT)
+							break;
+					}
 
-				continue;		/* no visible tuple, try next index entry */
-			}
+					continue;		/* no visible tuple, try next index entry */
+				}
 
-			/* We don't actually need the heap tuple for anything */
-			ExecClearTuple(tableslot);
+				/* We don't actually need the heap tuple for anything */
+				ExecClearTuple(tableslot);
 
-			/*
-			 * We don't care whether there's more than one visible tuple in
-			 * the HOT chain; if any are visible, that's good enough.
-			 */
+				/*
+				 * We don't care whether there's more than one visible tuple in
+				 * the HOT chain; if any are visible, that's good enough.
+				 */
+				break;
+			}
+			case TMVC_Visible:
+				break;
 		}
 
 		/*
-- 
2.45.2

