From 6bf950901f00bba930738d77ca16831ffdec8dd3 Mon Sep 17 00:00:00 2001
From: Matthias van de Meent <boekewurm+postgres@gmail.com>
Date: Fri, 7 Mar 2025 17:39:23 +0100
Subject: [PATCH v12 1/5] IOS/TableAM: Support AM-specific fast visibility
 tests

Previously, we assumed VM_ALL_VISIBLE applies to all table AMs. This is
probably not the case, so we introduce a new table AM method called
"table_index_vischeck_tuples", which allows a caller to ask the AM
whether a tuple is definitely visible to everyone or might be invisible
to someone.

The API is intended to replace direct calls to VM_ALL_VISIBLE and as such
doesn't include a "definitely dead to everyone" result: the Heap AM's VM
doesn't support *definitely dead* as an output of its lookups, so it would
be too expensive for the Heap AM to produce such results.

A future commit will use this inside GiST and SP-GiST to fix a race
condition between IOS and VACUUM that causes a tuple visibility bug, and
a further patch will add support for this to nbtree.
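
For illustration, an index AM could batch its visibility checks roughly
as follows.  This is only a sketch, not code from this patch: heapRel,
tids, ntids and vmbuf stand in for whatever scan-local state the index
AM keeps, and example_vischeck_batch is a hypothetical helper.

    static void
    example_vischeck_batch(Relation heapRel, ItemPointer tids, int ntids,
                           Buffer *vmbuf)
    {
        TM_IndexVisibilityCheckOp checkop;
        TM_VisCheck *checks = palloc(ntids * sizeof(TM_VisCheck));

        for (int i = 0; i < ntids; i++)
            PopulateTMVischeck(&checks[i], &tids[i], i);

        checkop.checktids = checks;
        checkop.checkntids = ntids;
        checkop.vmbuf = vmbuf;      /* VM pin is reused across batches */

        table_index_vischeck_tuples(heapRel, &checkop);

        /* results come back in index (idxoffnum) order */
        for (int i = 0; i < ntids; i++)
        {
            if (checks[i].vischeckresult == TMVC_Visible)
                continue;       /* definitely visible: skip heap fetch */

            /* TMVC_MaybeVisible: must visit the heap to check visibility */
        }

        pfree(checks);
    }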
---
 src/backend/access/heap/heapam.c         | 177 +++++++++++++++++++++++
 src/backend/access/heap/heapam_handler.c |   1 +
 src/backend/access/heap/visibilitymap.c  |  39 ++---
 src/backend/access/index/indexam.c       |   6 +
 src/backend/access/table/tableamapi.c    |   1 +
 src/backend/executor/nodeIndexonlyscan.c |  83 +++++++----
 src/backend/utils/adt/selfuncs.c         |  76 ++++++----
 src/include/access/heapam.h              |   2 +
 src/include/access/relscan.h             |   5 +
 src/include/access/tableam.h             | 103 +++++++++++++
 src/include/access/visibilitymapdefs.h   |  19 +++
 11 files changed, 430 insertions(+), 82 deletions(-)

diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index c1a4de14a59..34acd2c06c0 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -101,11 +101,37 @@ static bool ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status
 									   uint16 infomask, Relation rel, int *remaining,
 									   bool logLockFailure);
 static void index_delete_sort(TM_IndexDeleteOp *delstate);
+static inline int heap_ivc_process_block(Relation rel, Buffer *vmbuf,
+										 TM_VisCheck *checks, int nchecks);
+static void heap_ivc_process_all(Relation rel, Buffer *vmbuf,
+								 TM_VisCheck *checks, int nchecks);
 static int	bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate);
 static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup);
 static HeapTuple ExtractReplicaIdentity(Relation relation, HeapTuple tp, bool key_required,
 										bool *copy);
 
+/* sort template definitions for TM_VisCheck arrays */
+#define ST_SORT heap_ivc_sortby_tidheapblk
+#define ST_ELEMENT_TYPE TM_VisCheck
+#define ST_DECLARE
+#define ST_DEFINE
+#define ST_SCOPE static inline
+#define ST_COMPARE(a, b) ( \
+	a->tidblkno < b->tidblkno ? -1 : ( \
+		a->tidblkno > b->tidblkno ? 1 : 0 \
+	) \
+)
+
+#include "lib/sort_template.h"
+
+#define ST_SORT heap_ivc_sortby_idx
+#define ST_ELEMENT_TYPE TM_VisCheck
+#define ST_DECLARE
+#define ST_DEFINE
+#define ST_SCOPE static inline
+#define ST_COMPARE(a, b) (((int) a->idxoffnum) - ((int) b->idxoffnum))
+#include "lib/sort_template.h"
+
 
 /*
  * Each tuple lock mode has a corresponding heavyweight lock, and one or two
@@ -8750,6 +8776,157 @@ bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate)
 	return nblocksfavorable;
 }
 
+/*
+ * heapam implementation of tableam's index_vischeck_tuples interface.
+ *
+ * This helper function is called by index AMs during index-only scans,
+ * to do VM-based visibility checks on individual tuples, so that the AM
+ * can hold tuples in memory (e.g. for reordering) for extended periods
+ * of time without holding thousands of pins that conflict with VACUUM.
+ *
+ * It's possible for this to generate a fair amount of I/O, since we may be
+ * checking hundreds of tuples from a single index block, but that is
+ * preferable to holding thousands of pins.
+ *
+ * We use heuristics to balance the costs of sorting TIDs with VM page
+ * lookups.
+ */
+void
+heap_index_vischeck_tuples(Relation rel, TM_IndexVisibilityCheckOp *checkop)
+{
+	Buffer			vmbuf = *checkop->vmbuf;
+	Buffer			storvmbuf = vmbuf;
+	TM_VisCheck	   *checks = checkop->checktids;
+	int				checkntids = checkop->checkntids;
+	int				upcomingvmbufchanges = 0;
+
+	/*
+	 * The first index scan will have to pin the VM buffer, and that first
+	 * change in the VM buffer shouldn't put us onto the expensive VM page &
+	 * sort path, so we special-case this first operation.
+	 */
+	if (!BufferIsValid(vmbuf))
+	{
+		int			processed;
+		processed = heap_ivc_process_block(rel, &vmbuf, checks, checkntids);
+		checkntids -= processed;
+		checks += processed;
+		storvmbuf = vmbuf;
+		Assert(processed > 0);
+	}
+
+	while (vmbuf == storvmbuf && checkntids > 0)
+	{
+		int			processed;
+
+		processed = heap_ivc_process_block(rel, &vmbuf, checks, checkntids);
+
+		Assert(processed <= checkntids);
+
+		checkntids -= processed;
+		checks += processed;
+	}
+
+	*checkop->vmbuf = vmbuf;
+
+	if (checkntids == 0)
+	{
+		return;
+	}
+
+	upcomingvmbufchanges = 0;
+
+	for (int i = 1; i < checkntids; i++)
+	{
+		/*
+		 * Instead of storing the previous iteration's result, we just
+		 * compare the block numbers of consecutive TIDs.
+		 */
+		BlockNumber lastblkno = checks[i - 1].tidblkno;
+		BlockNumber newblkno = checks[i].tidblkno;
+		/*
+		 * divide-by-constant can be faster than BufferGetBlockNumber()
+		 */
+		BlockNumber lastvmblkno = HEAPBLK_TO_VMBLOCK(lastblkno);
+		BlockNumber newvmblkno = HEAPBLK_TO_VMBLOCK(newblkno);
+
+		if (lastvmblkno != newvmblkno)
+			upcomingvmbufchanges++;
+	}
+
+	if (upcomingvmbufchanges <= pg_ceil_log2_32(checkntids))
+	{
+		/*
+		 * Only few VM buffer changes ahead, so do all visibility checks
+		 * without sorting.
+		 */
+		heap_ivc_process_all(rel, checkop->vmbuf, checks, checkntids);
+
+		return;
+	}
+
+	/*
+	 * Order the TIDs to heap order, so that we will only need to visit every
+	 * VM page at most once.
+	 */
+	heap_ivc_sortby_tidheapblk(checks, checkntids);
+
+	/* do all visibility checks */
+	heap_ivc_process_all(rel, checkop->vmbuf, checks, checkntids);
+
+	/* put the checks back in index order */
+	heap_ivc_sortby_idx(checks, checkntids);
+}
+
+
+static inline int
+heap_ivc_process_block(Relation rel, Buffer *vmbuf, TM_VisCheck *checks,
+					   int nchecks)
+{
+	BlockNumber	blkno;
+	BlockNumber	prevblkno = blkno = checks->tidblkno;
+	TMVC_Result	result;
+	int			processed = 0;
+
+	if (VM_ALL_VISIBLE(rel, blkno, vmbuf))
+		result = TMVC_Visible;
+	else
+		result = TMVC_MaybeVisible;
+
+	do
+	{
+		checks->vischeckresult = result;
+
+		nchecks--;
+		processed++;
+		checks++;
+
+		if (nchecks <= 0)
+			return processed;
+
+		blkno = checks->tidblkno;
+	} while (blkno == prevblkno);
+
+	return processed;
+}
+
+static void
+heap_ivc_process_all(Relation rel, Buffer *vmbuf,
+					 TM_VisCheck *checks, int nchecks)
+{
+	while (nchecks > 0)
+	{
+		int			processed;
+
+		processed = heap_ivc_process_block(rel, vmbuf, checks, nchecks);
+
+		Assert(processed <= nchecks);
+
+		nchecks -= processed;
+		checks += processed;
+	}
+}
+
 /*
  * Perform XLogInsert for a heap-visible operation.  'block' is the block
  * being marked all-visible, and vm_buffer is the buffer containing the
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index ac082fefa77..fe4b0b39da7 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -2648,6 +2648,7 @@ static const TableAmRoutine heapam_methods = {
 	.tuple_tid_valid = heapam_tuple_tid_valid,
 	.tuple_satisfies_snapshot = heapam_tuple_satisfies_snapshot,
 	.index_delete_tuples = heap_index_delete_tuples,
+	.index_vischeck_tuples = heap_index_vischeck_tuples,
 
 	.relation_set_new_filelocator = heapam_relation_set_new_filelocator,
 	.relation_nontransactional_truncate = heapam_relation_nontransactional_truncate,
diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
index 745a04ef26e..ae71c0a6d6e 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -107,17 +107,6 @@
  */
 #define MAPSIZE (BLCKSZ - MAXALIGN(SizeOfPageHeaderData))
 
-/* Number of heap blocks we can represent in one byte */
-#define HEAPBLOCKS_PER_BYTE (BITS_PER_BYTE / BITS_PER_HEAPBLOCK)
-
-/* Number of heap blocks we can represent in one visibility map page. */
-#define HEAPBLOCKS_PER_PAGE (MAPSIZE * HEAPBLOCKS_PER_BYTE)
-
-/* Mapping from heap block number to the right bit in the visibility map */
-#define HEAPBLK_TO_MAPBLOCK(x) ((x) / HEAPBLOCKS_PER_PAGE)
-#define HEAPBLK_TO_MAPBYTE(x) (((x) % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE)
-#define HEAPBLK_TO_OFFSET(x) (((x) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK)
-
 /* Masks for counting subsets of bits in the visibility map. */
 #define VISIBLE_MASK8	(0x55)	/* The lower bit of each bit pair */
 #define FROZEN_MASK8	(0xaa)	/* The upper bit of each bit pair */
@@ -137,9 +126,9 @@ static Buffer vm_extend(Relation rel, BlockNumber vm_nblocks);
 bool
 visibilitymap_clear(Relation rel, BlockNumber heapBlk, Buffer vmbuf, uint8 flags)
 {
-	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
-	int			mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
-	int			mapOffset = HEAPBLK_TO_OFFSET(heapBlk);
+	BlockNumber mapBlock = HEAPBLK_TO_VMBLOCK(heapBlk);
+	int			mapByte = HEAPBLK_TO_VMBYTE(heapBlk);
+	int			mapOffset = HEAPBLK_TO_VMOFFSET(heapBlk);
 	uint8		mask = flags << mapOffset;
 	char	   *map;
 	bool		cleared = false;
@@ -190,7 +179,7 @@ visibilitymap_clear(Relation rel, BlockNumber heapBlk, Buffer vmbuf, uint8 flags
 void
 visibilitymap_pin(Relation rel, BlockNumber heapBlk, Buffer *vmbuf)
 {
-	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
+	BlockNumber mapBlock = HEAPBLK_TO_VMBLOCK(heapBlk);
 
 	/* Reuse the old pinned buffer if possible */
 	if (BufferIsValid(*vmbuf))
@@ -214,7 +203,7 @@ visibilitymap_pin(Relation rel, BlockNumber heapBlk, Buffer *vmbuf)
 bool
 visibilitymap_pin_ok(BlockNumber heapBlk, Buffer vmbuf)
 {
-	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
+	BlockNumber mapBlock = HEAPBLK_TO_VMBLOCK(heapBlk);
 
 	return BufferIsValid(vmbuf) && BufferGetBlockNumber(vmbuf) == mapBlock;
 }
@@ -247,9 +236,9 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
 				  XLogRecPtr recptr, Buffer vmBuf, TransactionId cutoff_xid,
 				  uint8 flags)
 {
-	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
-	uint32		mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
-	uint8		mapOffset = HEAPBLK_TO_OFFSET(heapBlk);
+	BlockNumber mapBlock = HEAPBLK_TO_VMBLOCK(heapBlk);
+	uint32		mapByte = HEAPBLK_TO_VMBYTE(heapBlk);
+	uint8		mapOffset = HEAPBLK_TO_VMOFFSET(heapBlk);
 	Page		page;
 	uint8	   *map;
 	uint8		status;
@@ -340,9 +329,9 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
 uint8
 visibilitymap_get_status(Relation rel, BlockNumber heapBlk, Buffer *vmbuf)
 {
-	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
-	uint32		mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
-	uint8		mapOffset = HEAPBLK_TO_OFFSET(heapBlk);
+	BlockNumber mapBlock = HEAPBLK_TO_VMBLOCK(heapBlk);
+	uint32		mapByte = HEAPBLK_TO_VMBYTE(heapBlk);
+	uint8		mapOffset = HEAPBLK_TO_VMOFFSET(heapBlk);
 	char	   *map;
 	uint8		result;
 
@@ -445,9 +434,9 @@ visibilitymap_prepare_truncate(Relation rel, BlockNumber nheapblocks)
 	BlockNumber newnblocks;
 
 	/* last remaining block, byte, and bit */
-	BlockNumber truncBlock = HEAPBLK_TO_MAPBLOCK(nheapblocks);
-	uint32		truncByte = HEAPBLK_TO_MAPBYTE(nheapblocks);
-	uint8		truncOffset = HEAPBLK_TO_OFFSET(nheapblocks);
+	BlockNumber truncBlock = HEAPBLK_TO_VMBLOCK(nheapblocks);
+	uint32		truncByte = HEAPBLK_TO_VMBYTE(nheapblocks);
+	uint8		truncOffset = HEAPBLK_TO_VMOFFSET(nheapblocks);
 
 #ifdef TRACE_VISIBILITYMAP
 	elog(DEBUG1, "vm_truncate %s %d", RelationGetRelationName(rel), nheapblocks);
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index 219df1971da..61d1f08220d 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -628,6 +628,12 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction)
 	/* XXX: we should assert that a snapshot is pushed or registered */
 	Assert(TransactionIdIsValid(RecentXmin));
 
+	/*
+	 * Reset xs_visrecheck, so we don't confuse the next tuple's visibility
+	 * state with that of the previous.
+	 */
+	scan->xs_visrecheck = TMVC_Unchecked;
+
 	/*
 	 * The AM's amgettuple proc finds the next index entry matching the scan
 	 * keys, and puts the TID into scan->xs_heaptid.  It should also set
diff --git a/src/backend/access/table/tableamapi.c b/src/backend/access/table/tableamapi.c
index 476663b66aa..b3ce90ceaea 100644
--- a/src/backend/access/table/tableamapi.c
+++ b/src/backend/access/table/tableamapi.c
@@ -61,6 +61,7 @@ GetTableAmRoutine(Oid amhandler)
 	Assert(routine->tuple_get_latest_tid != NULL);
 	Assert(routine->tuple_satisfies_snapshot != NULL);
 	Assert(routine->index_delete_tuples != NULL);
+	Assert(routine->index_vischeck_tuples != NULL);
 
 	Assert(routine->tuple_insert != NULL);
 
diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c
index f464cca9507..e02fc1652ff 100644
--- a/src/backend/executor/nodeIndexonlyscan.c
+++ b/src/backend/executor/nodeIndexonlyscan.c
@@ -121,6 +121,7 @@ IndexOnlyNext(IndexOnlyScanState *node)
 	while ((tid = index_getnext_tid(scandesc, direction)) != NULL)
 	{
 		bool		tuple_from_heap = false;
+		TMVC_Result	vischeck = scandesc->xs_visrecheck;
 
 		CHECK_FOR_INTERRUPTS();
 
@@ -128,6 +129,9 @@ IndexOnlyNext(IndexOnlyScanState *node)
 		 * We can skip the heap fetch if the TID references a heap page on
 		 * which all tuples are known visible to everybody.  In any case,
 		 * we'll use the index tuple not the heap tuple as the data source.
+		 * The index may have already pre-checked the visibility of the tuple
+		 * for us, and stored the result in xs_visrecheck, in which case we
+		 * can skip the call.
 		 *
 		 * Note on Memory Ordering Effects: visibilitymap_get_status does not
 		 * lock the visibility map buffer, and therefore the result we read
@@ -157,37 +161,60 @@ IndexOnlyNext(IndexOnlyScanState *node)
 		 *
 		 * It's worth going through this complexity to avoid needing to lock
 		 * the VM buffer, which could cause significant contention.
+		 *
+		 * The index doing these checks for us doesn't materially change these
+		 * considerations.
 		 */
-		if (!VM_ALL_VISIBLE(scandesc->heapRelation,
-							ItemPointerGetBlockNumber(tid),
-							&node->ioss_VMBuffer))
-		{
-			/*
-			 * Rats, we have to visit the heap to check visibility.
-			 */
-			InstrCountTuples2(node, 1);
-			if (!index_fetch_heap(scandesc, node->ioss_TableSlot))
-				continue;		/* no visible tuple, try next index entry */
+		if (vischeck == TMVC_Unchecked)
+			vischeck = table_index_vischeck_tuple(scandesc->heapRelation,
+												  &node->ioss_VMBuffer,
+												  tid);
 
-			ExecClearTuple(node->ioss_TableSlot);
-
-			/*
-			 * Only MVCC snapshots are supported here, so there should be no
-			 * need to keep following the HOT chain once a visible entry has
-			 * been found.  If we did want to allow that, we'd need to keep
-			 * more state to remember not to call index_getnext_tid next time.
-			 */
-			if (scandesc->xs_heap_continue)
-				elog(ERROR, "non-MVCC snapshots are not supported in index-only scans");
+		Assert(vischeck != TMVC_Unchecked);
 
-			/*
-			 * Note: at this point we are holding a pin on the heap page, as
-			 * recorded in scandesc->xs_cbuf.  We could release that pin now,
-			 * but it's not clear whether it's a win to do so.  The next index
-			 * entry might require a visit to the same heap page.
-			 */
-
-			tuple_from_heap = true;
+		switch (vischeck)
+		{
+			case TMVC_Unchecked:
+				elog(ERROR, "failed to check visibility for tuple");
+				/*
+				 * In case of compilers that don't understand that elog(ERROR)
+				 * doesn't return, and which have -Wimplicit-fallthrough:
+				 */
+				/* fallthrough */
+			case TMVC_MaybeVisible:
+			{
+				/*
+				 * Rats, we have to visit the heap to check visibility.
+				 */
+				InstrCountTuples2(node, 1);
+				if (!index_fetch_heap(scandesc, node->ioss_TableSlot))
+					continue;	/* no visible tuple, try next index entry */
+
+				ExecClearTuple(node->ioss_TableSlot);
+
+				/*
+				 * Only MVCC snapshots are supported here, so there should be
+				 * no need to keep following the HOT chain once a visible
+				 * entry has been found.  If we did want to allow that, we'd
+				 * need to keep more state to remember not to call
+				 * index_getnext_tid next time.
+				 */
+				if (scandesc->xs_heap_continue)
+					elog(ERROR, "non-MVCC snapshots are not supported in index-only scans");
+
+				/*
+				 * Note: at this point we are holding a pin on the heap page,
+				 * as recorded in scandesc->xs_cbuf.  We could release that
+				 * pin now, but it's not clear whether it's a win to do so.
+				 * The next index entry might require a visit to the same heap
+				 * page.
+				 */
+
+				tuple_from_heap = true;
+				break;
+			}
+			case TMVC_Visible:
+				break;
 		}
 
 		/*
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index a96b1b9c0bc..035bd7a82be 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -6730,44 +6730,62 @@ get_actual_variable_endpoint(Relation heapRel,
 	while ((tid = index_getnext_tid(index_scan, indexscandir)) != NULL)
 	{
 		BlockNumber block = ItemPointerGetBlockNumber(tid);
+		TMVC_Result visres = index_scan->xs_visrecheck;
 
-		if (!VM_ALL_VISIBLE(heapRel,
-							block,
-							&vmbuffer))
+		if (visres == TMVC_Unchecked)
+			visres = table_index_vischeck_tuple(heapRel, &vmbuffer, tid);
+
+		Assert(visres != TMVC_Unchecked);
+
+		switch (visres)
 		{
-			/* Rats, we have to visit the heap to check visibility */
-			if (!index_fetch_heap(index_scan, tableslot))
-			{
+			case TMVC_Unchecked:
+				elog(ERROR, "failed to check visibility for tuple");
 				/*
-				 * No visible tuple for this index entry, so we need to
-				 * advance to the next entry.  Before doing so, count heap
-				 * page fetches and give up if we've done too many.
-				 *
-				 * We don't charge a page fetch if this is the same heap page
-				 * as the previous tuple.  This is on the conservative side,
-				 * since other recently-accessed pages are probably still in
-				 * buffers too; but it's good enough for this heuristic.
+				 * In case of compilers that don't understand that elog(ERROR)
+				 * doesn't return, and which have -Wimplicit-fallthrough:
 				 */
+				/* fallthrough */
+			case TMVC_MaybeVisible:
+			{
+				/* Rats, we have to visit the heap to check visibility */
+				if (!index_fetch_heap(index_scan, tableslot))
+				{
+					/*
+					 * No visible tuple for this index entry, so we need to
+					 * advance to the next entry.  Before doing so, count heap
+					 * page fetches and give up if we've done too many.
+					 *
+					 * We don't charge a page fetch if this is the same heap
+					 * page as the previous tuple.  This is on the
+					 * conservative side, since other recently-accessed pages
+					 * are probably still in buffers too; but it's good enough
+					 * for this heuristic.
+					 */
 #define VISITED_PAGES_LIMIT 100
 
-				if (block != last_heap_block)
-				{
-					last_heap_block = block;
-					n_visited_heap_pages++;
-					if (n_visited_heap_pages > VISITED_PAGES_LIMIT)
-						break;
-				}
+					if (block != last_heap_block)
+					{
+						last_heap_block = block;
+						n_visited_heap_pages++;
+						if (n_visited_heap_pages > VISITED_PAGES_LIMIT)
+							break;
+					}
 
-				continue;		/* no visible tuple, try next index entry */
-			}
+					continue;		/* no visible tuple, try next index entry */
+				}
 
-			/* We don't actually need the heap tuple for anything */
-			ExecClearTuple(tableslot);
+				/* We don't actually need the heap tuple for anything */
+				ExecClearTuple(tableslot);
 
-			/*
-			 * We don't care whether there's more than one visible tuple in
-			 * the HOT chain; if any are visible, that's good enough.
-			 */
+				/*
+				 * We don't care whether there's more than one visible tuple in
+				 * the HOT chain; if any are visible, that's good enough.
+				 */
+				break;
+			}
+			case TMVC_Visible:
+				break;
 		}
 
 		/*
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index e48fe434cd3..1b66aa0bacc 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -368,6 +368,8 @@ extern void simple_heap_update(Relation relation, ItemPointer otid,
 
 extern TransactionId heap_index_delete_tuples(Relation rel,
 											  TM_IndexDeleteOp *delstate);
+extern void heap_index_vischeck_tuples(Relation rel,
+									   TM_IndexVisibilityCheckOp *checkop);
 
 /* in heap/pruneheap.c */
 struct GlobalVisState;
diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h
index b5e0fb386c0..93a6f65ab0e 100644
--- a/src/include/access/relscan.h
+++ b/src/include/access/relscan.h
@@ -26,6 +26,9 @@
 
 struct ParallelTableScanDescData;
 
+enum TMVC_Result;
+
+
 /*
  * Generic descriptor for table scans. This is the base-class for table scans,
  * which needs to be embedded in the scans of individual AMs.
@@ -176,6 +179,8 @@ typedef struct IndexScanDescData
 
 	bool		xs_recheck;		/* T means scan keys must be rechecked */
 
+	int			xs_visrecheck;	/* TMVC_Result from tableam.h */
+
 	/*
 	 * When fetching with an ordering operator, the values of the ORDER BY
 	 * expressions of the last returned tuple, according to the index.  If
diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h
index 8713e12cbfb..47666cf96ea 100644
--- a/src/include/access/tableam.h
+++ b/src/include/access/tableam.h
@@ -248,6 +248,63 @@ typedef struct TM_IndexDeleteOp
 	TM_IndexStatus *status;
 } TM_IndexDeleteOp;
 
+/*
+ * State used when calling table_index_vischeck_tuples()
+ *
+ * Index-only scans need to know the visibility of the associated table tuples
+ * before they can return the index tuple.  If the index tuple is known to be
+ * visible with a cheap check, we can return it directly without requesting
+ * the visibility info from the table AM.
+ *
+ * This AM API exposes a cheap visibility checking API to indexes, allowing
+ * an index to check the visibility of multiple tuples at once, and to store
+ * the results, improving the pinning ergonomics of index AMs by allowing a
+ * scan to cache index tuples in memory without holding pins on those index
+ * tuples' pages until the tuples are returned.
+ *
+ * The AM is called with a list of TIDs, and its output will indicate the
+ * visibility state of each tuple: Unchecked, MaybeVisible, or Visible.
+ *
+ * HeapAM's implementation of visibility maps only allows for cheap checks of
+ * *definitely visible*; all other results are *maybe visible*.  A result for
+ * *definitely not visible* (i.e. dead) is currently not included, for lack
+ * of table AMs that support such visibility lookups cheaply.
+ */
+typedef enum TMVC_Result
+{
+	TMVC_Unchecked,
+	TMVC_Visible,
+	TMVC_MaybeVisible,
+} TMVC_Result;
+
+typedef struct TM_VisCheck
+{
+	/* table TID from index tuple */
+	BlockNumber		tidblkno;
+	uint16			tidoffset;
+	/* identifier for the TID in this visibility check operation context */
+	OffsetNumber	idxoffnum;
+	/* the result of the visibility check operation */
+	TMVC_Result		vischeckresult;
+} TM_VisCheck;
+
+static inline void
+PopulateTMVischeck(TM_VisCheck *check, ItemPointer tid, OffsetNumber idxoff)
+{
+	Assert(ItemPointerIsValid(tid));
+	check->tidblkno = ItemPointerGetBlockNumberNoCheck(tid);
+	check->tidoffset = ItemPointerGetOffsetNumberNoCheck(tid);
+	check->idxoffnum = idxoff;
+	check->vischeckresult = TMVC_Unchecked;
+}
+
+typedef struct TM_IndexVisibilityCheckOp
+{
+	int			checkntids;			/* number of TIDs to check */
+	Buffer	   *vmbuf;				/* pointer to VM buffer to reuse across calls */
+	TM_VisCheck *checktids;			/* the checks to execute */
+} TM_IndexVisibilityCheckOp;
+
 /* "options" flag bits for table_tuple_insert */
 /* TABLE_INSERT_SKIP_WAL was 0x0001; RelationNeedsWAL() now governs */
 #define TABLE_INSERT_SKIP_FSM		0x0002
@@ -494,6 +551,10 @@ typedef struct TableAmRoutine
 	TransactionId (*index_delete_tuples) (Relation rel,
 										  TM_IndexDeleteOp *delstate);
 
+	/* see table_index_vischeck_tuples() */
+	void		(*index_vischeck_tuples) (Relation rel,
+										  TM_IndexVisibilityCheckOp *checkop);
+
 
 	/* ------------------------------------------------------------------------
 	 * Manipulations of physical tuples.
@@ -1318,6 +1379,48 @@ table_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate)
 	return rel->rd_tableam->index_delete_tuples(rel, delstate);
 }
 
+/*
+ * Determine rough visibility information of index tuples based on each TID.
+ *
+ * Determines which entries in the index AM caller's TM_IndexVisibilityCheckOp
+ * state point to TMVC_Visible or TMVC_MaybeVisible table tuples, at low IO
+ * overhead.  For the heap AM, the implementation is effectively a wrapper
+ * around VM_ALL_VISIBLE.
+ *
+ * On return, all TM_VisChecks indicated by checkop->checktids will have been
+ * updated with the correct visibility status.
+ *
+ * Note that there is no value for "definitely dead" tuples, as the Heap AM
+ * doesn't have an efficient method to determine that a tuple is dead to all
+ * users: it would have to visit the heap.  If and when AMs are built
+ * that support VM checks with an equivalent of VM_ALL_DEAD, this
+ * decision can be reconsidered.
+ */
+static inline void
+table_index_vischeck_tuples(Relation rel, TM_IndexVisibilityCheckOp *checkop)
+{
+	rel->rd_tableam->index_vischeck_tuples(rel, checkop);
+}
+
+static inline TMVC_Result
+table_index_vischeck_tuple(Relation rel, Buffer *vmbuffer, ItemPointer tid)
+{
+	TM_IndexVisibilityCheckOp checkOp;
+	TM_VisCheck		op;
+
+	PopulateTMVischeck(&op, tid, 0);
+
+	checkOp.checktids = &op;
+	checkOp.checkntids = 1;
+	checkOp.vmbuf = vmbuffer;
+
+	rel->rd_tableam->index_vischeck_tuples(rel, &checkOp);
+
+	Assert(op.vischeckresult != TMVC_Unchecked);
+
+	return op.vischeckresult;
+}
+
 
 /* ----------------------------------------------------------------------------
  *  Functions for manipulations of physical tuples.
diff --git a/src/include/access/visibilitymapdefs.h b/src/include/access/visibilitymapdefs.h
index 5ad5c020877..c75303f63fd 100644
--- a/src/include/access/visibilitymapdefs.h
+++ b/src/include/access/visibilitymapdefs.h
@@ -12,6 +12,7 @@
  */
 #ifndef VISIBILITYMAPDEFS_H
 #define VISIBILITYMAPDEFS_H
+#include "storage/bufpage.h"
 
 /* Number of bits for one heap page */
 #define BITS_PER_HEAPBLOCK 2
@@ -31,4 +32,22 @@
 #define VISIBILITYMAP_XLOG_CATALOG_REL	0x04
 #define VISIBILITYMAP_XLOG_VALID_BITS	(VISIBILITYMAP_VALID_BITS | VISIBILITYMAP_XLOG_CATALOG_REL)
 
+/*
+ * Size of the bitmap on each visibility map page, in bytes.  There are no
+ * extra headers, so the whole page minus the standard page header is
+ * used for the bitmap.
+ */
+#define VM_MAPSIZE (BLCKSZ - MAXALIGN(SizeOfPageHeaderData))
+
+/* Number of heap blocks we can represent in one byte */
+#define VM_HEAPBLOCKS_PER_BYTE (BITS_PER_BYTE / BITS_PER_HEAPBLOCK)
+
+/* Number of heap blocks we can represent in one visibility map page. */
+#define VM_HEAPBLOCKS_PER_PAGE (VM_MAPSIZE * VM_HEAPBLOCKS_PER_BYTE)
+
+/* Mapping from heap block number to the right bit in the visibility map */
+#define HEAPBLK_TO_VMBLOCK(x) ((x) / VM_HEAPBLOCKS_PER_PAGE)
+#define HEAPBLK_TO_VMBYTE(x) (((x) % VM_HEAPBLOCKS_PER_PAGE) / VM_HEAPBLOCKS_PER_BYTE)
+#define HEAPBLK_TO_VMOFFSET(x) (((x) % VM_HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK)
+
 #endif							/* VISIBILITYMAPDEFS_H */
-- 
2.48.1

