From 75243037b2e979e67693da16a0945395564428b2 Mon Sep 17 00:00:00 2001
From: Jinbao Chen <cjinbao@vmware.com>
Date: Mon, 2 Nov 2020 12:47:02 +0800
Subject: [PATCH] Add table am 'tid_visible'

We directly call the heap function VM_ALL_VISIBLE in the
IndexOnlyNext function. This is not in line with the table AM
design. If a new storage type needs to implement index-only
scan, it must hack the IndexOnlyNext function.

So this patch adds a new table AM callback 'tid_visible' to test
the visibility of a TID.

Co-authored-by: Jinbao Chen <cjinbao@vmware.com>
Co-authored-by: Asim R P <pasim@vmware.com>
---
 src/backend/access/heap/heapam_handler.c | 49 ++++++++++++++++++++++++++++++++
 src/backend/executor/nodeIndexonlyscan.c | 39 +++----------------------
 src/include/access/heapam.h              |  1 +
 src/include/access/tableam.h             | 22 ++++++++++++++
 4 files changed, 76 insertions(+), 35 deletions(-)

diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index dcaea7135f..5208676c0d 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -27,6 +27,7 @@
 #include "access/syncscan.h"
 #include "access/tableam.h"
 #include "access/tsmapi.h"
+#include "access/visibilitymap.h"
 #include "access/xact.h"
 #include "catalog/catalog.h"
 #include "catalog/index.h"
@@ -82,6 +83,7 @@ heapam_index_fetch_begin(Relation rel)
 
 	hscan->xs_base.rel = rel;
 	hscan->xs_cbuf = InvalidBuffer;
+	hscan->xs_vbuf = InvalidBuffer;
 
 	return &hscan->xs_base;
 }
@@ -96,6 +98,11 @@ heapam_index_fetch_reset(IndexFetchTableData *scan)
 		ReleaseBuffer(hscan->xs_cbuf);
 		hscan->xs_cbuf = InvalidBuffer;
 	}
+	if (BufferIsValid(hscan->xs_vbuf))
+	{
+		ReleaseBuffer(hscan->xs_vbuf);
+		hscan->xs_vbuf = InvalidBuffer;
+	}
 }
 
 static void
@@ -170,6 +177,47 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan,
 	return got_heap_tuple;
 }
 
+static bool
+heapam_tid_visible(struct IndexFetchTableData *scan,
+				   ItemPointer tid,
+				   Snapshot snapshot)
+{
+	IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan;
+
+	/*
+	 * Note on Memory Ordering Effects: visibilitymap_get_status does not
+	 * lock the visibility map buffer, and therefore the result we read
+	 * here could be slightly stale.  However, it can't be stale enough to
+	 * matter.
+	 *
+	 * We need to detect clearing a VM bit due to an insert right away,
+	 * because the tuple is present in the index page but not visible. The
+	 * reading of the TID by this scan (using a shared lock on the index
+	 * buffer) is serialized with the insert of the TID into the index
+	 * (using an exclusive lock on the index buffer). Because the VM bit
+	 * is cleared before updating the index, and locking/unlocking of the
+	 * index page acts as a full memory barrier, we are sure to see the
+	 * cleared bit if we see a recently-inserted TID.
+	 *
+	 * Deletes do not update the index page (only VACUUM will clear out
+	 * the TID), so the clearing of the VM bit by a delete is not
+	 * serialized with this test below, and we may see a value that is
+	 * significantly stale. However, we don't care about the delete right
+	 * away, because the tuple is still visible until the deleting
+	 * transaction commits or the statement ends (if it's our
+	 * transaction). In either case, the lock on the VM buffer will have
+	 * been released (acting as a write barrier) after clearing the bit.
+	 * And for us to have a snapshot that includes the deleting
+	 * transaction (making the tuple invisible), we must have acquired
+	 * ProcArrayLock after that time, acting as a read barrier.
+	 *
+	 * It's worth going through this complexity to avoid needing to lock
+	 * the VM buffer, which could cause significant contention.
+	 */
+	return VM_ALL_VISIBLE(hscan->xs_base.rel,
+						  ItemPointerGetBlockNumber(tid),
+						  &hscan->xs_vbuf);
+}
 
 /* ------------------------------------------------------------------------
  * Callbacks for non-modifying operations on individual tuples for heap AM
@@ -2524,6 +2572,7 @@ static const TableAmRoutine heapam_methods = {
 	.index_fetch_reset = heapam_index_fetch_reset,
 	.index_fetch_end = heapam_index_fetch_end,
 	.index_fetch_tuple = heapam_index_fetch_tuple,
+	.tid_visible = heapam_tid_visible,
 
 	.tuple_insert = heapam_tuple_insert,
 	.tuple_insert_speculative = heapam_tuple_insert_speculative,
diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c
index 5617ac29e7..961e92c536 100644
--- a/src/backend/executor/nodeIndexonlyscan.c
+++ b/src/backend/executor/nodeIndexonlyscan.c
@@ -125,42 +125,11 @@ IndexOnlyNext(IndexOnlyScanState *node)
 		CHECK_FOR_INTERRUPTS();
 
 		/*
-		 * We can skip the heap fetch if the TID references a heap page on
-		 * which all tuples are known visible to everybody.  In any case,
-		 * we'll use the index tuple not the heap tuple as the data source.
-		 *
-		 * Note on Memory Ordering Effects: visibilitymap_get_status does not
-		 * lock the visibility map buffer, and therefore the result we read
-		 * here could be slightly stale.  However, it can't be stale enough to
-		 * matter.
-		 *
-		 * We need to detect clearing a VM bit due to an insert right away,
-		 * because the tuple is present in the index page but not visible. The
-		 * reading of the TID by this scan (using a shared lock on the index
-		 * buffer) is serialized with the insert of the TID into the index
-		 * (using an exclusive lock on the index buffer). Because the VM bit
-		 * is cleared before updating the index, and locking/unlocking of the
-		 * index page acts as a full memory barrier, we are sure to see the
-		 * cleared bit if we see a recently-inserted TID.
-		 *
-		 * Deletes do not update the index page (only VACUUM will clear out
-		 * the TID), so the clearing of the VM bit by a delete is not
-		 * serialized with this test below, and we may see a value that is
-		 * significantly stale. However, we don't care about the delete right
-		 * away, because the tuple is still visible until the deleting
-		 * transaction commits or the statement ends (if it's our
-		 * transaction). In either case, the lock on the VM buffer will have
-		 * been released (acting as a write barrier) after clearing the bit.
-		 * And for us to have a snapshot that includes the deleting
-		 * transaction (making the tuple invisible), we must have acquired
-		 * ProcArrayLock after that time, acting as a read barrier.
-		 *
-		 * It's worth going through this complexity to avoid needing to lock
-		 * the VM buffer, which could cause significant contention.
+		 * Use the tid_visible table AM API to check whether the TID's
+		 * visibility can be determined without fetching the underlying tuple.
 		 */
-		if (!VM_ALL_VISIBLE(scandesc->heapRelation,
-							ItemPointerGetBlockNumber(tid),
-							&node->ioss_VMBuffer))
+		if (!table_index_tid_visible(
+				scandesc->xs_heapfetch, tid, scandesc->xs_snapshot))
 		{
 			/*
 			 * Rats, we have to visit the heap to check visibility.
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index 92b19dba32..317cbdcff3 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -81,6 +81,7 @@ typedef struct IndexFetchHeapData
 
 	Buffer		xs_cbuf;		/* current heap buffer in scan, if any */
 	/* NB: if xs_cbuf is not InvalidBuffer, we hold a pin on that buffer */
+	Buffer		xs_vbuf;		/* current buffer in visibility map */
 } IndexFetchHeapData;
 
 /* Result codes for HeapTupleSatisfiesVacuum */
diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h
index 387eb34a61..1e44fadfe2 100644
--- a/src/include/access/tableam.h
+++ b/src/include/access/tableam.h
@@ -305,6 +305,17 @@ typedef struct TableAmRoutine
 									  TupleTableSlot *slot,
 									  bool *call_again, bool *all_dead);
 
+	/*
+	 * Test the visibility of the tid using AM specific visibility mechanism.
+	 * This is used for index-only scans, to skip fetching a tuple from
+	 * underlying relation only to determine visibility.  If it returns true,
+	 * the tid is visible and there is no need to read the underlying table.
+	 * Otherwise, the tuple from the underlying table must be read to
+	 * determine visibility.
+	 */
+	bool		(*tid_visible) (struct IndexFetchTableData *scan,
+								ItemPointer tid,
+								Snapshot snapshot);
 
 	/* ------------------------------------------------------------------------
 	 * Callbacks for non-modifying operations on individual tuples
@@ -1051,6 +1062,17 @@ extern bool table_index_fetch_tuple_check(Relation rel,
 										  Snapshot snapshot,
 										  bool *all_dead);
 
+/*
+ * Used by index-only scan to determine whether the tuple can be served solely
+ * from the index because it's known to be visible to the snapshot.
+ */
+static inline bool
+table_index_tid_visible(struct IndexFetchTableData *scan,
+						ItemPointer tid,
+						Snapshot snapshot)
+{
+	return scan->rel->rd_tableam->tid_visible(scan, tid, snapshot);
+}
 
 /* ------------------------------------------------------------------------
  * Functions for non-modifying operations on individual tuples
-- 
2.14.2

