From d0f1f588b42522ab5c177ad55be368f3d281a565 Mon Sep 17 00:00:00 2001
From: Matthias van de Meent <boekewurm+postgres@gmail.com>
Date: Fri, 7 Mar 2025 22:55:24 +0100
Subject: [PATCH v12 2/5] GIST: Fix visibility issues in IOS

Previously, GIST IOS could buffer tuples from pages while VACUUM came
along and cleaned up an ALL_DEAD tuple, marking the tuple's page
ALL_VISIBLE again and making IOS mistakenly believe the tuple is indeed
visible.

With this patch, pins now conflict with GIST vacuum, and we now do
preliminary visibility checks to be used by IOS so that the IOS
infrastructure knows to recheck the heap page even if that page is now
ALL_VISIBLE.

Idea from Heikki Linnakangas
---
 src/backend/access/gist/gistget.c    | 163 ++++++++++++++++++++++-----
 src/backend/access/gist/gistscan.c   |  11 +-
 src/backend/access/gist/gistvacuum.c |   6 +-
 src/include/access/gist_private.h    |  27 ++++-
 4 files changed, 168 insertions(+), 39 deletions(-)

diff --git a/src/backend/access/gist/gistget.c b/src/backend/access/gist/gistget.c
index 387d9972345..16fda28d4ad 100644
--- a/src/backend/access/gist/gistget.c
+++ b/src/backend/access/gist/gistget.c
@@ -17,6 +17,7 @@
 #include "access/genam.h"
 #include "access/gist_private.h"
 #include "access/relscan.h"
+#include "access/tableam.h"
 #include "lib/pairingheap.h"
 #include "miscadmin.h"
 #include "pgstat.h"
@@ -394,10 +395,14 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem,
 		return;
 	}
 
-	so->nPageData = so->curPageData = 0;
+	if (scan->numberOfOrderBys)
+		so->os.nsortData = 0;
+	else
+		so->nos.nPageData = so->nos.curPageData = 0;
+
 	scan->xs_hitup = NULL;		/* might point into pageDataCxt */
-	if (so->pageDataCxt)
-		MemoryContextReset(so->pageDataCxt);
+	if (so->nos.pageDataCxt)
+		MemoryContextReset(so->nos.pageDataCxt);
 
 	/*
 	 * We save the LSN of the page as we read it, so that we know whether it
@@ -457,9 +462,9 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem,
 			/*
 			 * Non-ordered scan, so report tuples in so->pageData[]
 			 */
-			so->pageData[so->nPageData].heapPtr = it->t_tid;
-			so->pageData[so->nPageData].recheck = recheck;
-			so->pageData[so->nPageData].offnum = i;
+			so->nos.pageData[so->nos.nPageData].heapPtr = it->t_tid;
+			so->nos.pageData[so->nos.nPageData].recheck = recheck;
+			so->nos.pageData[so->nos.nPageData].offnum = i;
 
 			/*
 			 * In an index-only scan, also fetch the data from the tuple.  The
@@ -467,12 +472,12 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem,
 			 */
 			if (scan->xs_want_itup)
 			{
-				oldcxt = MemoryContextSwitchTo(so->pageDataCxt);
-				so->pageData[so->nPageData].recontup =
+				oldcxt = MemoryContextSwitchTo(so->nos.pageDataCxt);
+				so->nos.pageData[so->nos.nPageData].recontup =
 					gistFetchTuple(giststate, r, it);
 				MemoryContextSwitchTo(oldcxt);
 			}
-			so->nPageData++;
+			so->nos.nPageData++;
 		}
 		else
 		{
@@ -501,7 +506,11 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem,
 				 * In an index-only scan, also fetch the data from the tuple.
 				 */
 				if (scan->xs_want_itup)
+				{
 					item->data.heap.recontup = gistFetchTuple(giststate, r, it);
+					so->os.sortData[so->os.nsortData] = &item->data.heap;
+					so->os.nsortData += 1;
+				}
 			}
 			else
 			{
@@ -526,7 +535,101 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem,
 		}
 	}
 
-	UnlockReleaseBuffer(buffer);
+	/* Allow writes to the buffer, but don't yet allow VACUUM */
+	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+
+	/*
+	 * If we're in an index-only scan, we need to do visibility checks before
+	 * we release the pin, so that VACUUM can't clean up dead tuples from this
+	 * index page and mark the page ALL_VISIBLE before the tuple was returned.
+	 *
+	 * See also docs section "Index Locking Considerations".
+	 */
+	if (scan->xs_want_itup)
+	{
+		TM_IndexVisibilityCheckOp	op;
+		op.vmbuf = &so->vmbuf;
+
+		if (scan->numberOfOrderBys > 0)
+		{
+			op.checkntids = so->os.nsortData;
+
+			if (op.checkntids > 0)
+			{
+				op.checktids = palloc(op.checkntids * sizeof(TM_VisCheck));
+
+				for (int off = 0; off < op.checkntids; off++)
+				{
+					Assert(ItemPointerIsValid(&so->os.sortData[off]->heapPtr));
+
+					PopulateTMVischeck(&op.checktids[off],
+									   &so->os.sortData[off]->heapPtr,
+									   off);
+				}
+			}
+		}
+		else
+		{
+			op.checkntids = so->nos.nPageData;
+
+			if (op.checkntids > 0)
+			{
+				op.checktids = palloc_array(TM_VisCheck, op.checkntids);
+
+				for (int off = 0; off < op.checkntids; off++)
+				{
+					Assert(ItemPointerIsValid(&so->nos.pageData[off].heapPtr));
+
+					PopulateTMVischeck(&op.checktids[off],
+									   &so->nos.pageData[off].heapPtr,
+									   off);
+				}
+			}
+		}
+
+		if (op.checkntids > 0)
+		{
+			table_index_vischeck_tuples(scan->heapRelation, &op);
+
+			if (scan->numberOfOrderBys > 0)
+			{
+				for (int off = 0; off < op.checkntids; off++)
+				{
+					TM_VisCheck *check = &op.checktids[off];
+					GISTSearchHeapItem *item = so->os.sortData[check->idxoffnum];
+
+					/* sanity checks */
+					Assert(check->idxoffnum < op.checkntids);
+					Assert(check->tidblkno == ItemPointerGetBlockNumberNoCheck(&item->heapPtr));
+					Assert(check->tidoffset == ItemPointerGetOffsetNumberNoCheck(&item->heapPtr));
+
+					item->visrecheck = check->vischeckresult;
+				}
+				/* reset state */
+				so->os.nsortData = 0;
+			}
+			else
+			{
+				for (int off = 0; off < op.checkntids; off++)
+				{
+					TM_VisCheck *check = &op.checktids[off];
+					GISTSearchHeapItem *item = &so->nos.pageData[check->idxoffnum];
+
+					Assert(check->idxoffnum < op.checkntids);
+					Assert(check->tidblkno == ItemPointerGetBlockNumberNoCheck(&item->heapPtr));
+					Assert(check->tidoffset == ItemPointerGetOffsetNumberNoCheck(&item->heapPtr));
+
+					item->visrecheck = check->vischeckresult;
+				}
+			}
+
+			/* clean up the used resources */
+			pfree(op.checktids);
+		}
+	}
+
+	/* Allow VACUUM to process the buffer again */
+	ReleaseBuffer(buffer);
 }
 
 /*
@@ -588,7 +691,10 @@ getNextNearest(IndexScanDesc scan)
 
 			/* in an index-only scan, also return the reconstructed tuple. */
 			if (scan->xs_want_itup)
+			{
 				scan->xs_hitup = item->data.heap.recontup;
+				scan->xs_visrecheck = item->data.heap.visrecheck;
+			}
 			res = true;
 		}
 		else
@@ -629,10 +735,10 @@ gistgettuple(IndexScanDesc scan, ScanDirection dir)
 			scan->instrument->nsearches++;
 
 		so->firstCall = false;
-		so->curPageData = so->nPageData = 0;
+		so->nos.curPageData = so->nos.nPageData = 0;
 		scan->xs_hitup = NULL;
-		if (so->pageDataCxt)
-			MemoryContextReset(so->pageDataCxt);
+		if (so->nos.pageDataCxt)
+			MemoryContextReset(so->nos.pageDataCxt);
 
 		fakeItem.blkno = GIST_ROOT_BLKNO;
 		memset(&fakeItem.data.parentlsn, 0, sizeof(GistNSN));
@@ -649,9 +755,9 @@ gistgettuple(IndexScanDesc scan, ScanDirection dir)
 		/* Fetch tuples index-page-at-a-time */
 		for (;;)
 		{
-			if (so->curPageData < so->nPageData)
+			if (so->nos.curPageData < so->nos.nPageData)
 			{
-				if (scan->kill_prior_tuple && so->curPageData > 0)
+				if (scan->kill_prior_tuple && so->nos.curPageData > 0)
 				{
 
 					if (so->killedItems == NULL)
@@ -667,17 +773,20 @@ gistgettuple(IndexScanDesc scan, ScanDirection dir)
 					}
 					if (so->numKilled < MaxIndexTuplesPerPage)
 						so->killedItems[so->numKilled++] =
-							so->pageData[so->curPageData - 1].offnum;
+							so->nos.pageData[so->nos.curPageData - 1].offnum;
 				}
 				/* continuing to return tuples from a leaf page */
-				scan->xs_heaptid = so->pageData[so->curPageData].heapPtr;
-				scan->xs_recheck = so->pageData[so->curPageData].recheck;
+				scan->xs_heaptid = so->nos.pageData[so->nos.curPageData].heapPtr;
+				scan->xs_recheck = so->nos.pageData[so->nos.curPageData].recheck;
 
 				/* in an index-only scan, also return the reconstructed tuple */
 				if (scan->xs_want_itup)
-					scan->xs_hitup = so->pageData[so->curPageData].recontup;
+				{
+					scan->xs_hitup = so->nos.pageData[so->nos.curPageData].recontup;
+					scan->xs_visrecheck = so->nos.pageData[so->nos.curPageData].visrecheck;
+				}
 
-				so->curPageData++;
+				so->nos.curPageData++;
 
 				return true;
 			}
@@ -687,8 +796,8 @@ gistgettuple(IndexScanDesc scan, ScanDirection dir)
 			 * necessary
 			 */
 			if (scan->kill_prior_tuple
-				&& so->curPageData > 0
-				&& so->curPageData == so->nPageData)
+				&& so->nos.curPageData > 0
+				&& so->nos.curPageData == so->nos.nPageData)
 			{
 
 				if (so->killedItems == NULL)
@@ -704,7 +813,7 @@ gistgettuple(IndexScanDesc scan, ScanDirection dir)
 				}
 				if (so->numKilled < MaxIndexTuplesPerPage)
 					so->killedItems[so->numKilled++] =
-						so->pageData[so->curPageData - 1].offnum;
+						so->nos.pageData[so->nos.curPageData - 1].offnum;
 			}
 			/* find and process the next index page */
 			do
@@ -733,7 +842,7 @@ gistgettuple(IndexScanDesc scan, ScanDirection dir)
 				gistScanPage(scan, item, item->distances, NULL, NULL);
 
 				pfree(item);
-			} while (so->nPageData == 0);
+			} while (so->nos.nPageData == 0);
 		}
 	}
 }
@@ -756,10 +865,10 @@ gistgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
 		scan->instrument->nsearches++;
 
 	/* Begin the scan by processing the root page */
-	so->curPageData = so->nPageData = 0;
+	so->nos.curPageData = so->nos.nPageData = 0;
 	scan->xs_hitup = NULL;
-	if (so->pageDataCxt)
-		MemoryContextReset(so->pageDataCxt);
+	if (so->nos.pageDataCxt)
+		MemoryContextReset(so->nos.pageDataCxt);
 
 	fakeItem.blkno = GIST_ROOT_BLKNO;
 	memset(&fakeItem.data.parentlsn, 0, sizeof(GistNSN));
diff --git a/src/backend/access/gist/gistscan.c b/src/backend/access/gist/gistscan.c
index 700fa959d03..a59672ce979 100644
--- a/src/backend/access/gist/gistscan.c
+++ b/src/backend/access/gist/gistscan.c
@@ -204,9 +204,9 @@ gistrescan(IndexScanDesc scan, ScanKey key, int nkeys,
 		scan->xs_hitupdesc = so->giststate->fetchTupdesc;
 
 		/* Also create a memory context that will hold the returned tuples */
-		so->pageDataCxt = AllocSetContextCreate(so->giststate->scanCxt,
-												"GiST page data context",
-												ALLOCSET_DEFAULT_SIZES);
+		so->nos.pageDataCxt = AllocSetContextCreate(so->giststate->scanCxt,
+													"GiST page data context",
+													ALLOCSET_DEFAULT_SIZES);
 	}
 
 	/* create new, empty pairing heap for search queue */
@@ -347,6 +347,11 @@ void
 gistendscan(IndexScanDesc scan)
 {
 	GISTScanOpaque so = (GISTScanOpaque) scan->opaque;
+	if (BufferIsValid(so->vmbuf))
+	{
+		ReleaseBuffer(so->vmbuf);
+		so->vmbuf = InvalidBuffer;
+	}
 
 	/*
 	 * freeGISTstate is enough to clean up everything made by gistbeginscan,
diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c
index 6a359c98c60..d0b8afc252f 100644
--- a/src/backend/access/gist/gistvacuum.c
+++ b/src/backend/access/gist/gistvacuum.c
@@ -325,10 +325,10 @@ restart:
 	recurse_to = InvalidBlockNumber;
 
 	/*
-	 * We are not going to stay here for a long time, aggressively grab an
-	 * exclusive lock.
+	 * We are not going to stay here for a long time, aggressively grab a
+	 * cleanup lock.
 	 */
-	LockBuffer(buffer, GIST_EXCLUSIVE);
+	LockBufferForCleanup(buffer);
 	page = (Page) BufferGetPage(buffer);
 
 	if (gistPageRecyclable(page))
diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h
index 39404ec7cdb..a4bc344381c 100644
--- a/src/include/access/gist_private.h
+++ b/src/include/access/gist_private.h
@@ -22,6 +22,7 @@
 #include "storage/buffile.h"
 #include "utils/hsearch.h"
 #include "access/genam.h"
+#include "tableam.h"
 
 /*
  * Maximum number of "halves" a page can be split into in one operation.
@@ -124,6 +125,8 @@ typedef struct GISTSearchHeapItem
 								 * index-only scans */
 	OffsetNumber offnum;		/* track offset in page to mark tuple as
 								 * LP_DEAD */
+	uint8		visrecheck;		/* Cached visibility check result for this
+								 * heap pointer. */
 } GISTSearchHeapItem;
 
 /* Unvisited item, either index page or heap tuple */
@@ -170,12 +173,24 @@ typedef struct GISTScanOpaqueData
 	BlockNumber curBlkno;		/* current number of block */
 	GistNSN		curPageLSN;		/* pos in the WAL stream when page was read */
 
-	/* In a non-ordered search, returnable heap items are stored here: */
-	GISTSearchHeapItem pageData[BLCKSZ / sizeof(IndexTupleData)];
-	OffsetNumber nPageData;		/* number of valid items in array */
-	OffsetNumber curPageData;	/* next item to return */
-	MemoryContext pageDataCxt;	/* context holding the fetched tuples, for
-								 * index-only scans */
+	/* info used by Index-Only Scans */
+	Buffer		vmbuf;			/* reusable buffer for IOS' vm lookups */
+
+	union {
+		struct {
+			/* In a non-ordered search, returnable heap items are stored here: */
+			GISTSearchHeapItem pageData[BLCKSZ / sizeof(IndexTupleData)];
+			OffsetNumber nPageData;		/* number of valid items in array */
+			OffsetNumber curPageData;	/* next item to return */
+			MemoryContext pageDataCxt;	/* context holding the fetched tuples,
+										 * for index-only scans */
+		} nos;
+		struct {
+			/* In an ordered search, we use this as scratch space */
+			GISTSearchHeapItem *sortData[BLCKSZ / sizeof(IndexTupleData)];
+			OffsetNumber	nsortData;	/* number of items in sortData */
+		} os;
+	};
 } GISTScanOpaqueData;
 
 typedef GISTScanOpaqueData *GISTScanOpaque;
-- 
2.48.1

